iso2022_jpms.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. /*
  2. * Copyright (C) 1999-2001, 2008, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either version 2.1
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, see <https://www.gnu.org/licenses/>.
  18. */
  19. /*
  20. * ISO-2022-JP-MS
  21. * alias CP50221
  22. *
  23. * This is an extension of ISO-2022-JP-1 with larger character sets.
  24. * It uses ESC $ B and ESC $ ( D to denote *extensions* of JIS X 0208 and
  25. * JIS X 0212, respectively. This violates the principles of ISO 2022,
  26. * where
  27. * 1. character sets to be used by ISO 2022 have to be registered at the
  28. * ISO IR registry <https://www.itscj.ipsj.or.jp/ISO-IR/>,
  29. * 2. different character sets are designated by different escape
  30. * sequences.
  31. * It's a typical instance of the "embrace and extend" strategy by Microsoft
  32. * <https://en.wikipedia.org/wiki/Embrace,_extend_and_extinguish>.
  33. */
  /*
   * Windows has three encodings CP50220, CP50221, CP50222.
   * The common parts are:
   *   - US-ASCII (0x00..0x7F)
   *   - JIS X 0208 extended by
   *       - one row (0x2D),
   *       - a private use area (rows 0x75..0x7E = U+E000..U+E3AB),
   *     enabled with ESC $ B, disabled with ESC ( B.
   *   - JIS X 0212 extended by
   *       - two rows (0x73..0x74),
   *       - a private use area (rows 0x75..0x7E = U+E3AC..U+E757),
   *     enabled with ESC $ ( D, disabled with ESC ( B.
   * They differ in the handling of JIS X 0201 characters (halfwidth Katakana)
   * in the conversion direction Unicode -> CP5022x:
   *   * CP50220 maps the halfwidth Katakana to fullwidth Katakana characters.
   *   * CP50221 contains the JIS X 0201 halfwidth Katakana characters,
   *     enabled with ESC ( I, disabled with ESC ( B.
   *   * CP50222 contains the JIS X 0201 halfwidth Katakana characters,
   *     enabled with ESC ( J 0x0E, disabled with ESC ( B.
   * In the conversion direction CP5022x -> Unicode, all three operate the same:
   *   - ESC ( I is supported and understood.
   *   - ESC ( J 0x0E is not accepted. (Tested on Windows XP SP3.)
   * Conclusion:
   *   - CP50222 should not be used, because the multibyte sequence that it
   *     produces cannot be parsed by any of the three encodings.
   *   - CP50221 is preferable to CP50220, because it can faithfully represent
   *     the halfwidth Katakana characters.
   * We therefore implement CP50221. As an extension, in the mbtowc conversion
   * direction, we also support ESC ( J 0x0E, just in case.
   */
  64. #include "cp50221_0208_ext.h"
  65. #include "cp50221_0212_ext.h"
  /* Control bytes recognized by the converter functions in this file. */
  #define ESC  0x1b  /* introduces a character set designation */
  #define SO   0x0e  /* in mbtowc: JIS X 0201 Roman -> Katakana */
  #define SI   0x0f  /* in mbtowc: JIS X 0201 Katakana -> Roman */
  /*
   * Shift states kept in conv->istate and conv->ostate.  The trailing
   * comment on each line names the escape sequence that selects the state.
   */
  #define STATE_ASCII              0  /* Esc ( B */
  #define STATE_JISX0201ROMAN      1  /* Esc ( J; only in mbtowc direction */
  #define STATE_JISX0201KATAKANA   2  /* Esc ( I */
  #define STATE_JISX0208MS         3  /* Esc $ @ or Esc $ B */
  #define STATE_JISX0212MS         4  /* Esc $ ( D */
  77. static int
  78. iso2022_jpms_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
  79. {
  80. state_t state = conv->istate;
  81. int count = 0;
  82. unsigned char c;
  83. for (;;) {
  84. c = *s;
  85. if (c == ESC) {
  86. if (n < count+3)
  87. goto none;
  88. if (s[1] == '(') {
  89. if (s[2] == 'B') {
  90. state = STATE_ASCII;
  91. s += 3; count += 3;
  92. if (n < count+1)
  93. goto none;
  94. continue;
  95. }
  96. if (s[2] == 'I') {
  97. state = STATE_JISX0201KATAKANA;
  98. s += 3; count += 3;
  99. if (n < count+1)
  100. goto none;
  101. continue;
  102. }
  103. if (s[2] == 'J') {
  104. state = STATE_JISX0201ROMAN;
  105. s += 3; count += 3;
  106. if (n < count+1)
  107. goto none;
  108. continue;
  109. }
  110. goto ilseq;
  111. }
  112. if (s[1] == '$') {
  113. if (s[2] == '@' || s[2] == 'B') {
  114. /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
  115. state = STATE_JISX0208MS;
  116. s += 3; count += 3;
  117. if (n < count+1)
  118. goto none;
  119. continue;
  120. }
  121. if (s[2] == '(') {
  122. if (n < count+4)
  123. goto none;
  124. if (s[3] == 'D') {
  125. state = STATE_JISX0212MS;
  126. s += 4; count += 4;
  127. if (n < count+1)
  128. goto none;
  129. continue;
  130. }
  131. }
  132. goto ilseq;
  133. }
  134. goto ilseq;
  135. }
  136. if (c == SO) {
  137. if (state == STATE_JISX0201ROMAN)
  138. state = STATE_JISX0201KATAKANA;
  139. s += 1; count += 1;
  140. if (n < count+1)
  141. goto none;
  142. continue;
  143. }
  144. if (c == SI) {
  145. if (state == STATE_JISX0201KATAKANA)
  146. state = STATE_JISX0201ROMAN;
  147. s += 1; count += 1;
  148. if (n < count+1)
  149. goto none;
  150. continue;
  151. }
  152. break;
  153. }
  154. switch (state) {
  155. case STATE_ASCII:
  156. if (c < 0x80) {
  157. int ret = ascii_mbtowc(conv,pwc,s,1);
  158. if (ret == RET_ILSEQ)
  159. goto ilseq;
  160. if (ret != 1) abort();
  161. conv->istate = state;
  162. return count+1;
  163. } else
  164. goto ilseq;
  165. case STATE_JISX0201ROMAN:
  166. if (c < 0x80) {
  167. int ret = jisx0201_mbtowc(conv,pwc,s,1);
  168. if (ret == RET_ILSEQ)
  169. goto ilseq;
  170. if (ret != 1) abort();
  171. conv->istate = state;
  172. return count+1;
  173. } else
  174. goto ilseq;
  175. case STATE_JISX0201KATAKANA:
  176. if (c < 0x80) {
  177. unsigned char buf = c+0x80;
  178. int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
  179. if (ret == RET_ILSEQ)
  180. goto ilseq;
  181. if (ret != 1) abort();
  182. conv->istate = state;
  183. return count+1;
  184. } else
  185. goto ilseq;
  186. case STATE_JISX0208MS:
  187. if (n < count+2)
  188. goto none;
  189. if (s[0] < 0x80 && s[1] < 0x80) {
  190. int ret;
  191. if (s[0] < 0x75) {
  192. if (s[0] == 0x2d) {
  193. /* Extension of JIS X 0208. */
  194. if (s[1] >= 0x21 && s[1] <= 0x79) {
  195. unsigned char i = (s[1] - 0x21) + 1;
  196. ret = cp50221_0208_ext_mbtowc(conv,pwc,&i,1);
  197. if (ret == 1)
  198. ret = 2;
  199. } else
  200. ret = RET_ILSEQ;
  201. } else {
  202. /* JIS X 0208. */
  203. ret = jisx0208_mbtowc(conv,pwc,s,2);
  204. }
  205. } else {
  206. /* Extension of JIS X 0208.
  207. 0x{75..7E}{21..8E} maps to U+E000..U+E3AB.
  208. But some rows maps to characters present in CP932. */
  209. if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
  210. unsigned short wc = 0xfffd;
  211. if (s[0] >= 0x79 && s[0] <= 0x7c)
  212. wc = cp932ext_2uni_pageed[(s[0] - 0x79) * 94 + (s[1] - 0x21)];
  213. if (wc == 0xfffd)
  214. wc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe000;
  215. *pwc = wc;
  216. ret = 2;
  217. } else
  218. ret = RET_ILSEQ;
  219. }
  220. if (ret == RET_ILSEQ)
  221. goto ilseq;
  222. if (ret != 2) abort();
  223. conv->istate = state;
  224. return count+2;
  225. } else
  226. goto ilseq;
  227. case STATE_JISX0212MS:
  228. if (n < count+2)
  229. goto none;
  230. if (s[0] < 0x80 && s[1] < 0x80) {
  231. int ret;
  232. if (s[0] < 0x73) {
  233. /* JIS X 0212. */
  234. ret = jisx0212_mbtowc(conv,pwc,s,2);
  235. } else {
  236. if (s[0] < 0x75) {
  237. /* Extension of JIS X 0212. */
  238. if (s[1] >= 0x21 && s[1] <= 0x7e) {
  239. unsigned char i = (s[0] - 0x73) * 94 + (s[1] - 0x21) + 1;
  240. ret = cp50221_0212_ext_mbtowc(conv,pwc,&i,1);
  241. if (ret == 1)
  242. ret = 2;
  243. } else
  244. ret = RET_ILSEQ;
  245. } else {
  246. /* Extension of JIS X 0208.
  247. 0x{75..7E}{21..8E} maps to U+E3AC..U+E757. */
  248. if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
  249. *pwc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe3ac;
  250. ret = 2;
  251. } else
  252. ret = RET_ILSEQ;
  253. }
  254. }
  255. if (ret == RET_ILSEQ)
  256. goto ilseq;
  257. if (ret != 2) abort();
  258. conv->istate = state;
  259. return count+2;
  260. } else
  261. goto ilseq;
  262. default: abort();
  263. }
  264. none:
  265. conv->istate = state;
  266. return RET_TOOFEW(count);
  267. ilseq:
  268. conv->istate = state;
  269. return RET_SHIFT_ILSEQ(count);
  270. }
  271. static int
  272. iso2022_jpms_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
  273. {
  274. state_t state = conv->ostate;
  275. unsigned char buf[2];
  276. int ret;
  277. /* Try ASCII. */
  278. ret = ascii_wctomb(conv,buf,wc,1);
  279. if (ret != RET_ILUNI) {
  280. if (ret != 1) abort();
  281. if (buf[0] < 0x80) {
  282. int count = (state == STATE_ASCII ? 1 : 4);
  283. if (n < count)
  284. return RET_TOOSMALL;
  285. if (state != STATE_ASCII) {
  286. r[0] = ESC;
  287. r[1] = '(';
  288. r[2] = 'B';
  289. r += 3;
  290. state = STATE_ASCII;
  291. }
  292. r[0] = buf[0];
  293. conv->ostate = state;
  294. return count;
  295. }
  296. }
  297. /* Try JIS X 0201-1976 Katakana. */
  298. ret = jisx0201_wctomb(conv,buf,wc,1);
  299. if (ret != RET_ILUNI) {
  300. if (ret != 1) abort();
  301. if (buf[0] >= 0x80) {
  302. int count = (state == STATE_JISX0201KATAKANA ? 1 : 4);
  303. if (n < count)
  304. return RET_TOOSMALL;
  305. if (state != STATE_JISX0201KATAKANA) {
  306. r[0] = ESC;
  307. r[1] = '(';
  308. r[2] = 'I';
  309. r += 3;
  310. state = STATE_JISX0201KATAKANA;
  311. }
  312. r[0] = buf[0]-0x80;
  313. conv->ostate = state;
  314. return count;
  315. }
  316. }
  317. /* Try JIS X 0208-1990, in place of JIS X 0208-1978 and JIS X 0208-1983,
  318. and the extensions mentioned above. */
  319. if (wc >= 0xe000 && wc < 0xe3ac) {
  320. unsigned short i = wc - 0xe000;
  321. buf[0] = (i / 94) + 0x75;
  322. buf[1] = (i % 94) + 0x21;
  323. ret = 2;
  324. } else {
  325. ret = jisx0208_wctomb(conv,buf,wc,2);
  326. if (ret == RET_ILUNI) {
  327. /* Extension of JIS X 0208. */
  328. unsigned char i;
  329. ret = cp50221_0208_ext_wctomb(conv,&i,wc,1);
  330. if (ret == 1) {
  331. buf[0] = 0x2d;
  332. buf[1] = i-1 + 0x21;
  333. ret = 2;
  334. } else if (wc == 0x663B) {
  335. buf[0] = 0x7a;
  336. buf[1] = 0x36;
  337. ret = 2;
  338. } else if (wc == 0xffe2) {
  339. buf[0] = 0x7c;
  340. buf[1] = 0x7b;
  341. ret = 2;
  342. } else if (wc == 0xffe4) {
  343. buf[0] = 0x7c;
  344. buf[1] = 0x7c;
  345. ret = 2;
  346. }
  347. }
  348. }
  349. if (ret != RET_ILUNI) {
  350. if (ret != 2) abort();
  351. if (buf[0] < 0x80 && buf[1] < 0x80) {
  352. int count = (state == STATE_JISX0208MS ? 2 : 5);
  353. if (n < count)
  354. return RET_TOOSMALL;
  355. if (state != STATE_JISX0208MS) {
  356. r[0] = ESC;
  357. r[1] = '$';
  358. r[2] = 'B';
  359. r += 3;
  360. state = STATE_JISX0208MS;
  361. }
  362. r[0] = buf[0];
  363. r[1] = buf[1];
  364. conv->ostate = state;
  365. return count;
  366. }
  367. }
  368. /* Try JIS X 0212-1990 and the extensions mentioned above. */
  369. if (wc >= 0xe3ac && wc < 0xe758) {
  370. unsigned short i = wc - 0xe3ac;
  371. buf[0] = (i / 94) + 0x75;
  372. buf[1] = (i % 94) + 0x21;
  373. ret = 2;
  374. } else {
  375. ret = jisx0212_wctomb(conv,buf,wc,2);
  376. if (ret == RET_ILUNI) {
  377. /* Extension of JIS X 0212. */
  378. unsigned char i;
  379. ret = cp50221_0212_ext_wctomb(conv,&i,wc,1);
  380. if (ret == 1) {
  381. i -= 1;
  382. buf[0] = (i / 94) + 0x73;
  383. buf[1] = (i % 94) + 0x21;
  384. ret = 2;
  385. }
  386. }
  387. }
  388. if (ret != RET_ILUNI) {
  389. if (ret != 2) abort();
  390. if (buf[0] < 0x80 && buf[1] < 0x80) {
  391. int count = (state == STATE_JISX0212MS ? 2 : 6);
  392. if (n < count)
  393. return RET_TOOSMALL;
  394. if (state != STATE_JISX0212MS) {
  395. r[0] = ESC;
  396. r[1] = '$';
  397. r[2] = '(';
  398. r[3] = 'D';
  399. r += 4;
  400. state = STATE_JISX0212MS;
  401. }
  402. r[0] = buf[0];
  403. r[1] = buf[1];
  404. conv->ostate = state;
  405. return count;
  406. }
  407. }
  408. return RET_ILUNI;
  409. }
  410. static int
  411. iso2022_jpms_reset (conv_t conv, unsigned char *r, size_t n)
  412. {
  413. state_t state = conv->ostate;
  414. if (state != STATE_ASCII) {
  415. if (n < 3)
  416. return RET_TOOSMALL;
  417. r[0] = ESC;
  418. r[1] = '(';
  419. r[2] = 'B';
  420. /* conv->ostate = 0; will be done by the caller */
  421. return 3;
  422. } else
  423. return 0;
  424. }
  /* Remove the shift state macros defined for the functions above. */
  #undef STATE_ASCII
  #undef STATE_JISX0201ROMAN
  #undef STATE_JISX0201KATAKANA
  #undef STATE_JISX0208MS
  #undef STATE_JISX0212MS