iso2022_cn.h 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. /*
  2. * Copyright (C) 1999-2001, 2008, 2016 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either version 2.1
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, see <https://www.gnu.org/licenses/>.
  18. */
  19. /*
  20. * ISO-2022-CN
  21. */
  22. /* Specification: RFC 1922 */
  /* Control bytes that drive the ISO-2022-CN shift and designation logic. */
  #define ESC 0x1b
  #define SO 0x0e
  #define SI 0x0f

  /*
   * The conversion state packs three independent fields into one integer.
   *
   * Bits 0..7: the active shift state.
   */
  #define STATE_ASCII 0
  #define STATE_TWOBYTE 1

  /*
   * Bits 8..15: the character set used while in STATE_TWOBYTE.
   */
  #define STATE2_NONE 0
  #define STATE2_DESIGNATED_GB2312 1
  #define STATE2_DESIGNATED_CNS11643_1 2

  /*
   * Bits 16 and up: the character set used by the ESC N two-byte sequence.
   */
  #define STATE3_NONE 0
  #define STATE3_DESIGNATED_CNS11643_2 1

  /*
   * SPLIT_STATE unpacks a variable named `state` into the locals state1,
   * state2 and state3.  It expands to declarations, so it must appear where
   * declarations are allowed and be followed by a semicolon.
   */
  #define SPLIT_STATE \
    unsigned int state1 = state & 0xff; \
    unsigned int state2 = (state >> 8) & 0xff; \
    unsigned int state3 = state >> 16

  /*
   * COMBINE_STATE packs state1, state2 and state3 back into `state`.
   */
  #define COMBINE_STATE \
    state = state1 | (state2 << 8) | (state3 << 16)
  46. static int
  47. iso2022_cn_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
  48. {
  49. state_t state = conv->istate;
  50. SPLIT_STATE;
  51. int count = 0;
  52. unsigned char c;
  53. for (;;) {
  54. c = *s;
  55. if (c == ESC) {
  56. if (n < count+4)
  57. goto none;
  58. if (s[1] == '$') {
  59. if (s[2] == ')') {
  60. if (s[3] == 'A') {
  61. state2 = STATE2_DESIGNATED_GB2312;
  62. s += 4; count += 4;
  63. if (n < count+1)
  64. goto none;
  65. continue;
  66. }
  67. if (s[3] == 'G') {
  68. state2 = STATE2_DESIGNATED_CNS11643_1;
  69. s += 4; count += 4;
  70. if (n < count+1)
  71. goto none;
  72. continue;
  73. }
  74. }
  75. if (s[2] == '*') {
  76. if (s[3] == 'H') {
  77. state3 = STATE3_DESIGNATED_CNS11643_2;
  78. s += 4; count += 4;
  79. if (n < count+1)
  80. goto none;
  81. continue;
  82. }
  83. }
  84. }
  85. if (s[1] == 'N') {
  86. switch (state3) {
  87. case STATE3_NONE:
  88. goto ilseq;
  89. case STATE3_DESIGNATED_CNS11643_2:
  90. if (s[2] < 0x80 && s[3] < 0x80) {
  91. int ret = cns11643_2_mbtowc(conv,pwc,s+2,2);
  92. if (ret == RET_ILSEQ)
  93. goto ilseq;
  94. if (ret != 2) abort();
  95. COMBINE_STATE;
  96. conv->istate = state;
  97. return count+4;
  98. } else
  99. goto ilseq;
  100. default: abort();
  101. }
  102. }
  103. goto ilseq;
  104. }
  105. if (c == SO) {
  106. if (state2 != STATE2_DESIGNATED_GB2312 && state2 != STATE2_DESIGNATED_CNS11643_1)
  107. goto ilseq;
  108. state1 = STATE_TWOBYTE;
  109. s++; count++;
  110. if (n < count+1)
  111. goto none;
  112. continue;
  113. }
  114. if (c == SI) {
  115. state1 = STATE_ASCII;
  116. s++; count++;
  117. if (n < count+1)
  118. goto none;
  119. continue;
  120. }
  121. break;
  122. }
  123. switch (state1) {
  124. case STATE_ASCII:
  125. if (c < 0x80) {
  126. int ret = ascii_mbtowc(conv,pwc,s,1);
  127. if (ret == RET_ILSEQ)
  128. goto ilseq;
  129. if (ret != 1) abort();
  130. if (*pwc == 0x000a || *pwc == 0x000d) {
  131. state2 = STATE2_NONE; state3 = STATE3_NONE;
  132. }
  133. COMBINE_STATE;
  134. conv->istate = state;
  135. return count+1;
  136. } else
  137. goto ilseq;
  138. case STATE_TWOBYTE:
  139. if (n < count+2)
  140. goto none;
  141. if (s[0] < 0x80 && s[1] < 0x80) {
  142. int ret;
  143. switch (state2) {
  144. case STATE2_NONE:
  145. goto ilseq;
  146. case STATE2_DESIGNATED_GB2312:
  147. ret = gb2312_mbtowc(conv,pwc,s,2); break;
  148. case STATE2_DESIGNATED_CNS11643_1:
  149. ret = cns11643_1_mbtowc(conv,pwc,s,2); break;
  150. default: abort();
  151. }
  152. if (ret == RET_ILSEQ)
  153. goto ilseq;
  154. if (ret != 2) abort();
  155. COMBINE_STATE;
  156. conv->istate = state;
  157. return count+2;
  158. } else
  159. goto ilseq;
  160. default: abort();
  161. }
  162. none:
  163. COMBINE_STATE;
  164. conv->istate = state;
  165. return RET_TOOFEW(count);
  166. ilseq:
  167. COMBINE_STATE;
  168. conv->istate = state;
  169. return RET_SHIFT_ILSEQ(count);
  170. }
  171. static int
  172. iso2022_cn_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
  173. {
  174. state_t state = conv->ostate;
  175. SPLIT_STATE;
  176. unsigned char buf[3];
  177. int ret;
  178. /* There is no need to handle Unicode 3.1 tag characters and to look for
  179. "zh-CN" or "zh-TW" tags, because GB2312 and CNS11643 are disjoint. */
  180. /* Try ASCII. */
  181. ret = ascii_wctomb(conv,buf,wc,1);
  182. if (ret != RET_ILUNI) {
  183. if (ret != 1) abort();
  184. if (buf[0] < 0x80) {
  185. int count = (state1 == STATE_ASCII ? 1 : 2);
  186. if (n < count)
  187. return RET_TOOSMALL;
  188. if (state1 != STATE_ASCII) {
  189. r[0] = SI;
  190. r += 1;
  191. state1 = STATE_ASCII;
  192. }
  193. r[0] = buf[0];
  194. if (wc == 0x000a || wc == 0x000d) {
  195. state2 = STATE2_NONE; state3 = STATE3_NONE;
  196. }
  197. COMBINE_STATE;
  198. conv->ostate = state;
  199. return count;
  200. }
  201. }
  202. /* Try GB 2312-1980. */
  203. ret = gb2312_wctomb(conv,buf,wc,2);
  204. if (ret != RET_ILUNI) {
  205. if (ret != 2) abort();
  206. if (buf[0] < 0x80 && buf[1] < 0x80) {
  207. int count = (state2 == STATE2_DESIGNATED_GB2312 ? 0 : 4) + (state1 == STATE_TWOBYTE ? 0 : 1) + 2;
  208. if (n < count)
  209. return RET_TOOSMALL;
  210. if (state2 != STATE2_DESIGNATED_GB2312) {
  211. r[0] = ESC;
  212. r[1] = '$';
  213. r[2] = ')';
  214. r[3] = 'A';
  215. r += 4;
  216. state2 = STATE2_DESIGNATED_GB2312;
  217. }
  218. if (state1 != STATE_TWOBYTE) {
  219. r[0] = SO;
  220. r += 1;
  221. state1 = STATE_TWOBYTE;
  222. }
  223. r[0] = buf[0];
  224. r[1] = buf[1];
  225. COMBINE_STATE;
  226. conv->ostate = state;
  227. return count;
  228. }
  229. }
  230. ret = cns11643_wctomb(conv,buf,wc,3);
  231. if (ret != RET_ILUNI) {
  232. if (ret != 3) abort();
  233. /* Try CNS 11643-1992 Plane 1. */
  234. if (buf[0] == 1 && buf[1] < 0x80 && buf[2] < 0x80) {
  235. int count = (state2 == STATE2_DESIGNATED_CNS11643_1 ? 0 : 4) + (state1 == STATE_TWOBYTE ? 0 : 1) + 2;
  236. if (n < count)
  237. return RET_TOOSMALL;
  238. if (state2 != STATE2_DESIGNATED_CNS11643_1) {
  239. r[0] = ESC;
  240. r[1] = '$';
  241. r[2] = ')';
  242. r[3] = 'G';
  243. r += 4;
  244. state2 = STATE2_DESIGNATED_CNS11643_1;
  245. }
  246. if (state1 != STATE_TWOBYTE) {
  247. r[0] = SO;
  248. r += 1;
  249. state1 = STATE_TWOBYTE;
  250. }
  251. r[0] = buf[1];
  252. r[1] = buf[2];
  253. COMBINE_STATE;
  254. conv->ostate = state;
  255. return count;
  256. }
  257. /* Try CNS 11643-1992 Plane 2. */
  258. if (buf[0] == 2 && buf[1] < 0x80 && buf[2] < 0x80) {
  259. int count = (state3 == STATE3_DESIGNATED_CNS11643_2 ? 0 : 4) + 4;
  260. if (n < count)
  261. return RET_TOOSMALL;
  262. if (state3 != STATE3_DESIGNATED_CNS11643_2) {
  263. r[0] = ESC;
  264. r[1] = '$';
  265. r[2] = '*';
  266. r[3] = 'H';
  267. r += 4;
  268. state3 = STATE3_DESIGNATED_CNS11643_2;
  269. }
  270. r[0] = ESC;
  271. r[1] = 'N';
  272. r[2] = buf[1];
  273. r[3] = buf[2];
  274. COMBINE_STATE;
  275. conv->ostate = state;
  276. return count;
  277. }
  278. }
  279. return RET_ILUNI;
  280. }
  281. static int
  282. iso2022_cn_reset (conv_t conv, unsigned char *r, size_t n)
  283. {
  284. state_t state = conv->ostate;
  285. SPLIT_STATE;
  286. (void)state2;
  287. (void)state3;
  288. if (state1 != STATE_ASCII) {
  289. if (n < 1)
  290. return RET_TOOSMALL;
  291. r[0] = SI;
  292. /* conv->ostate = 0; will be done by the caller */
  293. return 1;
  294. } else
  295. return 0;
  296. }
  /*
   * Remove the state helper macros defined by this file so their names can
   * be reused after it.  ESC, SO and SI are intentionally left defined.
   */
  #undef STATE_ASCII
  #undef STATE_TWOBYTE
  #undef STATE2_NONE
  #undef STATE2_DESIGNATED_GB2312
  #undef STATE2_DESIGNATED_CNS11643_1
  #undef STATE3_NONE
  #undef STATE3_DESIGNATED_CNS11643_2
  #undef SPLIT_STATE
  #undef COMBINE_STATE