/* johab_hangul.h */
  1. /*
  2. * Copyright (C) 1999-2001, 2016 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either version 2.1
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, see <https://www.gnu.org/licenses/>.
  18. */
  19. /*
  20. * JOHAB Hangul
  21. *
  22. * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
  23. * "Hangul can be composed of two or three jamo (some jamo are considered
  24. * compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
  25. * and 27 final jamo (consonants; 28 when you include the "fill" character
  26. * for Hangul containing only two jamo). Multiplying these numbers results in
  27. * 11172."
  28. *
  29. * Structure of the Johab encoding (see p. 181-184):
  30. * bit 15 = 1
  31. * bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
  32. * bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
  33. * bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
  34. *
  35. * Structure of the Unicode encoding:
  36. * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
  37. * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
  38. * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
  39. * in ascending order according to Johab encoding and according to the Unicode
  40. * encoding. Now look a little more carefully, and you see that the following
  41. * formula holds:
  42. * unicode == 0xAC00
  43. * + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
  44. * + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
  45. * + jamo_final_index[johab & 31]
  46. * where the index tables are defined as below.
  47. */
  /*
   * Tables mapping 5-bit groups to jamo letters.
   *
   * A table entry XX that is neither NONE nor FILL denotes the jamo whose
   * UHC code is 0xA4A0+XX and whose Unicode code point is 0x3130+XX.
   */
  #define NONE 0xFD  /* the 5-bit value has no jamo assigned */
  #define FILL 0xFF  /* the 5-bit value is the "fill" (absent jamo) marker */
  52. static const unsigned char jamo_initial[32] = {
  53. NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09,
  54. 0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19,
  55. 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE,
  56. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  57. };
  58. static const unsigned char jamo_medial[32] = {
  59. NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23,
  60. NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
  61. NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  62. NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE,
  63. };
  64. static const unsigned char jamo_final[32] = {
  65. NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  66. 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  67. 0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17,
  68. 0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE,
  69. };
  70. /* Same as jamo_final, except that it excludes characters already
  71. contained in jamo_initial. 11 characters instead of 27. */
  72. static const unsigned char jamo_final_notinitial[32] = {
  73. NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06,
  74. NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  75. 0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE,
  76. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  77. };
  /*
   * Tables mapping 5-bit groups to packed indices.
   *
   * Valid entries are >= 0, so a plain sign test rejects unassigned values.
   */
  #define none (-1)  /* the 5-bit value has no jamo assigned */
  #define fill 0     /* the 5-bit value is the "fill" (absent jamo) marker */
  81. static const signed char jamo_initial_index[32] = {
  82. none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  83. 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  84. 0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none,
  85. none, none, none, none, none, none, none, none,
  86. };
  87. static const signed char jamo_medial_index[32] = {
  88. none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05,
  89. none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
  90. none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  91. none, none, 0x12, 0x13, 0x14, 0x15, none, none,
  92. };
  93. static const signed char jamo_final_index[32] = {
  94. none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  95. 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  96. 0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15,
  97. 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none,
  98. };
  99. static int
  100. johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
  101. {
  102. unsigned char c1 = s[0];
  103. if ((c1 >= 0x84 && c1 <= 0xd3)) {
  104. if (n >= 2) {
  105. unsigned char c2 = s[1];
  106. if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
  107. unsigned int johab = (c1 << 8) | c2;
  108. unsigned int bitspart1 = (johab >> 10) & 31;
  109. unsigned int bitspart2 = (johab >> 5) & 31;
  110. unsigned int bitspart3 = johab & 31;
  111. int index1 = jamo_initial_index[bitspart1];
  112. int index2 = jamo_medial_index[bitspart2];
  113. int index3 = jamo_final_index[bitspart3];
  114. /* Exclude "none" values. */
  115. if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
  116. /* Deal with "fill" values in initial or medial position. */
  117. if (index1 == fill) {
  118. if (index2 == fill) {
  119. unsigned char jamo3 = jamo_final_notinitial[bitspart3];
  120. if (jamo3 != NONE) {
  121. *pwc = (ucs4_t) 0x3130 + jamo3;
  122. return 2;
  123. }
  124. } else if (index3 == fill) {
  125. unsigned char jamo2 = jamo_medial[bitspart2];
  126. if (jamo2 != NONE && jamo2 != FILL) {
  127. *pwc = (ucs4_t) 0x3130 + jamo2;
  128. return 2;
  129. }
  130. }
  131. /* Syllables composed only of medial and final don't exist. */
  132. } else if (index2 == fill) {
  133. if (index3 == fill) {
  134. unsigned char jamo1 = jamo_initial[bitspart1];
  135. if (jamo1 != NONE && jamo1 != FILL) {
  136. *pwc = (ucs4_t) 0x3130 + jamo1;
  137. return 2;
  138. }
  139. }
  140. /* Syllables composed only of initial and final don't exist. */
  141. } else {
  142. /* index1 and index2 are not fill, but index3 may be fill. */
  143. /* Nothing more to exclude. All 11172 code points are valid. */
  144. *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
  145. return 2;
  146. }
  147. }
  148. }
  149. return RET_ILSEQ;
  150. }
  151. return RET_TOOFEW(0);
  152. }
  153. return RET_ILSEQ;
  154. }
  /* Johab codes of the 51 jamo U+3131..U+3163, indexed by (code point - 0x3131):
     19 initial, 21 medial, 11 final that are not also initial. */
  static const unsigned short johab_hangul_page31[51] = {
    /* U+3131..U+3137 */
    0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
    /* U+3138..U+313F */
    0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
    /* U+3140..U+3147 */
    0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
    /* U+3148..U+314F */
    0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
    /* U+3150..U+3157 */
    0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
    /* U+3158..U+315F */
    0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
    /* U+3160..U+3163 */
    0x8741, 0x8761, 0x8781, 0x87a1,
  };
  /* Tables mapping packed indices to 5-bit groups. */
  /* Inverse of jamo_initial_index for the 19 initial consonants:
       index1+1 = jamo_initial_index[bitspart1]
         <==>  bitspart1 = jamo_initial_index_inverse[index1] */
  static const char jamo_initial_index_inverse[19] = {
    /*  0.. 4 */ 0x02, 0x03, 0x04, 0x05, 0x06,
    /*  5.. 9 */ 0x07, 0x08, 0x09, 0x0a, 0x0b,
    /* 10..14 */ 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    /* 15..18 */ 0x11, 0x12, 0x13, 0x14,
  };
  /* Inverse of jamo_medial_index for the 21 vowels:
       index2+1 = jamo_medial_index[bitspart2]
         <==>  bitspart2 = jamo_medial_index_inverse[index2]
     Each row is one run of consecutive 5-bit values. */
  static const char jamo_medial_index_inverse[21] = {
    /*  0.. 4 */ 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  5..10 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 11..16 */ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 17..20 */ 0x1a, 0x1b, 0x1c, 0x1d,
  };
  /* Inverse of jamo_final_index for the fill marker plus 27 final consonants:
       index3 = jamo_final_index[bitspart3]
         <==>  bitspart3 = jamo_final_index_inverse[index3]
     The 5-bit value 0x12 is unassigned and therefore skipped. */
  static const char jamo_final_index_inverse[28] = {
    /*  0.. 6 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  7..13 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 14..20 */ 0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16,
    /* 21..27 */ 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
  };
  189. static int
  190. johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
  191. {
  192. if (n >= 2) {
  193. if (wc >= 0x3131 && wc < 0x3164) {
  194. unsigned short c = johab_hangul_page31[wc-0x3131];
  195. r[0] = (c >> 8); r[1] = (c & 0xff);
  196. return 2;
  197. } else if (wc >= 0xac00 && wc < 0xd7a4) {
  198. unsigned int index1;
  199. unsigned int index2;
  200. unsigned int index3;
  201. unsigned short c;
  202. unsigned int tmp = wc - 0xac00;
  203. index3 = tmp % 28; tmp = tmp / 28;
  204. index2 = tmp % 21; tmp = tmp / 21;
  205. index1 = tmp;
  206. c = (((((1 << 5)
  207. | jamo_initial_index_inverse[index1]) << 5)
  208. | jamo_medial_index_inverse[index2]) << 5)
  209. | jamo_final_index_inverse[index3];
  210. r[0] = (c >> 8); r[1] = (c & 0xff);
  211. return 2;
  212. }
  213. return RET_ILUNI;
  214. }
  215. return RET_TOOSMALL;
  216. }
  217. /*
  218. * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
  219. */
  220. /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
  221. Return RET_ILUNI if decomposition is not possible. */
  222. static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
  223. {
  224. unsigned char buf[2];
  225. int ret = johab_hangul_wctomb(conv,buf,wc,2);
  226. if (ret != RET_ILUNI) {
  227. unsigned int hangul = (buf[0] << 8) | buf[1];
  228. unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
  229. unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
  230. unsigned char jamo3 = jamo_final[hangul & 31];
  231. if ((hangul >> 15) != 1) abort();
  232. if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
  233. /* They are not all three == FILL because that would correspond to
  234. johab = 0x8441, which doesn't exist. */
  235. ucs4_t* p = r;
  236. if (jamo1 != FILL)
  237. *p++ = 0x3130 + jamo1;
  238. if (jamo2 != FILL)
  239. *p++ = 0x3130 + jamo2;
  240. if (jamo3 != FILL)
  241. *p++ = 0x3130 + jamo3;
  242. return p-r;
  243. }
  244. }
  245. return RET_ILUNI;
  246. }
  247. #undef fill
  248. #undef none
  249. #undef FILL
  250. #undef NONE