gentranslit.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. /* Copyright (C) 1999-2003, 2005, 2011-2012, 2016 Free Software Foundation, Inc.
  2. This file is part of the GNU LIBICONV Library.
  3. The GNU LIBICONV Library is free software; you can redistribute it
  4. and/or modify it under the terms of the GNU Library General Public
  5. License as published by the Free Software Foundation; either version 2
  6. of the License, or (at your option) any later version.
  7. The GNU LIBICONV Library is distributed in the hope that it will be
  8. useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Library General Public License for more details.
  11. You should have received a copy of the GNU Library General Public
  12. License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  13. If not, see <http://www.gnu.org/licenses/>. */
  14. /*
  15. * Generates a table of small strings, used for transliteration, from a table
  16. * containing lines of the form
  17. * Unicode <tab> utf-8 replacement <tab> # comment
  18. */
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <stdbool.h>
  22. int main (int argc, char *argv[])
  23. {
  24. unsigned int *data;
  25. int *uni2index;
  26. int index;
  27. if (argc != 1)
  28. exit(1);
  29. data = malloc(0x100000 * sizeof(*data));
  30. uni2index = malloc(0x110000 * sizeof(*uni2index));
  31. if (data == NULL || uni2index == NULL) {
  32. fprintf(stderr, "out of memory\n");
  33. exit(1);
  34. }
  35. printf("/*\n");
  36. printf(" * Copyright (C) 1999-2003 Free Software Foundation, Inc.\n");
  37. printf(" * This file is part of the GNU LIBICONV Library.\n");
  38. printf(" *\n");
  39. printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");
  40. printf(" * and/or modify it under the terms of the GNU Library General Public\n");
  41. printf(" * License as published by the Free Software Foundation; either version 2\n");
  42. printf(" * of the License, or (at your option) any later version.\n");
  43. printf(" *\n");
  44. printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");
  45. printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  46. printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
  47. printf(" * Library General Public License for more details.\n");
  48. printf(" *\n");
  49. printf(" * You should have received a copy of the GNU Library General Public\n");
  50. printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
  51. printf(" * If not, see <http://www.gnu.org/licenses/>.\n");
  52. printf(" */\n");
  53. printf("\n");
  54. printf("/*\n");
  55. printf(" * Transliteration table\n");
  56. printf(" */\n");
  57. printf("\n");
  58. {
  59. int c;
  60. int j;
  61. for (j = 0; j < 0x110000; j++)
  62. uni2index[j] = -1;
  63. index = 0;
  64. for (;;) {
  65. c = getc(stdin);
  66. if (c == EOF)
  67. break;
  68. if (c == '#') {
  69. do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
  70. continue;
  71. }
  72. ungetc(c,stdin);
  73. if (scanf("%x",&j) != 1)
  74. exit(1);
  75. c = getc(stdin);
  76. if (c != '\t')
  77. exit(1);
  78. for (;;) {
  79. c = getc(stdin);
  80. if (c == EOF || c == '\n')
  81. exit(1);
  82. if (c == '\t')
  83. break;
  84. if (uni2index[j] < 0) {
  85. uni2index[j] = index;
  86. data[index++] = 0;
  87. }
  88. if (c >= 0x80) {
  89. /* Finish reading an UTF-8 character. */
  90. if (c < 0xc0)
  91. exit(1);
  92. else {
  93. unsigned int i = (c < 0xe0 ? 2 : c < 0xf0 ? 3 : c < 0xf8 ? 4 : c < 0xfc ? 5 : 6);
  94. c &= (1 << (8-i)) - 1;
  95. while (--i > 0) {
  96. int cc = getc(stdin);
  97. if (!(cc >= 0x80 && cc < 0xc0))
  98. exit(1);
  99. c <<= 6; c |= (cc & 0x3f);
  100. }
  101. }
  102. }
  103. data[index++] = (unsigned int) c;
  104. }
  105. if (uni2index[j] >= 0)
  106. data[uni2index[j]] = index - uni2index[j] - 1;
  107. do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
  108. }
  109. }
  110. printf("static const unsigned int translit_data[%d] = {",index);
  111. {
  112. int i;
  113. for (i = 0; i < index; i++) {
  114. if (data[i] < 32)
  115. printf("\n %3d,",data[i]);
  116. else if (data[i] == '\'')
  117. printf("'\\'',");
  118. else if (data[i] == '\\')
  119. printf("'\\\\',");
  120. else if (data[i] < 127)
  121. printf(" '%c',",data[i]);
  122. else if (data[i] < 256)
  123. printf("0x%02X,",data[i]);
  124. else
  125. printf("0x%04X,",data[i]);
  126. }
  127. printf("\n};\n");
  128. }
  129. printf("\n");
  130. {
  131. int line[0x22000];
  132. int tableno;
  133. struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
  134. int i, j, p, j1, j2, t;
  135. for (j1 = 0; j1 < 0x22000; j1++) {
  136. bool all_invalid = true;
  137. for (j2 = 0; j2 < 8; j2++) {
  138. j = 8*j1+j2;
  139. if (uni2index[j] >= 0)
  140. all_invalid = false;
  141. }
  142. if (all_invalid)
  143. line[j1] = -1;
  144. else
  145. line[j1] = 0;
  146. }
  147. tableno = 0;
  148. for (j1 = 0; j1 < 0x22000; j1++) {
  149. if (line[j1] >= 0) {
  150. if (tableno > 0
  151. && ((j1 > 0 && line[j1-1] == tableno-1)
  152. || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
  153. && j1 - tables[tableno-1].maxline <= 8))) {
  154. line[j1] = tableno-1;
  155. tables[tableno-1].maxline = j1;
  156. } else {
  157. tableno++;
  158. line[j1] = tableno-1;
  159. tables[tableno-1].minline = tables[tableno-1].maxline = j1;
  160. }
  161. }
  162. }
  163. for (t = 0; t < tableno; t++) {
  164. tables[t].usecount = 0;
  165. j1 = 8*tables[t].minline;
  166. j2 = 8*(tables[t].maxline+1);
  167. for (j = j1; j < j2; j++)
  168. if (uni2index[j] >= 0)
  169. tables[t].usecount++;
  170. }
  171. for (t = 0, p = -1, i = 0; t < tableno; t++) {
  172. if (tables[t].usecount > 1) {
  173. char* s;
  174. if (p == tables[t].minline >> 5) {
  175. s = (char*) malloc(4+1+2+1);
  176. sprintf(s, "%02x_%d", p, ++i);
  177. } else {
  178. p = tables[t].minline >> 5;
  179. s = (char*) malloc(4+1);
  180. sprintf(s, "%02x", p);
  181. }
  182. tables[t].suffix = s;
  183. } else
  184. tables[t].suffix = NULL;
  185. }
  186. {
  187. p = -1;
  188. for (t = 0; t < tableno; t++)
  189. if (tables[t].usecount > 1) {
  190. p = 0;
  191. printf("static const short translit_page%s[%d] = {\n", tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
  192. for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
  193. if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
  194. printf(" /* 0x%04x */\n", 8*j1);
  195. printf(" ");
  196. for (j2 = 0; j2 < 8; j2++) {
  197. j = 8*j1+j2;
  198. printf(" %4d,", uni2index[j]);
  199. }
  200. printf(" /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
  201. }
  202. printf("};\n");
  203. }
  204. if (p >= 0)
  205. printf("\n");
  206. }
  207. printf("#define translit_index(wc) \\\n (");
  208. for (j1 = 0; j1 < 0x22000;) {
  209. t = line[j1];
  210. for (j2 = j1; j2 < 0x22000 && line[j2] == t; j2++);
  211. if (t >= 0) {
  212. if (j1 != tables[t].minline) abort();
  213. if (j2 > tables[t].maxline+1) abort();
  214. j2 = tables[t].maxline+1;
  215. }
  216. if (t == -1) {
  217. } else {
  218. if (t >= 0 && tables[t].usecount == 0) abort();
  219. if (t >= 0 && tables[t].usecount == 1) {
  220. if (j2 != j1+1) abort();
  221. for (j = 8*j1; j < 8*j2; j++)
  222. if (uni2index[j] >= 0) {
  223. printf("wc == 0x%04x ? %d", j, uni2index[j]);
  224. break;
  225. }
  226. } else {
  227. if (j1 == 0) {
  228. printf("wc < 0x%04x", 8*j2);
  229. } else {
  230. printf("wc >= 0x%04x && wc < 0x%04x", 8*j1, 8*j2);
  231. }
  232. printf(" ? translit_page%s[wc", tables[t].suffix);
  233. if (tables[t].minline > 0)
  234. printf("-0x%04x", 8*j1);
  235. printf("]");
  236. }
  237. printf(" : \\\n ");
  238. }
  239. j1 = j2;
  240. }
  241. printf("-1)\n");
  242. }
  243. if (ferror(stdout) || fclose(stdout))
  244. exit(1);
  245. exit(0);
  246. }