gentranslit.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. /* Copyright (C) 1999-2003, 2005, 2011-2012, 2016, 2018, 2020 Free Software Foundation, Inc.
  2. This file is part of the GNU LIBICONV Library.
  3. The GNU LIBICONV Library is free software; you can redistribute it
  4. and/or modify it under the terms of the GNU Lesser General Public
  5. License as published by the Free Software Foundation; either version 2.1
  6. of the License, or (at your option) any later version.
  7. The GNU LIBICONV Library is distributed in the hope that it will be
  8. useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public
  12. License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  13. If not, see <https://www.gnu.org/licenses/>. */
  /*
   * Generates a table of small strings, used for transliteration, from a table
   * read from standard input and containing lines of the form
   *   Unicode code point (hexadecimal) <tab> utf-8 replacement <tab> # comment
   * Lines starting with '#' are ignored.
   */
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <stdbool.h>
  /* Sizes of the working tables.  A "line" is a run of 8 consecutive code
     points; the page tables are assembled from whole lines. */
  enum {
    UNICODE_LIMIT = 0x110000,        /* entries in uni2index[], one per code point */
    DATA_CAPACITY = 0x100000,        /* entries allocated for data[] */
    LINE_COUNT = UNICODE_LIMIT / 8,  /* 0x22000 lines of 8 code points */
    MAX_TABLES = 0x2000              /* entries allocated for tables[] */
  };

  /* A run of lines that is emitted either as one translit_page array or,
     when it holds a single entry, as one comparison in translit_index. */
  struct table {
    int minline;   /* first line of the run */
    int maxline;   /* last line of the run */
    int usecount;  /* number of code points in the run that have an entry */
    char *suffix;  /* name suffix of the page array; NULL when usecount <= 1 */
  };

  /* Reports a fatal error on stderr and terminates with exit status 1. */
  static void die (const char *message)
  {
    fprintf(stderr, "gentranslit: %s\n", message);
    exit(1);
  }

  /* malloc() wrapper that terminates the program when memory is exhausted. */
  static void *xmalloc (size_t size)
  {
    void *p = malloc(size);
    if (p == NULL) {
      fprintf(stderr, "out of memory\n");
      exit(1);
    }
    return p;
  }

  /* Writes the license and title comments of the generated file to stdout. */
  static void print_header (void)
  {
    printf("/*\n");
    printf(" * Copyright (C) 1999-2003 Free Software Foundation, Inc.\n");
    printf(" * This file is part of the GNU LIBICONV Library.\n");
    printf(" *\n");
    printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");
    printf(" * and/or modify it under the terms of the GNU Lesser General Public\n");
    printf(" * License as published by the Free Software Foundation; either version 2\n");
    printf(" * of the License, or (at your option) any later version.\n");
    printf(" *\n");
    printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");
    printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
    printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
    printf(" * Lesser General Public License for more details.\n");
    printf(" *\n");
    printf(" * You should have received a copy of the GNU Lesser General Public\n");
    printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
    printf(" * If not, see <https://www.gnu.org/licenses/>.\n");
    printf(" */\n");
    printf("\n");
    printf("/*\n");
    printf(" * Transliteration table\n");
    printf(" */\n");
    printf("\n");
  }

  /* Consumes stdin up to and including the next newline, or up to EOF. */
  static void skip_rest_of_line (void)
  {
    int c;
    do {
      c = getc(stdin);
    } while (!(c == EOF || c == '\n'));
  }

  /* Appends VALUE to data[], refusing to write past the allocated capacity. */
  static void append_data (unsigned int *data, int *count, unsigned int value)
  {
    if (*count >= DATA_CAPACITY)
      die("too much replacement data");
    data[(*count)++] = value;
  }

  /* Parses the input table from stdin.
     For every code point that has a non-empty replacement, uni2index[] receives
     the position in data[] of a length slot, which is followed by the decoded
     characters of the replacement.  All other uni2index[] entries are -1.
     Returns the number of data[] entries used.  Malformed input terminates the
     program with exit status 1. */
  static int read_table (unsigned int *data, int *uni2index)
  {
    int count = 0;
    int j;

    for (j = 0; j < UNICODE_LIMIT; j++)
      uni2index[j] = -1;

    for (;;) {
      unsigned int code;
      int c = getc(stdin);

      if (c == EOF)
        break;
      if (c == '#') {
        skip_rest_of_line();
        continue;
      }
      ungetc(c, stdin);

      /* %x stores through an unsigned int pointer; validate the value before
         it is used as an index into uni2index[]. */
      if (scanf("%x", &code) != 1)
        die("expected a hexadecimal code point");
      if (code >= UNICODE_LIMIT)
        die("code point out of range");
      j = (int) code;

      if (getc(stdin) != '\t')
        die("expected a tab after the code point");

      /* Read the replacement, one UTF-8 encoded character per iteration,
         up to the terminating tab. */
      for (;;) {
        c = getc(stdin);
        if (c == EOF || c == '\n')
          die("replacement is not terminated by a tab");
        if (c == '\t')
          break;
        if (uni2index[j] < 0) {
          /* First character for this code point: reserve the length slot. */
          uni2index[j] = count;
          append_data(data, &count, 0);
        }
        if (c >= 0x80) {
          /* Finish reading an UTF-8 character. */
          unsigned int i;
          /* 0x80..0xBF are continuation bytes.  0xFE and 0xFF would make the
             shifts below overflow a signed int. */
          if (c < 0xc0 || c >= 0xfe)
            die("invalid UTF-8 lead byte");
          i = (c < 0xe0 ? 2 : c < 0xf0 ? 3 : c < 0xf8 ? 4 : c < 0xfc ? 5 : 6);
          c &= (1 << (8-i)) - 1;
          while (--i > 0) {
            int cc = getc(stdin);
            if (!(cc >= 0x80 && cc < 0xc0))
              die("invalid UTF-8 continuation byte");
            c <<= 6;
            c |= (cc & 0x3f);
          }
        }
        append_data(data, &count, (unsigned int) c);
      }

      /* Store the number of entries that follow the length slot. */
      if (uni2index[j] >= 0)
        data[uni2index[j]] = (unsigned int) (count - uni2index[j] - 1);

      /* Ignore the comment part of the line. */
      skip_rest_of_line();
    }
    return count;
  }

  /* Emits the translit_data[] array definition to stdout.  Values below 32
     start a new output line. */
  static void print_data (const unsigned int *data, int count)
  {
    int i;

    printf("static const unsigned int translit_data[%d] = {",count);
    for (i = 0; i < count; i++) {
      if (data[i] < 32)
        printf("\n %3d,",(int) data[i]);
      else if (data[i] == '\'')
        printf("'\\'',");
      else if (data[i] == '\\')
        printf("'\\\\',");
      else if (data[i] < 127)
        printf(" '%c',",(int) data[i]);
      else if (data[i] < 256)
        printf("0x%02X,",data[i]);
      else
        printf("0x%04X,",data[i]);
    }
    printf("\n};\n");
  }

  /* Emits the translit_page* arrays and the translit_index(wc) macro to stdout,
     derived from uni2index[] (UNICODE_LIMIT entries). */
  static void print_pages_and_index (const int *uni2index)
  {
    /* Both arrays are several hundred kilobytes; keep them off the stack. */
    int *line = xmalloc(LINE_COUNT * sizeof *line);
    struct table *tables = xmalloc(MAX_TABLES * sizeof *tables);
    int tableno;
    int i, j, p, j1, j2, t;
    bool any_page;

    /* line[j1] is -1 when none of the 8 code points of the line has an entry;
       otherwise it is set to 0 here and to the table number below. */
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      bool all_invalid = true;
      for (j2 = 0; j2 < 8; j2++) {
        j = 8*j1+j2;
        if (uni2index[j] >= 0)
          all_invalid = false;
      }
      line[j1] = (all_invalid ? -1 : 0);
    }

    /* Group used lines into tables.  A line joins the previous table when it
       directly follows it, or when it lies in the same block of 32 lines and
       at most 8 lines after the table's current last line. */
    tableno = 0;
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      if (line[j1] < 0)
        continue;
      if (tableno > 0
          && ((j1 > 0 && line[j1-1] == tableno-1)
              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                  && j1 - tables[tableno-1].maxline <= 8))) {
        line[j1] = tableno-1;
        tables[tableno-1].maxline = j1;
      } else {
        if (tableno >= MAX_TABLES)
          die("too many tables");
        line[j1] = tableno;
        tables[tableno].minline = tables[tableno].maxline = j1;
        tableno++;
      }
    }

    /* Count the entries of each table. */
    for (t = 0; t < tableno; t++) {
      tables[t].usecount = 0;
      j1 = 8*tables[t].minline;
      j2 = 8*(tables[t].maxline+1);
      for (j = j1; j < j2; j++)
        if (uni2index[j] >= 0)
          tables[t].usecount++;
    }

    /* Name the tables that get a page array: the block number in hex, plus
       "_<n>" for the second and later tables of the same block. */
    for (t = 0, p = -1, i = 0; t < tableno; t++) {
      if (tables[t].usecount > 1) {
        char *s;
        if (p == (tables[t].minline >> 5)) {
          size_t size = 4+1+2+1;
          i++;
          /* i is the number of tables with the same (tables[t].minline >> 5)
             that we have seen so far. Since the tables[t].minline values are
             strongly monotonically increasing, there are at most 32 of them. */
          if (!(i >= 0 && i <= 32)) abort();
          s = xmalloc(size);
          snprintf(s, size, "%02x_%d", (unsigned int) p, i);
        } else {
          size_t size = 4+1;
          p = tables[t].minline >> 5;
          i = 0;
          s = xmalloc(size);
          snprintf(s, size, "%02x", (unsigned int) p);
        }
        tables[t].suffix = s;
      } else
        tables[t].suffix = NULL;
    }

    /* Emit one page array per table that has more than one entry. */
    any_page = false;
    for (t = 0; t < tableno; t++) {
      if (tables[t].usecount <= 1)
        continue;
      any_page = true;
      printf("static const short translit_page%s[%d] = {\n", tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
      for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
        if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
          printf(" /* 0x%04x */\n", (unsigned int) (8*j1));
        printf(" ");
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          printf(" %4d,", uni2index[j]);
        }
        printf(" /* 0x%02x-0x%02x */\n", (unsigned int) (8*(j1 % 0x20)), (unsigned int) (8*(j1 % 0x20)+7));
      }
      printf("};\n");
    }
    if (any_page)
      printf("\n");

    /* Emit the lookup macro: a chain of conditional expressions, one per
       table, ending in -1. */
    printf("#define translit_index(wc) \\\n (");
    for (j1 = 0; j1 < LINE_COUNT; j1 = j2) {
      t = line[j1];
      for (j2 = j1; j2 < LINE_COUNT && line[j2] == t; j2++)
        ;
      if (t < 0)
        continue;  /* run of unused lines: nothing to emit */

      /* A table may contain unused lines, so extend the run to its real end. */
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline+1) abort();
      j2 = tables[t].maxline+1;

      if (tables[t].usecount == 0) abort();
      if (tables[t].usecount == 1) {
        /* Single entry: compare against the code point directly. */
        if (j2 != j1+1) abort();
        for (j = 8*j1; j < 8*j2; j++)
          if (uni2index[j] >= 0) {
            printf("wc == 0x%04x ? %d", (unsigned int) j, uni2index[j]);
            break;
          }
      } else {
        if (j1 == 0) {
          printf("wc < 0x%04x", (unsigned int) (8*j2));
        } else {
          printf("wc >= 0x%04x && wc < 0x%04x", (unsigned int) (8*j1), (unsigned int) (8*j2));
        }
        printf(" ? translit_page%s[wc", tables[t].suffix);
        if (tables[t].minline > 0)
          printf("-0x%04x", (unsigned int) (8*j1));
        printf("]");
      }
      printf(" : \\\n ");
    }
    printf("-1)\n");

    for (t = 0; t < tableno; t++)
      free(tables[t].suffix);
    free(tables);
    free(line);
  }

  /* Reads the transliteration table from stdin and writes the generated C
     tables to stdout.  Takes no command-line arguments.  Exits with status 0
     on success and 1 on a usage error, malformed input, memory exhaustion or
     an output error. */
  int main (int argc, char *argv[])
  {
    unsigned int *data;
    int *uni2index;
    int count;

    (void) argv;
    if (argc != 1) {
      fprintf(stderr, "usage: gentranslit < input > output\n");
      exit(1);
    }

    data = xmalloc(DATA_CAPACITY * sizeof *data);
    uni2index = xmalloc(UNICODE_LIMIT * sizeof *uni2index);

    print_header();
    count = read_table(data, uni2index);
    print_data(data, count);
    printf("\n");
    print_pages_and_index(uni2index);

    free(uni2index);
    free(data);

    if (ferror(stdout) || fclose(stdout))
      exit(1);
    exit(0);
  }