iconv_open1.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  /*
   * Copyright (C) 1999-2008, 2011, 2018, 2020 Free Software Foundation, Inc.
   * This file is part of the GNU LIBICONV Library.
   *
   * The GNU LIBICONV Library is free software; you can redistribute it
   * and/or modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
   * The GNU LIBICONV Library is distributed in the hope that it will be
   * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   * If not, see <https://www.gnu.org/licenses/>.
   */
  /* Part 1 of iconv_open.
     Input: const char* tocode, const char* fromcode.
     Output:
       unsigned int from_index;
       int from_wchar;
       unsigned int to_index;
       int to_wchar;
       int transliterate;
       int discard_ilseq;
     Jumps to 'invalid' in case of error.
   */
  30. {
  31. char buf[MAX_WORD_LENGTH+10+1];
  32. const char* cp;
  33. char* bp;
  34. const struct alias * ap;
  35. unsigned int count;
  36. transliterate = 0;
  37. discard_ilseq = 0;
  38. /* Before calling aliases_lookup, convert the input string to upper case,
  39. * and check whether it's entirely ASCII (we call gperf with option "-7"
  40. * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
  41. * or if it's too long, it is not a valid encoding name.
  42. */
  43. for (to_wchar = 0;;) {
  44. /* Search tocode in the table. */
  45. for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
  46. unsigned char c = (unsigned char) *cp;
  47. if (c >= 0x80)
  48. goto invalid;
  49. if (c >= 'a' && c <= 'z')
  50. c -= 'a'-'A';
  51. *bp = c;
  52. if (c == '\0')
  53. break;
  54. if (--count == 0)
  55. goto invalid;
  56. }
  57. for (;;) {
  58. if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
  59. bp -= 10;
  60. *bp = '\0';
  61. transliterate = 1;
  62. continue;
  63. }
  64. if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
  65. bp -= 8;
  66. *bp = '\0';
  67. discard_ilseq = 1;
  68. continue;
  69. }
  70. break;
  71. }
  72. if (buf[0] == '\0') {
  73. tocode = locale_charset();
  74. /* Avoid an endless loop that could occur when using an older version
  75. of localcharset.c. */
  76. if (tocode[0] == '\0')
  77. goto invalid;
  78. continue;
  79. }
  80. ap = aliases_lookup(buf,bp-buf);
  81. if (ap == NULL) {
  82. ap = aliases2_lookup(buf);
  83. if (ap == NULL)
  84. goto invalid;
  85. }
  86. if (ap->encoding_index == ei_local_char) {
  87. tocode = locale_charset();
  88. /* Avoid an endless loop that could occur when using an older version
  89. of localcharset.c. */
  90. if (tocode[0] == '\0')
  91. goto invalid;
  92. continue;
  93. }
  94. if (ap->encoding_index == ei_local_wchar_t) {
  95. /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
  96. This is also the case on native Woe32 systems and Cygwin >= 1.7, where
  97. we know that it is UTF-16. */
  98. #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
  99. if (sizeof(wchar_t) == 4) {
  100. to_index = ei_ucs4internal;
  101. break;
  102. }
  103. if (sizeof(wchar_t) == 2) {
  104. # if WORDS_LITTLEENDIAN
  105. to_index = ei_utf16le;
  106. # else
  107. to_index = ei_utf16be;
  108. # endif
  109. break;
  110. }
  111. #elif __STDC_ISO_10646__
  112. if (sizeof(wchar_t) == 4) {
  113. to_index = ei_ucs4internal;
  114. break;
  115. }
  116. if (sizeof(wchar_t) == 2) {
  117. to_index = ei_ucs2internal;
  118. break;
  119. }
  120. if (sizeof(wchar_t) == 1) {
  121. to_index = ei_iso8859_1;
  122. break;
  123. }
  124. #endif
  125. #if HAVE_MBRTOWC
  126. to_wchar = 1;
  127. tocode = locale_charset();
  128. continue;
  129. #endif
  130. goto invalid;
  131. }
  132. to_index = ap->encoding_index;
  133. break;
  134. }
  135. for (from_wchar = 0;;) {
  136. /* Search fromcode in the table. */
  137. for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
  138. unsigned char c = (unsigned char) *cp;
  139. if (c >= 0x80)
  140. goto invalid;
  141. if (c >= 'a' && c <= 'z')
  142. c -= 'a'-'A';
  143. *bp = c;
  144. if (c == '\0')
  145. break;
  146. if (--count == 0)
  147. goto invalid;
  148. }
  149. for (;;) {
  150. if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
  151. bp -= 10;
  152. *bp = '\0';
  153. continue;
  154. }
  155. if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
  156. bp -= 8;
  157. *bp = '\0';
  158. continue;
  159. }
  160. break;
  161. }
  162. if (buf[0] == '\0') {
  163. fromcode = locale_charset();
  164. /* Avoid an endless loop that could occur when using an older version
  165. of localcharset.c. */
  166. if (fromcode[0] == '\0')
  167. goto invalid;
  168. continue;
  169. }
  170. ap = aliases_lookup(buf,bp-buf);
  171. if (ap == NULL) {
  172. ap = aliases2_lookup(buf);
  173. if (ap == NULL)
  174. goto invalid;
  175. }
  176. if (ap->encoding_index == ei_local_char) {
  177. fromcode = locale_charset();
  178. /* Avoid an endless loop that could occur when using an older version
  179. of localcharset.c. */
  180. if (fromcode[0] == '\0')
  181. goto invalid;
  182. continue;
  183. }
  184. if (ap->encoding_index == ei_local_wchar_t) {
  185. /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
  186. This is also the case on native Woe32 systems and Cygwin >= 1.7, where
  187. we know that it is UTF-16. */
  188. #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
  189. if (sizeof(wchar_t) == 4) {
  190. from_index = ei_ucs4internal;
  191. break;
  192. }
  193. if (sizeof(wchar_t) == 2) {
  194. # if WORDS_LITTLEENDIAN
  195. from_index = ei_utf16le;
  196. # else
  197. from_index = ei_utf16be;
  198. # endif
  199. break;
  200. }
  201. #elif __STDC_ISO_10646__
  202. if (sizeof(wchar_t) == 4) {
  203. from_index = ei_ucs4internal;
  204. break;
  205. }
  206. if (sizeof(wchar_t) == 2) {
  207. from_index = ei_ucs2internal;
  208. break;
  209. }
  210. if (sizeof(wchar_t) == 1) {
  211. from_index = ei_iso8859_1;
  212. break;
  213. }
  214. #endif
  215. #if HAVE_WCRTOMB
  216. from_wchar = 1;
  217. fromcode = locale_charset();
  218. continue;
  219. #endif
  220. goto invalid;
  221. }
  222. from_index = ap->encoding_index;
  223. break;
  224. }
  225. }