/* iconv.c */
  1. /*
  2. * Copyright (C) 1999-2008, 2011, 2016, 2018, 2020, 2022 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either version 2.1
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, see <https://www.gnu.org/licenses/>.
  18. */
#include <iconv.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "config.h"
#include "localcharset.h"
  25. #ifdef __CYGWIN__
  26. #include <cygwin/version.h>
  27. #endif
/*
 * Selection of the USE_* macros.  Each USE_* macro that ends up defined
 * pulls the corresponding group of encodings into the tables further down
 * in this file (they are tested with #ifdef around the .def/.h includes).
 */
#if ENABLE_EXTRA
/*
 * Consider all system dependent encodings, for any system,
 * and the extra encodings.
 */
#define USE_AIX
#define USE_OSF1
#define USE_DOS
#define USE_ZOS
#define USE_EXTRA
#else
/*
 * Consider those system dependent encodings that are needed for the
 * current system.
 */
#ifdef _AIX
#define USE_AIX
#endif
#if defined(__osf__) || defined(VMS)
#define USE_OSF1
#endif
#if defined(__DJGPP__) || (defined(_WIN32) && (defined(_MSC_VER) || defined(__MINGW32__)))
#define USE_DOS
#endif
/* Enable the EBCDIC encodings not only on z/OS but also on Linux/s390, for
   easier interoperability between z/OS and Linux/s390. */
#if defined(__MVS__) || (defined(__linux__) && (defined(__s390__) || defined(__s390x__)))
#define USE_ZOS
#endif
#endif
  58. /*
  59. * Data type for general conversion loop.
  60. */
/* Pair of entry points through which iconv() drives a conversion
   descriptor. */
struct loop_funcs {
  /* Called by iconv() when an input buffer is supplied
     (inbuf != NULL && *inbuf != NULL). */
  size_t (*loop_convert) (iconv_t icd,
                          const char* * inbuf, size_t *inbytesleft,
                          char* * outbuf, size_t *outbytesleft);
  /* Called by iconv() when inbuf or *inbuf is NULL. */
  size_t (*loop_reset) (iconv_t icd,
                        char* * outbuf, size_t *outbytesleft);
};
  68. /*
  69. * Converters.
  70. */
  71. #include "converters.h"
  72. /*
  73. * Transliteration tables.
  74. */
  75. #include "cjk_variants.h"
  76. #include "translit.h"
  77. /*
  78. * Table of all supported encodings.
  79. */
/* One row of the all_encodings table, indexed by an ei_* enumerator. */
struct encoding {
  struct mbtowc_funcs ifuncs;  /* conversion multibyte -> unicode */
  struct wctomb_funcs ofuncs;  /* conversion unicode -> multibyte */
  int oflags;                  /* flags for unicode -> multibyte conversion */
};
/* DEFALIAS entries in the .def files expand to nothing here: only
   DEFENCODING entries contribute enumerators and table rows.  DEFALIAS
   stays defined until after the all_encodings table. */
#define DEFALIAS(xxx_alias,xxx) /* nothing */
/* Encoding indices: one enumerator ei_<xxx> per DEFENCODING entry, numbered
   in the order in which the .def files are included below.  The
   all_encodings table includes the same files in the same order, so an
   ei_* value is also an index into that table. */
enum {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  ei_##xxx ,
#include "encodings.def"
#ifdef USE_AIX
# include "encodings_aix.def"
#endif
#ifdef USE_OSF1
# include "encodings_osf1.def"
#endif
#ifdef USE_DOS
# include "encodings_dos.def"
#endif
#ifdef USE_ZOS
# include "encodings_zos.def"
#endif
#ifdef USE_EXTRA
# include "encodings_extra.def"
#endif
#include "encodings_local.def"
#undef DEFENCODING
/* Terminating enumerator, so that the list does not end with a comma. */
ei_for_broken_compilers_that_dont_like_trailing_commas
};
  109. #include "flags.h"
/* Table of converter functions and output flags, one row per DEFENCODING
   entry.  The .def files are included in the same order as in the enum of
   ei_* indices, so all_encodings[ei_xxx] is the row for encoding xxx. */
static struct encoding const all_encodings[] = {
/* Regular encodings: the output flags come from the ei_<xxx>_oflags
   constants. */
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, ei_##xxx##_oflags },
#include "encodings.def"
#ifdef USE_AIX
# include "encodings_aix.def"
#endif
#ifdef USE_OSF1
# include "encodings_osf1.def"
#endif
#ifdef USE_DOS
# include "encodings_dos.def"
#endif
#ifdef USE_ZOS
# include "encodings_zos.def"
#endif
#ifdef USE_EXTRA
# include "encodings_extra.def"
#endif
#undef DEFENCODING
/* Entries of encodings_local.def: same layout, but with output flags 0. */
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, 0 },
#include "encodings_local.def"
#undef DEFENCODING
};
/* DEFALIAS is not needed beyond this point. */
#undef DEFALIAS
  136. /*
  137. * Conversion loops.
  138. */
  139. #include "loops.h"
  140. /*
  141. * Alias lookup function.
  142. * Defines
  143. * struct alias { int name; unsigned int encoding_index; };
  144. * const struct alias * aliases_lookup (const char *str, unsigned int len);
  145. * #define MAX_WORD_LENGTH ...
  146. */
  147. #if defined _AIX
  148. # include "aliases_sysaix.h"
  149. #elif defined hpux || defined __hpux
  150. # include "aliases_syshpux.h"
  151. #elif defined __osf__
  152. # include "aliases_sysosf1.h"
  153. #elif defined __sun
  154. # include "aliases_syssolaris.h"
  155. #else
  156. # include "aliases.h"
  157. #endif
  158. /*
  159. * System dependent alias lookup function.
  160. * Defines
  161. * const struct alias * aliases2_lookup (const char *str);
  162. */
  163. #if defined(USE_AIX) || defined(USE_OSF1) || defined(USE_DOS) || defined(USE_ZOS) || defined(USE_EXTRA) /* || ... */
  164. struct stringpool2_t {
  165. #define S(tag,name,encoding_index) char stringpool_##tag[sizeof(name)];
  166. #include "aliases2.h"
  167. #undef S
  168. };
  169. static const struct stringpool2_t stringpool2_contents = {
  170. #define S(tag,name,encoding_index) name,
  171. #include "aliases2.h"
  172. #undef S
  173. };
  174. #define stringpool2 ((const char *) &stringpool2_contents)
  175. static const struct alias sysdep_aliases[] = {
  176. #define S(tag,name,encoding_index) { (int)(long)&((struct stringpool2_t *)0)->stringpool_##tag, encoding_index },
  177. #include "aliases2.h"
  178. #undef S
  179. };
  180. #ifdef __GNUC__
  181. __inline
  182. #else
  183. #ifdef __cplusplus
  184. inline
  185. #endif
  186. #endif
  187. static const struct alias *
  188. aliases2_lookup (register const char *str)
  189. {
  190. const struct alias * ptr;
  191. unsigned int count;
  192. for (ptr = sysdep_aliases, count = sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0]); count > 0; ptr++, count--)
  193. if (!strcmp(str, stringpool2 + ptr->name))
  194. return ptr;
  195. return NULL;
  196. }
#else
/* No system dependent aliases are compiled in: the lookup always fails and
   there is no second string pool.  Note that iconvlist tests
   "#ifndef aliases2_lookup" to find out whether the sysdep_aliases table
   exists, so aliases2_lookup must remain a macro in this branch. */
#define aliases2_lookup(str) NULL
#define stringpool2 NULL
#endif
  201. #if 0
  202. /* Like !strcasecmp, except that the both strings can be assumed to be ASCII
  203. and the first string can be assumed to be in uppercase. */
  204. static int strequal (const char* str1, const char* str2)
  205. {
  206. unsigned char c1;
  207. unsigned char c2;
  208. for (;;) {
  209. c1 = * (unsigned char *) str1++;
  210. c2 = * (unsigned char *) str2++;
  211. if (c1 == 0)
  212. break;
  213. if (c2 >= 'a' && c2 <= 'z')
  214. c2 -= 'a'-'A';
  215. if (c1 != c2)
  216. break;
  217. }
  218. return (c1 == c2);
  219. }
  220. #endif
  221. iconv_t iconv_open (const char* tocode, const char* fromcode)
  222. {
  223. struct conv_struct * cd;
  224. unsigned int from_index;
  225. int from_wchar;
  226. unsigned int to_index;
  227. int to_wchar;
  228. int transliterate;
  229. int discard_ilseq;
  230. #include "iconv_open1.h"
  231. cd = (struct conv_struct *) malloc(from_wchar != to_wchar
  232. ? sizeof(struct wchar_conv_struct)
  233. : sizeof(struct conv_struct));
  234. if (cd == NULL) {
  235. errno = ENOMEM;
  236. return (iconv_t)(-1);
  237. }
  238. #include "iconv_open2.h"
  239. return (iconv_t)cd;
  240. invalid:
  241. errno = EINVAL;
  242. return (iconv_t)(-1);
  243. }
  244. size_t iconv (iconv_t icd,
  245. ICONV_CONST char* * inbuf, size_t *inbytesleft,
  246. char* * outbuf, size_t *outbytesleft)
  247. {
  248. conv_t cd = (conv_t) icd;
  249. if (inbuf == NULL || *inbuf == NULL)
  250. return cd->lfuncs.loop_reset(icd,outbuf,outbytesleft);
  251. else
  252. return cd->lfuncs.loop_convert(icd,
  253. (const char* *)inbuf,inbytesleft,
  254. outbuf,outbytesleft);
  255. }
  256. int iconv_close (iconv_t icd)
  257. {
  258. conv_t cd = (conv_t) icd;
  259. free(cd);
  260. return 0;
  261. }
  262. #ifndef LIBICONV_PLUG
  263. /*
  264. * Verify that a 'struct conv_struct' and a 'struct wchar_conv_struct' each
  265. * fit in an iconv_allocation_t.
  266. * If this verification fails, iconv_allocation_t must be made larger and
  267. * the major version in LIBICONV_VERSION_INFO must be bumped.
  268. * Currently 'struct conv_struct' has 21 integer/pointer fields, and
  269. * 'struct wchar_conv_struct' additionally has an 'mbstate_t' field.
  270. */
/* Compile-time checks: the array size evaluates to 1 when the size
   condition holds and to -1 when it does not, and a negative array size is
   rejected by the compiler. */
typedef int verify_size_1[2 * (sizeof (struct conv_struct) <= sizeof (iconv_allocation_t)) - 1];
typedef int verify_size_2[2 * (sizeof (struct wchar_conv_struct) <= sizeof (iconv_allocation_t)) - 1];
/*
 * Variant of iconv_open that does not call malloc: the conversion
 * descriptor is built in the storage that resultp points to.
 * Returns 0 on success, or -1 with errno set to EINVAL on the `invalid'
 * path.
 *
 * As in iconv_open, the locals are shared by name with the textually
 * included fragments "iconv_open1.h" and "iconv_open2.h"; no visible
 * statement assigns them or jumps to `invalid' (presumably the fragments
 * do -- confirm in those files).
 */
int iconv_open_into (const char* tocode, const char* fromcode,
                     iconv_allocation_t* resultp)
{
  struct conv_struct * cd;
  unsigned int from_index;
  int from_wchar;
  unsigned int to_index;
  int to_wchar;
  int transliterate;
  int discard_ilseq;
#include "iconv_open1.h"
  /* Use the caller's storage as the descriptor. */
  cd = (struct conv_struct *) resultp;
#include "iconv_open2.h"
  return 0;
invalid:
  errno = EINVAL;
  return -1;
}
  291. int iconvctl (iconv_t icd, int request, void* argument)
  292. {
  293. conv_t cd = (conv_t) icd;
  294. switch (request) {
  295. case ICONV_TRIVIALP:
  296. *(int *)argument =
  297. ((cd->lfuncs.loop_convert == unicode_loop_convert
  298. && cd->iindex == cd->oindex)
  299. || cd->lfuncs.loop_convert == wchar_id_loop_convert
  300. ? 1 : 0);
  301. return 0;
  302. case ICONV_GET_TRANSLITERATE:
  303. *(int *)argument = cd->transliterate;
  304. return 0;
  305. case ICONV_SET_TRANSLITERATE:
  306. cd->transliterate = (*(const int *)argument ? 1 : 0);
  307. return 0;
  308. case ICONV_GET_DISCARD_ILSEQ:
  309. *(int *)argument = cd->discard_ilseq;
  310. return 0;
  311. case ICONV_SET_DISCARD_ILSEQ:
  312. cd->discard_ilseq = (*(const int *)argument ? 1 : 0);
  313. return 0;
  314. case ICONV_SET_HOOKS:
  315. if (argument != NULL) {
  316. cd->hooks = *(const struct iconv_hooks *)argument;
  317. } else {
  318. cd->hooks.uc_hook = NULL;
  319. cd->hooks.wc_hook = NULL;
  320. cd->hooks.data = NULL;
  321. }
  322. return 0;
  323. case ICONV_SET_FALLBACKS:
  324. if (argument != NULL) {
  325. cd->fallbacks = *(const struct iconv_fallbacks *)argument;
  326. } else {
  327. cd->fallbacks.mb_to_uc_fallback = NULL;
  328. cd->fallbacks.uc_to_mb_fallback = NULL;
  329. cd->fallbacks.mb_to_wc_fallback = NULL;
  330. cd->fallbacks.wc_to_mb_fallback = NULL;
  331. cd->fallbacks.data = NULL;
  332. }
  333. return 0;
  334. default:
  335. errno = EINVAL;
  336. return -1;
  337. }
  338. }
  339. /* An alias after its name has been converted from 'int' to 'const char*'. */
  340. struct nalias { const char* name; unsigned int encoding_index; };
  341. static int compare_by_index (const void * arg1, const void * arg2)
  342. {
  343. const struct nalias * alias1 = (const struct nalias *) arg1;
  344. const struct nalias * alias2 = (const struct nalias *) arg2;
  345. return (int)alias1->encoding_index - (int)alias2->encoding_index;
  346. }
  347. static int compare_by_name (const void * arg1, const void * arg2)
  348. {
  349. const char * name1 = *(const char * const *)arg1;
  350. const char * name2 = *(const char * const *)arg2;
  351. /* Compare alphabetically, but put "CS" names at the end. */
  352. int sign = strcmp(name1,name2);
  353. if (sign != 0) {
  354. sign = ((name1[0]=='C' && name1[1]=='S') - (name2[0]=='C' && name2[1]=='S'))
  355. * 4 + (sign >= 0 ? 1 : -1);
  356. }
  357. return sign;
  358. }
  359. void iconvlist (int (*do_one) (unsigned int namescount,
  360. const char * const * names,
  361. void* data),
  362. void* data)
  363. {
  364. #define aliascount1 sizeof(aliases)/sizeof(aliases[0])
  365. #ifndef aliases2_lookup
  366. #define aliascount2 sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0])
  367. #else
  368. #define aliascount2 0
  369. #endif
  370. #define aliascount (aliascount1+aliascount2)
  371. struct nalias aliasbuf[aliascount];
  372. const char * namesbuf[aliascount];
  373. size_t num_aliases;
  374. {
  375. /* Put all existing aliases into a buffer. */
  376. size_t i;
  377. size_t j;
  378. j = 0;
  379. for (i = 0; i < aliascount1; i++) {
  380. const struct alias * p = &aliases[i];
  381. if (p->name >= 0
  382. && p->encoding_index != ei_local_char
  383. && p->encoding_index != ei_local_wchar_t) {
  384. aliasbuf[j].name = stringpool + p->name;
  385. aliasbuf[j].encoding_index = p->encoding_index;
  386. j++;
  387. }
  388. }
  389. #ifndef aliases2_lookup
  390. for (i = 0; i < aliascount2; i++) {
  391. aliasbuf[j].name = stringpool2 + sysdep_aliases[i].name;
  392. aliasbuf[j].encoding_index = sysdep_aliases[i].encoding_index;
  393. j++;
  394. }
  395. #endif
  396. num_aliases = j;
  397. }
  398. /* Sort by encoding_index. */
  399. if (num_aliases > 1)
  400. qsort(aliasbuf, num_aliases, sizeof(struct nalias), compare_by_index);
  401. {
  402. /* Process all aliases with the same encoding_index together. */
  403. size_t j;
  404. j = 0;
  405. while (j < num_aliases) {
  406. unsigned int ei = aliasbuf[j].encoding_index;
  407. size_t i = 0;
  408. do
  409. namesbuf[i++] = aliasbuf[j++].name;
  410. while (j < num_aliases && aliasbuf[j].encoding_index == ei);
  411. if (i > 1)
  412. qsort(namesbuf, i, sizeof(const char *), compare_by_name);
  413. /* Call the callback. */
  414. if (do_one(i,namesbuf,data))
  415. break;
  416. }
  417. }
  418. #undef aliascount
  419. #undef aliascount2
  420. #undef aliascount1
  421. }
  422. /*
  423. * Table of canonical names of encodings.
  424. * Instead of strings, it contains offsets into stringpool and stringpool2.
  425. */
/* Indexed by encoding index (see iconv_canonicalize, which returns
   all_canonical[index] + pool).  The contents come entirely from the
   included headers: a base list chosen per system, then one list per
   enabled USE_* group, then the list for the local encodings.
   NOTE(review): the order of the lists presumably has to match the order
   of the .def includes in the ei_* enum -- confirm against the generated
   headers. */
static const unsigned short all_canonical[] = {
#if defined _AIX
# include "canonical_sysaix.h"
#elif defined hpux || defined __hpux
# include "canonical_syshpux.h"
#elif defined __osf__
# include "canonical_sysosf1.h"
#elif defined __sun
# include "canonical_syssolaris.h"
#else
# include "canonical.h"
#endif
#ifdef USE_AIX
# if defined _AIX
#  include "canonical_aix_sysaix.h"
# else
#  include "canonical_aix.h"
# endif
#endif
#ifdef USE_OSF1
# if defined __osf__
#  include "canonical_osf1_sysosf1.h"
# else
#  include "canonical_osf1.h"
# endif
#endif
#ifdef USE_DOS
# include "canonical_dos.h"
#endif
#ifdef USE_ZOS
# include "canonical_zos.h"
#endif
#ifdef USE_EXTRA
# include "canonical_extra.h"
#endif
#if defined _AIX
# include "canonical_local_sysaix.h"
#elif defined hpux || defined __hpux
# include "canonical_local_syshpux.h"
#elif defined __osf__
# include "canonical_local_sysosf1.h"
#elif defined __sun
# include "canonical_local_syssolaris.h"
#else
# include "canonical_local.h"
#endif
};
/*
 * Returns the canonical name of the encoding called name.
 *
 * The name is converted to upper case and any trailing "//TRANSLIT" and
 * "//IGNORE" suffixes are removed before it is looked up, first with
 * aliases_lookup and then with aliases2_lookup.  An empty name, or an alias
 * of the local 'char' encoding, is replaced by the result of
 * locale_charset() and the lookup is repeated.
 *
 * The result points into the string pool in which the alias was found.
 * If the name is not valid (it contains a non-ASCII byte, is too long for
 * buf, is unknown, or locale_charset() returns an empty string), the
 * argument name itself is returned.
 */
const char * iconv_canonicalize (const char * name)
{
  const char* code;
  char buf[MAX_WORD_LENGTH+10+1];
  const char* cp;
  char* bp;
  const struct alias * ap;
  unsigned int count;
  unsigned int index;
  const char* pool;

  /* Before calling aliases_lookup, convert the input string to upper case,
   * and check whether it's entirely ASCII (we call gperf with option "-7"
   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
   * or if it's too long, it is not a valid encoding name.
   */
  for (code = name;;) {
    /* Search code in the table. */
    /* count is the number of bytes of buf still unused; the copy includes
       the terminating NUL, at which bp is left pointing. */
    for (cp = code, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
      unsigned char c = (unsigned char) *cp;
      if (c >= 0x80)
        goto invalid;
      if (c >= 'a' && c <= 'z')
        c -= 'a'-'A';
      *bp = c;
      if (c == '\0')
        break;
      if (--count == 0)
        goto invalid;
    }
    /* Strip the suffixes, in any order and any number of times. */
    for (;;) {
      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
        bp -= 10;
        *bp = '\0';
        continue;
      }
      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
        bp -= 8;
        *bp = '\0';
        continue;
      }
      break;
    }
    /* An empty name stands for the encoding of the current locale. */
    if (buf[0] == '\0') {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    /* pool records which string pool the offsets of the found alias refer
       to. */
    pool = stringpool;
    ap = aliases_lookup(buf,bp-buf);
    if (ap == NULL) {
      pool = stringpool2;
      ap = aliases2_lookup(buf);
      if (ap == NULL)
        goto invalid;
    }
    if (ap->encoding_index == ei_local_char) {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    if (ap->encoding_index == ei_local_wchar_t) {
      /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
         This is also the case on native Woe32 systems and Cygwin >= 1.7, where
         we know that it is UTF-16. */
#if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
      if (sizeof(wchar_t) == 4) {
        index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
# if WORDS_LITTLEENDIAN
        index = ei_utf16le;
# else
        index = ei_utf16be;
# endif
        break;
      }
#elif __STDC_ISO_10646__
      if (sizeof(wchar_t) == 4) {
        index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        index = ei_iso8859_1;
        break;
      }
#endif
      /* Otherwise fall through and report the index of the alias itself. */
    }
    index = ap->encoding_index;
    break;
  }
  return all_canonical[index] + pool;
invalid:
  return name;
}
/* Global variable holding the value that _LIBICONV_VERSION had when this
   file was compiled. */
int _libiconv_version = _LIBICONV_VERSION;
#if defined __FreeBSD__ && !defined __gnu_freebsd__
/* GNU libiconv is the native FreeBSD iconv implementation since 2002.
   It wants to define the symbols 'iconv_open', 'iconv', 'iconv_close'. */
/* strong_alias goes through a second macro so that its arguments are
   macro-expanded before #name turns the target name into a string. */
#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
#define _strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
/* Remove any macro definitions of the three names, so that the alias
   declarations below refer to the plain identifiers. */
#undef iconv_open
#undef iconv
#undef iconv_close
strong_alias (libiconv_open, iconv_open)
strong_alias (libiconv, iconv)
strong_alias (libiconv_close, iconv_close)
#endif
  592. #endif