utf16.h 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  /*
   * Copyright (C) 1999-2001, 2008, 2016 Free Software Foundation, Inc.
   * This file is part of the GNU LIBICONV Library.
   *
   * The GNU LIBICONV Library is free software; you can redistribute it
   * and/or modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
   * The GNU LIBICONV Library is distributed in the hope that it will be
   * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   * If not, see <https://www.gnu.org/licenses/>.
   */
  /*
   * UTF-16
   */

  /* Specification: RFC 2781 */

  /* Here we accept FFFE/FEFF marks as endianness indicators everywhere
     in the stream, not just at the beginning. (This is contrary to what
     RFC 2781 section 3.2 specifies, but it allows concatenation of byte
     sequences to work flawlessly, while disagreeing with the RFC behaviour
     only for strings containing U+FEFF characters, which is quite rare.)
     The default is big-endian. */
  /* The state is 0 if big-endian, 1 if little-endian. */
  30. static int
  31. utf16_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
  32. {
  33. state_t state = conv->istate;
  34. int count = 0;
  35. for (; n >= 2 && count <= RET_COUNT_MAX && count <= INT_MAX-2;) {
  36. ucs4_t wc = (state ? s[0] + (s[1] << 8) : (s[0] << 8) + s[1]);
  37. if (wc == 0xfeff) {
  38. } else if (wc == 0xfffe) {
  39. state ^= 1;
  40. } else if (wc >= 0xd800 && wc < 0xdc00) {
  41. if (n >= 4) {
  42. ucs4_t wc2 = (state ? s[2] + (s[3] << 8) : (s[2] << 8) + s[3]);
  43. if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
  44. goto ilseq;
  45. *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
  46. conv->istate = state;
  47. return count+4;
  48. } else
  49. break;
  50. } else if (wc >= 0xdc00 && wc < 0xe000) {
  51. goto ilseq;
  52. } else {
  53. *pwc = wc;
  54. conv->istate = state;
  55. return count+2;
  56. }
  57. s += 2; n -= 2; count += 2;
  58. }
  59. conv->istate = state;
  60. return RET_TOOFEW(count);
  61. ilseq:
  62. conv->istate = state;
  63. return RET_SHIFT_ILSEQ(count);
  64. }
  /* We output UTF-16 in big-endian order, with byte-order mark.
     See RFC 2781 section 3.3 for a rationale: Some document formats
     mandate a BOM; the file concatenation issue is not so severe as
     long as the above utf16_mbtowc function is used. */
  /* The state is 0 at the beginning, 1 after the BOM has been written. */
  70. static int
  71. utf16_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
  72. {
  73. if (wc != 0xfffe && !(wc >= 0xd800 && wc < 0xe000)) {
  74. int count = 0;
  75. if (!conv->ostate) {
  76. if (n >= 2) {
  77. r[0] = 0xFE;
  78. r[1] = 0xFF;
  79. r += 2; n -= 2; count += 2;
  80. } else
  81. return RET_TOOSMALL;
  82. }
  83. if (wc < 0x10000) {
  84. if (n >= 2) {
  85. r[0] = (unsigned char) (wc >> 8);
  86. r[1] = (unsigned char) wc;
  87. conv->ostate = 1;
  88. return count+2;
  89. } else
  90. return RET_TOOSMALL;
  91. }
  92. else if (wc < 0x110000) {
  93. if (n >= 4) {
  94. ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
  95. ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
  96. r[0] = (unsigned char) (wc1 >> 8);
  97. r[1] = (unsigned char) wc1;
  98. r[2] = (unsigned char) (wc2 >> 8);
  99. r[3] = (unsigned char) wc2;
  100. conv->ostate = 1;
  101. return count+4;
  102. } else
  103. return RET_TOOSMALL;
  104. }
  105. }
  106. return RET_ILUNI;
  107. }