json_escaping.cc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc. All rights reserved.
  3. // https://developers.google.com/protocol-buffers/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are
  7. // met:
  8. //
  9. // * Redistributions of source code must retain the above copyright
  10. // notice, this list of conditions and the following disclaimer.
  11. // * Redistributions in binary form must reproduce the above
  12. // copyright notice, this list of conditions and the following disclaimer
  13. // in the documentation and/or other materials provided with the
  14. // distribution.
  15. // * Neither the name of Google Inc. nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. #include <google/protobuf/util/internal/json_escaping.h>
  31. #include <google/protobuf/stubs/logging.h>
  32. #include <google/protobuf/stubs/common.h>
  33. namespace google {
  34. namespace protobuf {
  35. namespace util {
  36. namespace converter {
  37. namespace {
  // Lookup table mapping a nibble value (0..15) to its lowercase hexadecimal
  // digit.
  static const char kHex[] =
      "0123456789"  // 0x0 .. 0x9
      "abcdef";     // 0xa .. 0xf
  // Escape table for the very common code points 0x00 .. 0x9f.
  //
  // For a unicode code point ch < 0xa0, kCommonEscapes[ch] holds:
  //   - the escaped form of ch, when ch has to be escaped; or
  //   - an empty string, when ch can be emitted unchanged.
  //
  // Each entry is at most 6 characters long ("\uXXXX") plus the terminating
  // NUL, hence the inner dimension of 7.
  static const char kCommonEscapes[160][7] = {
      // C0 (ASCII and derivatives) control characters.
      "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00 .. 0x03
      "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x04 .. 0x07
      "\\b",     "\\t",     "\\n",     "\\u000b",  // 0x08 .. 0x0b
      "\\f",     "\\r",     "\\u000e", "\\u000f",  // 0x0c .. 0x0f
      "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10 .. 0x13
      "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x14 .. 0x17
      "\\u0018", "\\u0019", "\\u001a", "\\u001b",  // 0x18 .. 0x1b
      "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1c .. 0x1f

      // Printable ASCII. Escaping of " and \ is required by the www.json.org
      // string definition; escaping of < and > is required for HTML security.
      "", "", "\\\"", "", "", "", "", "",          // 0x20 .. 0x27 (0x22 = ")
      "", "", "", "", "", "", "", "",              // 0x28 .. 0x2f
      "", "", "", "", "", "", "", "",              // 0x30 .. 0x37
      "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38 .. 0x3f (< and >)
      "", "", "", "", "", "", "", "",              // 0x40 .. 0x47
      "", "", "", "", "", "", "", "",              // 0x48 .. 0x4f
      "", "", "", "", "", "", "", "",              // 0x50 .. 0x57
      "", "", "", "", "\\\\", "", "", "",          // 0x58 .. 0x5f (0x5c = \)
      "", "", "", "", "", "", "", "",              // 0x60 .. 0x67
      "", "", "", "", "", "", "", "",              // 0x68 .. 0x6f
      "", "", "", "", "", "", "", "",              // 0x70 .. 0x77
      "", "", "", "", "", "", "", "\\u007f",       // 0x78 .. 0x7f (DEL)

      // C1 (ISO 8859 and Unicode) extended control characters.
      "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80 .. 0x83
      "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x84 .. 0x87
      "\\u0088", "\\u0089", "\\u008a", "\\u008b",  // 0x88 .. 0x8b
      "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x8c .. 0x8f
      "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90 .. 0x93
      "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x94 .. 0x97
      "\\u0098", "\\u0099", "\\u009a", "\\u009b",  // 0x98 .. 0x9b
      "\\u009c", "\\u009d", "\\u009e", "\\u009f"   // 0x9c .. 0x9f
  };
  80. // Determines if the given char value is a unicode surrogate code unit (either
  81. // high-surrogate or low-surrogate).
  82. inline bool IsSurrogate(uint32 c) {
  83. // Optimized form of:
  84. // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
  85. // (Reduced from 3 ALU instructions to 2 ALU instructions)
  86. return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
  87. }
  88. // Returns true if the given unicode code point cp is a valid
  89. // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
  90. inline bool IsValidCodePoint(uint32 cp) {
  91. return cp <= JsonEscaping::kMaxCodePoint;
  92. }
  93. // Returns the low surrogate for the given unicode code point. The result is
  94. // meaningless if the given code point is not a supplementary character.
  95. inline uint16 ToLowSurrogate(uint32 cp) {
  96. return (cp & (JsonEscaping::kMaxLowSurrogate
  97. - JsonEscaping::kMinLowSurrogate))
  98. + JsonEscaping::kMinLowSurrogate;
  99. }
  100. // Returns the high surrogate for the given unicode code point. The result is
  101. // meaningless if the given code point is not a supplementary character.
  102. inline uint16 ToHighSurrogate(uint32 cp) {
  103. return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
  104. (JsonEscaping::kMinSupplementaryCodePoint >> 10));
  105. }
  106. // Input str is encoded in UTF-8. A unicode code point could be encoded in
  107. // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
  108. // reads of the ByteSource.
  109. //
  110. // This function reads the next unicode code point from the input (str) at
  111. // the given position (index), taking into account any left-over partial
  112. // code point from the previous iteration (cp), together with the number
  113. // of characters left to read to complete this code point (num_left).
  114. //
  115. // This function assumes that the input (str) is valid at the given position
  116. // (index). In order words, at least one character could be read successfully.
  117. //
  118. // The code point read (partial or complete) is stored in (cp). Upon return,
  119. // (num_left) stores the number of characters that has yet to be read in
  120. // order to complete the current unicode code point. If the read is complete,
  121. // then (num_left) is 0. Also, (num_read) is the number of characters read.
  122. //
  123. // Returns false if we encounter an invalid UTF-8 string. Returns true
  124. // otherwise, including the case when we reach the end of the input (str)
  125. // before a complete unicode code point is read.
  126. bool ReadCodePoint(StringPiece str, int index,
  127. uint32 *cp, int* num_left, int *num_read) {
  128. if (*num_left == 0) {
  129. // Last read was complete. Start reading a new unicode code point.
  130. *cp = static_cast<uint8>(str[index++]);
  131. *num_read = 1;
  132. // The length of the code point is determined from reading the first byte.
  133. //
  134. // If the first byte is between:
  135. // 0..0x7f: that's the value of the code point.
  136. // 0x80..0xbf: <invalid>
  137. // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
  138. // bit 10-6, bit 5-0
  139. // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
  140. // bit 15-12, bit 11-6, bit 5-0
  141. // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
  142. // bit 20-18, bit 17-12, bit 11-6, bit 5-0
  143. // 0xf8..0xff: <invalid>
  144. //
  145. // Meaning of each bit:
  146. // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
  147. // 1 - multibyte code point
  148. // bit 6: 0 - subsequent bytes of multibyte code point:
  149. // bits 5-0 are values.
  150. // 1 - first byte of multibyte code point
  151. // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
  152. // 1 - first byte of code point with >= 3 bytes.
  153. // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
  154. // 1 - first byte of code point with >= 4 bytes.
  155. // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
  156. // 1 - reserved for future expansion.
  157. if (*cp <= 0x7f) {
  158. return true;
  159. } else if (*cp <= 0xbf) {
  160. return false;
  161. } else if (*cp <= 0xdf) {
  162. *cp &= 0x1f;
  163. *num_left = 1;
  164. } else if (*cp <= 0xef) {
  165. *cp &= 0x0f;
  166. *num_left = 2;
  167. } else if (*cp <= 0xf7) {
  168. *cp &= 0x07;
  169. *num_left = 3;
  170. } else {
  171. return false;
  172. }
  173. } else {
  174. // Last read was partial. Initialize num_read to 0 and continue reading
  175. // the last unicode code point.
  176. *num_read = 0;
  177. }
  178. while (*num_left > 0 && index < str.size()) {
  179. uint32 ch = static_cast<uint8>(str[index++]);
  180. --(*num_left);
  181. ++(*num_read);
  182. *cp = (*cp << 6) | (ch & 0x3f);
  183. if (ch < 0x80 || ch > 0xbf) return false;
  184. }
  185. return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
  186. }
  187. // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
  188. // and returns a StringPiece that points to this buffer. The input buffer needs
  189. // to be at least 6 bytes long.
  190. StringPiece ToHex(uint16 cp, char* buffer) {
  191. buffer[5] = kHex[cp & 0x0f];
  192. cp >>= 4;
  193. buffer[4] = kHex[cp & 0x0f];
  194. cp >>= 4;
  195. buffer[3] = kHex[cp & 0x0f];
  196. cp >>= 4;
  197. buffer[2] = kHex[cp & 0x0f];
  198. return StringPiece(buffer, 6);
  199. }
  200. // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
  201. // and returns a StringPiece that points to this buffer. The input buffer needs
  202. // to be at least 12 bytes long.
  203. StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
  204. uint16 low = ToLowSurrogate(cp);
  205. uint16 high = ToHighSurrogate(cp);
  206. buffer[11] = kHex[low & 0x0f];
  207. low >>= 4;
  208. buffer[10] = kHex[low & 0x0f];
  209. low >>= 4;
  210. buffer[9] = kHex[low & 0x0f];
  211. low >>= 4;
  212. buffer[8] = kHex[low & 0x0f];
  213. buffer[5] = kHex[high & 0x0f];
  214. high >>= 4;
  215. buffer[4] = kHex[high & 0x0f];
  216. high >>= 4;
  217. buffer[3] = kHex[high & 0x0f];
  218. high >>= 4;
  219. buffer[2] = kHex[high & 0x0f];
  220. return StringPiece(buffer, 12);
  221. }
  222. // If the given unicode code point needs escaping, then returns the
  223. // escaped form. The returned StringPiece either points to statically
  224. // pre-allocated char[] or to the given buffer. The input buffer needs
  225. // to be at least 12 bytes long.
  226. //
  227. // If the given unicode code point does not need escaping, an empty
  228. // StringPiece is returned.
  229. StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
  230. if (cp < 0xa0) return kCommonEscapes[cp];
  231. switch (cp) {
  232. // These are not required by json spec
  233. // but used to prevent security bugs in javascript.
  234. case 0xfeff: // Zero width no-break space
  235. case 0xfff9: // Interlinear annotation anchor
  236. case 0xfffa: // Interlinear annotation separator
  237. case 0xfffb: // Interlinear annotation terminator
  238. case 0x00ad: // Soft-hyphen
  239. case 0x06dd: // Arabic end of ayah
  240. case 0x070f: // Syriac abbreviation mark
  241. case 0x17b4: // Khmer vowel inherent Aq
  242. case 0x17b5: // Khmer vowel inherent Aa
  243. return ToHex(cp, buffer);
  244. default:
  245. if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs
  246. (cp >= 0x200b && cp <= 0x200f) || // Zero width etc.
  247. (cp >= 0x2028 && cp <= 0x202e) || // Separators etc.
  248. (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc.
  249. (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc.
  250. return ToHex(cp, buffer);
  251. }
  252. if (cp == 0x000e0001 || // Language tag
  253. (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting
  254. (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols
  255. return ToSurrogateHex(cp, buffer);
  256. }
  257. }
  258. return StringPiece();
  259. }
  260. // Tries to escape the given code point first. If the given code point
  261. // does not need to be escaped, but force_output is true, then render
  262. // the given multi-byte code point in UTF8 in the buffer and returns it.
  263. StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
  264. StringPiece sp = EscapeCodePoint(cp, buffer);
  265. if (force_output && sp.empty()) {
  266. buffer[5] = (cp & 0x3f) | 0x80;
  267. cp >>= 6;
  268. if (cp <= 0x1f) {
  269. buffer[4] = cp | 0xc0;
  270. sp = StringPiece(buffer + 4, 2);
  271. return sp;
  272. }
  273. buffer[4] = (cp & 0x3f) | 0x80;
  274. cp >>= 6;
  275. if (cp <= 0x0f) {
  276. buffer[3] = cp | 0xe0;
  277. sp = StringPiece(buffer + 3, 3);
  278. return sp;
  279. }
  280. buffer[3] = (cp & 0x3f) | 0x80;
  281. buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
  282. sp = StringPiece(buffer + 2, 4);
  283. }
  284. return sp;
  285. }
  286. } // namespace
  287. void JsonEscaping::Escape(strings::ByteSource* input,
  288. strings::ByteSink* output) {
  289. char buffer[12] = "\\udead\\ubee";
  290. uint32 cp = 0; // Current unicode code point.
  291. int num_left = 0; // Num of chars to read to complete the code point.
  292. while (input->Available() > 0) {
  293. StringPiece str = input->Peek();
  294. StringPiece escaped;
  295. int i = 0;
  296. int num_read;
  297. bool ok;
  298. bool cp_was_split = num_left > 0;
  299. // Loop until we encounter either
  300. // i) a code point that needs to be escaped; or
  301. // ii) a split code point is completely read; or
  302. // iii) a character that is not a valid utf8; or
  303. // iv) end of the StringPiece str is reached.
  304. do {
  305. ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
  306. if (num_left > 0 || !ok) break; // case iii or iv
  307. escaped = EscapeCodePoint(cp, buffer, cp_was_split);
  308. if (!escaped.empty()) break; // case i or ii
  309. i += num_read;
  310. num_read = 0;
  311. } while (i < str.length()); // case iv
  312. // First copy the un-escaped prefix, if any, to the output ByteSink.
  313. if (i > 0) input->CopyTo(output, i);
  314. if (num_read > 0) input->Skip(num_read);
  315. if (!ok) {
  316. // Case iii: Report error.
  317. // TODO(wpoon): Add error reporting.
  318. num_left = 0;
  319. } else if (num_left == 0 && !escaped.empty()) {
  320. // Case i or ii: Append the escaped code point to the output ByteSink.
  321. output->Append(escaped.data(), escaped.size());
  322. }
  323. }
  324. if (num_left > 0) {
  325. // Treat as case iii: report error.
  326. // TODO(wpoon): Add error reporting.
  327. }
  328. }
  329. } // namespace converter
  330. } // namespace util
  331. } // namespace protobuf
  332. } // namespace google