json_stream_parser.cc 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861
  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc. All rights reserved.
  3. // https://developers.google.com/protocol-buffers/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are
  7. // met:
  8. //
  9. // * Redistributions of source code must retain the above copyright
  10. // notice, this list of conditions and the following disclaimer.
  11. // * Redistributions in binary form must reproduce the above
  12. // copyright notice, this list of conditions and the following disclaimer
  13. // in the documentation and/or other materials provided with the
  14. // distribution.
  15. // * Neither the name of Google Inc. nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. #include <google/protobuf/util/internal/json_stream_parser.h>
  31. #include <algorithm>
  32. #include <cctype>
  33. #include <cerrno>
  34. #include <cstdlib>
  35. #include <cstring>
  36. #include <memory>
  37. #include <google/protobuf/stubs/logging.h>
  38. #include <google/protobuf/stubs/common.h>
  39. #include <google/protobuf/util/internal/object_writer.h>
  40. #include <google/protobuf/util/internal/json_escaping.h>
  41. #include <google/protobuf/stubs/strutil.h>
  42. #include <google/protobuf/stubs/mathlimits.h>
  43. namespace google {
  44. namespace protobuf {
  45. namespace util {
  46. // Allow these symbols to be referenced as util::Status, util::error::* in
  47. // this file.
  48. using util::Status;
  49. namespace error {
  50. using util::error::CANCELLED;
  51. using util::error::INTERNAL;
  52. using util::error::INVALID_ARGUMENT;
  53. } // namespace error
  54. namespace converter {
  55. // Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
  56. static const int kUnicodeEscapedLength = 6;
  57. // Length of the true, false, and null literals.
  58. static const int true_len = strlen("true");
  59. static const int false_len = strlen("false");
  60. static const int null_len = strlen("null");
  61. inline bool IsLetter(char c) {
  62. return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_') ||
  63. (c == '$');
  64. }
  65. inline bool IsAlphanumeric(char c) {
  66. return IsLetter(c) || ('0' <= c && c <= '9');
  67. }
  68. static bool ConsumeKey(StringPiece* input, StringPiece* key) {
  69. if (input->empty() || !IsLetter((*input)[0])) return false;
  70. int len = 1;
  71. for (; len < input->size(); ++len) {
  72. if (!IsAlphanumeric((*input)[len])) {
  73. break;
  74. }
  75. }
  76. *key = StringPiece(input->data(), len);
  77. *input = StringPiece(input->data() + len, input->size() - len);
  78. return true;
  79. }
  80. static bool MatchKey(StringPiece input) {
  81. return !input.empty() && IsLetter(input[0]);
  82. }
  83. JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
  84. : ow_(ow),
  85. stack_(),
  86. leftover_(),
  87. json_(),
  88. p_(),
  89. key_(),
  90. key_storage_(),
  91. finishing_(false),
  92. parsed_(),
  93. parsed_storage_(),
  94. string_open_(0),
  95. chunk_storage_(),
  96. coerce_to_utf8_(false),
  97. allow_empty_null_(false),
  98. loose_float_number_conversion_(false) {
  99. // Initialize the stack with a single value to be parsed.
  100. stack_.push(VALUE);
  101. }
  102. JsonStreamParser::~JsonStreamParser() {}
  103. util::Status JsonStreamParser::Parse(StringPiece json) {
  104. StringPiece chunk = json;
  105. // If we have leftovers from a previous chunk, append the new chunk to it
  106. // and create a new StringPiece pointing at the string's data. This could
  107. // be large but we rely on the chunks to be small, assuming they are
  108. // fragments of a Cord.
  109. if (!leftover_.empty()) {
  110. // Don't point chunk to leftover_ because leftover_ will be updated in
  111. // ParseChunk(chunk).
  112. chunk_storage_.swap(leftover_);
  113. StrAppend(&chunk_storage_, json);
  114. chunk = StringPiece(chunk_storage_);
  115. }
  116. // Find the structurally valid UTF8 prefix and parse only that.
  117. int n = internal::UTF8SpnStructurallyValid(chunk);
  118. if (n > 0) {
  119. util::Status status = ParseChunk(chunk.substr(0, n));
  120. // Any leftover characters are stashed in leftover_ for later parsing when
  121. // there is more data available.
  122. StrAppend(&leftover_, chunk.substr(n));
  123. return status;
  124. } else {
  125. leftover_.assign(chunk.data(), chunk.size());
  126. return util::Status();
  127. }
  128. }
  129. util::Status JsonStreamParser::FinishParse() {
  130. // If we do not expect anything and there is nothing left to parse we're all
  131. // done.
  132. if (stack_.empty() && leftover_.empty()) {
  133. return util::Status();
  134. }
  135. // Storage for UTF8-coerced string.
  136. std::unique_ptr<char[]> utf8;
  137. if (coerce_to_utf8_) {
  138. utf8.reset(new char[leftover_.size()]);
  139. char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
  140. p_ = json_ = StringPiece(coerced, leftover_.size());
  141. } else {
  142. p_ = json_ = leftover_;
  143. if (!internal::IsStructurallyValidUTF8(leftover_)) {
  144. return ReportFailure("Encountered non UTF-8 code points.");
  145. }
  146. }
  147. // Parse the remainder in finishing mode, which reports errors for things like
  148. // unterminated strings or unknown tokens that would normally be retried.
  149. finishing_ = true;
  150. util::Status result = RunParser();
  151. if (result.ok()) {
  152. SkipWhitespace();
  153. if (!p_.empty()) {
  154. result = ReportFailure("Parsing terminated before end of input.");
  155. }
  156. }
  157. return result;
  158. }
  159. util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
  160. // Do not do any work if the chunk is empty.
  161. if (chunk.empty()) return util::Status();
  162. p_ = json_ = chunk;
  163. finishing_ = false;
  164. util::Status result = RunParser();
  165. if (!result.ok()) return result;
  166. SkipWhitespace();
  167. if (p_.empty()) {
  168. // If we parsed everything we had, clear the leftover.
  169. leftover_.clear();
  170. } else {
  171. // If we do not expect anything i.e. stack is empty, and we have non-empty
  172. // string left to parse, we report an error.
  173. if (stack_.empty()) {
  174. return ReportFailure("Parsing terminated before end of input.");
  175. }
  176. // If we expect future data i.e. stack is non-empty, and we have some
  177. // unparsed data left, we save it for later parse.
  178. leftover_ = p_.ToString();
  179. }
  180. return util::Status();
  181. }
  182. util::Status JsonStreamParser::RunParser() {
  183. while (!stack_.empty()) {
  184. ParseType type = stack_.top();
  185. TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
  186. stack_.pop();
  187. util::Status result;
  188. switch (type) {
  189. case VALUE:
  190. result = ParseValue(t);
  191. break;
  192. case OBJ_MID:
  193. result = ParseObjectMid(t);
  194. break;
  195. case ENTRY:
  196. result = ParseEntry(t);
  197. break;
  198. case ENTRY_MID:
  199. result = ParseEntryMid(t);
  200. break;
  201. case ARRAY_VALUE:
  202. result = ParseArrayValue(t);
  203. break;
  204. case ARRAY_MID:
  205. result = ParseArrayMid(t);
  206. break;
  207. default:
  208. result = util::Status(util::error::INTERNAL,
  209. StrCat("Unknown parse type: ", type));
  210. break;
  211. }
  212. if (!result.ok()) {
  213. // If we were cancelled, save our state and try again later.
  214. if (!finishing_ && result == util::Status(error::CANCELLED, "")) {
  215. stack_.push(type);
  216. // If we have a key we still need to render, make sure to save off the
  217. // contents in our own storage.
  218. if (!key_.empty() && key_storage_.empty()) {
  219. StrAppend(&key_storage_, key_);
  220. key_ = StringPiece(key_storage_);
  221. }
  222. result = util::Status();
  223. }
  224. return result;
  225. }
  226. }
  227. return util::Status();
  228. }
  229. util::Status JsonStreamParser::ParseValue(TokenType type) {
  230. switch (type) {
  231. case BEGIN_OBJECT:
  232. return HandleBeginObject();
  233. case BEGIN_ARRAY:
  234. return HandleBeginArray();
  235. case BEGIN_STRING:
  236. return ParseString();
  237. case BEGIN_NUMBER:
  238. return ParseNumber();
  239. case BEGIN_TRUE:
  240. return ParseTrue();
  241. case BEGIN_FALSE:
  242. return ParseFalse();
  243. case BEGIN_NULL:
  244. return ParseNull();
  245. case UNKNOWN:
  246. return ReportUnknown("Expected a value.");
  247. default: {
  248. if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
  249. return ParseEmptyNull();
  250. }
  251. // Special case for having been cut off while parsing, wait for more data.
  252. // This handles things like 'fals' being at the end of the string, we
  253. // don't know if the next char would be e, completing it, or something
  254. // else, making it invalid.
  255. if (!finishing_ && p_.length() < false_len) {
  256. return util::Status(error::CANCELLED, "");
  257. }
  258. return ReportFailure("Unexpected token.");
  259. }
  260. }
  261. }
  262. util::Status JsonStreamParser::ParseString() {
  263. util::Status result = ParseStringHelper();
  264. if (result.ok()) {
  265. ow_->RenderString(key_, parsed_);
  266. key_ = StringPiece();
  267. parsed_ = StringPiece();
  268. parsed_storage_.clear();
  269. }
  270. return result;
  271. }
  272. util::Status JsonStreamParser::ParseStringHelper() {
  273. // If we haven't seen the start quote, grab it and remember it for later.
  274. if (string_open_ == 0) {
  275. string_open_ = *p_.data();
  276. GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
  277. Advance();
  278. }
  279. // Track where we last copied data from so we can minimize copying.
  280. const char* last = p_.data();
  281. while (!p_.empty()) {
  282. const char* data = p_.data();
  283. if (*data == '\\') {
  284. // We're about to handle an escape, copy all bytes from last to data.
  285. if (last < data) {
  286. parsed_storage_.append(last, data - last);
  287. }
  288. // If we ran out of string after the \, cancel or report an error
  289. // depending on if we expect more data later.
  290. if (p_.length() == 1) {
  291. if (!finishing_) {
  292. return util::Status(error::CANCELLED, "");
  293. }
  294. return ReportFailure("Closing quote expected in string.");
  295. }
  296. // Parse a unicode escape if we found \u in the string.
  297. if (data[1] == 'u') {
  298. util::Status result = ParseUnicodeEscape();
  299. if (!result.ok()) {
  300. return result;
  301. }
  302. // Move last pointer past the unicode escape and continue.
  303. last = p_.data();
  304. continue;
  305. }
  306. // Handle the standard set of backslash-escaped characters.
  307. switch (data[1]) {
  308. case 'b':
  309. parsed_storage_.push_back('\b');
  310. break;
  311. case 'f':
  312. parsed_storage_.push_back('\f');
  313. break;
  314. case 'n':
  315. parsed_storage_.push_back('\n');
  316. break;
  317. case 'r':
  318. parsed_storage_.push_back('\r');
  319. break;
  320. case 't':
  321. parsed_storage_.push_back('\t');
  322. break;
  323. case 'v':
  324. parsed_storage_.push_back('\v');
  325. break;
  326. default:
  327. parsed_storage_.push_back(data[1]);
  328. }
  329. // We handled two characters, so advance past them and continue.
  330. p_.remove_prefix(2);
  331. last = p_.data();
  332. continue;
  333. }
  334. // If we found the closing quote note it, advance past it, and return.
  335. if (*data == string_open_) {
  336. // If we didn't copy anything, reuse the input buffer.
  337. if (parsed_storage_.empty()) {
  338. parsed_ = StringPiece(last, data - last);
  339. } else {
  340. if (last < data) {
  341. parsed_storage_.append(last, data - last);
  342. }
  343. parsed_ = StringPiece(parsed_storage_);
  344. }
  345. // Clear the quote char so next time we try to parse a string we'll
  346. // start fresh.
  347. string_open_ = 0;
  348. Advance();
  349. return util::Status();
  350. }
  351. // Normal character, just advance past it.
  352. Advance();
  353. }
  354. // If we ran out of characters, copy over what we have so far.
  355. if (last < p_.data()) {
  356. parsed_storage_.append(last, p_.data() - last);
  357. }
  358. // If we didn't find the closing quote but we expect more data, cancel for now
  359. if (!finishing_) {
  360. return util::Status(error::CANCELLED, "");
  361. }
  362. // End of string reached without a closing quote, report an error.
  363. string_open_ = 0;
  364. return ReportFailure("Closing quote expected in string.");
  365. }
  366. // Converts a unicode escaped character to a decimal value stored in a char32
  367. // for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
  368. // convert that from the hex number to a decimal value.
  369. //
  370. // There are some security exploits with UTF-8 that we should be careful of:
  371. // - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
  372. // - http://sites/intl-eng/design-guide/core-application
  373. util::Status JsonStreamParser::ParseUnicodeEscape() {
  374. if (p_.length() < kUnicodeEscapedLength) {
  375. if (!finishing_) {
  376. return util::Status(error::CANCELLED, "");
  377. }
  378. return ReportFailure("Illegal hex string.");
  379. }
  380. GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
  381. GOOGLE_DCHECK_EQ('u', p_.data()[1]);
  382. uint32 code = 0;
  383. for (int i = 2; i < kUnicodeEscapedLength; ++i) {
  384. if (!isxdigit(p_.data()[i])) {
  385. return ReportFailure("Invalid escape sequence.");
  386. }
  387. code = (code << 4) + hex_digit_to_int(p_.data()[i]);
  388. }
  389. if (code >= JsonEscaping::kMinHighSurrogate &&
  390. code <= JsonEscaping::kMaxHighSurrogate) {
  391. if (p_.length() < 2 * kUnicodeEscapedLength) {
  392. if (!finishing_) {
  393. return util::Status(error::CANCELLED, "");
  394. }
  395. if (!coerce_to_utf8_) {
  396. return ReportFailure("Missing low surrogate.");
  397. }
  398. } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
  399. p_.data()[kUnicodeEscapedLength + 1] == 'u') {
  400. uint32 low_code = 0;
  401. for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
  402. ++i) {
  403. if (!isxdigit(p_.data()[i])) {
  404. return ReportFailure("Invalid escape sequence.");
  405. }
  406. low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
  407. }
  408. if (low_code >= JsonEscaping::kMinLowSurrogate &&
  409. low_code <= JsonEscaping::kMaxLowSurrogate) {
  410. // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
  411. code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
  412. JsonEscaping::kMinSupplementaryCodePoint;
  413. // Advance past the first code unit escape.
  414. p_.remove_prefix(kUnicodeEscapedLength);
  415. } else if (!coerce_to_utf8_) {
  416. return ReportFailure("Invalid low surrogate.");
  417. }
  418. } else if (!coerce_to_utf8_) {
  419. return ReportFailure("Missing low surrogate.");
  420. }
  421. }
  422. if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
  423. return ReportFailure("Invalid unicode code point.");
  424. }
  425. char buf[UTFmax];
  426. int len = EncodeAsUTF8Char(code, buf);
  427. // Advance past the [final] code unit escape.
  428. p_.remove_prefix(kUnicodeEscapedLength);
  429. parsed_storage_.append(buf, len);
  430. return util::Status();
  431. }
  432. util::Status JsonStreamParser::ParseNumber() {
  433. NumberResult number;
  434. util::Status result = ParseNumberHelper(&number);
  435. if (result.ok()) {
  436. switch (number.type) {
  437. case NumberResult::DOUBLE:
  438. ow_->RenderDouble(key_, number.double_val);
  439. key_ = StringPiece();
  440. break;
  441. case NumberResult::INT:
  442. ow_->RenderInt64(key_, number.int_val);
  443. key_ = StringPiece();
  444. break;
  445. case NumberResult::UINT:
  446. ow_->RenderUint64(key_, number.uint_val);
  447. key_ = StringPiece();
  448. break;
  449. default:
  450. return ReportFailure("Unable to parse number.");
  451. }
  452. }
  453. return result;
  454. }
  455. util::Status JsonStreamParser::ParseDoubleHelper(
  456. const string& number, NumberResult* result) {
  457. if (!safe_strtod(number, &result->double_val)) {
  458. return ReportFailure("Unable to parse number.");
  459. }
  460. if (!loose_float_number_conversion_ &&
  461. !MathLimits<double>::IsFinite(result->double_val)) {
  462. return ReportFailure("Number exceeds the range of double.");
  463. }
  464. result->type = NumberResult::DOUBLE;
  465. return util::Status();
  466. }
  467. util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
  468. const char* data = p_.data();
  469. int length = p_.length();
  470. // Look for the first non-numeric character, or the end of the string.
  471. int index = 0;
  472. bool floating = false;
  473. bool negative = data[index] == '-';
  474. // Find the first character that cannot be part of the number. Along the way
  475. // detect if the number needs to be parsed as a double.
  476. // Note that this restricts numbers to the JSON specification, so for example
  477. // we do not support hex or octal notations.
  478. for (; index < length; ++index) {
  479. char c = data[index];
  480. if (isdigit(c)) continue;
  481. if (c == '.' || c == 'e' || c == 'E') {
  482. floating = true;
  483. continue;
  484. }
  485. if (c == '+' || c == '-' || c == 'x') continue;
  486. // Not a valid number character, break out.
  487. break;
  488. }
  489. // If the entire input is a valid number, and we may have more content in the
  490. // future, we abort for now and resume when we know more.
  491. if (index == length && !finishing_) {
  492. return util::Status(error::CANCELLED, "");
  493. }
  494. // Create a string containing just the number, so we can use safe_strtoX
  495. string number = p_.substr(0, index).ToString();
  496. // Floating point number, parse as a double.
  497. if (floating) {
  498. util::Status status = ParseDoubleHelper(number, result);
  499. if (status.ok()) {
  500. p_.remove_prefix(index);
  501. }
  502. return status;
  503. }
  504. // Positive non-floating point number, parse as a uint64.
  505. if (!negative) {
  506. // Octal/Hex numbers are not valid JSON values.
  507. if (number.length() >= 2 && number[0] == '0') {
  508. return ReportFailure("Octal/hex numbers are not valid JSON values.");
  509. }
  510. if (safe_strtou64(number, &result->uint_val)) {
  511. result->type = NumberResult::UINT;
  512. p_.remove_prefix(index);
  513. return util::Status();
  514. } else {
  515. // If the value is too large, parse it as double.
  516. util::Status status = ParseDoubleHelper(number, result);
  517. if (status.ok()) {
  518. p_.remove_prefix(index);
  519. }
  520. return status;
  521. }
  522. }
  523. // Octal/Hex numbers are not valid JSON values.
  524. if (number.length() >= 3 && number[1] == '0') {
  525. return ReportFailure("Octal/hex numbers are not valid JSON values.");
  526. }
  527. // Negative non-floating point number, parse as an int64.
  528. if (safe_strto64(number, &result->int_val)) {
  529. result->type = NumberResult::INT;
  530. p_.remove_prefix(index);
  531. return util::Status();
  532. } else {
  533. // If the value is too large, parse it as double.
  534. util::Status status = ParseDoubleHelper(number, result);
  535. if (status.ok()) {
  536. p_.remove_prefix(index);
  537. }
  538. return status;
  539. }
  540. }
  541. util::Status JsonStreamParser::HandleBeginObject() {
  542. GOOGLE_DCHECK_EQ('{', *p_.data());
  543. Advance();
  544. ow_->StartObject(key_);
  545. key_ = StringPiece();
  546. stack_.push(ENTRY);
  547. return util::Status();
  548. }
  549. util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
  550. if (type == UNKNOWN) {
  551. return ReportUnknown("Expected , or } after key:value pair.");
  552. }
  553. // Object is complete, advance past the comma and render the EndObject.
  554. if (type == END_OBJECT) {
  555. Advance();
  556. ow_->EndObject();
  557. return util::Status();
  558. }
  559. // Found a comma, advance past it and get ready for an entry.
  560. if (type == VALUE_SEPARATOR) {
  561. Advance();
  562. stack_.push(ENTRY);
  563. return util::Status();
  564. }
  565. // Illegal token after key:value pair.
  566. return ReportFailure("Expected , or } after key:value pair.");
  567. }
  568. util::Status JsonStreamParser::ParseEntry(TokenType type) {
  569. if (type == UNKNOWN) {
  570. return ReportUnknown("Expected an object key or }.");
  571. }
  572. // Close the object and return. This allows for trailing commas.
  573. if (type == END_OBJECT) {
  574. ow_->EndObject();
  575. Advance();
  576. return util::Status();
  577. }
  578. util::Status result;
  579. if (type == BEGIN_STRING) {
  580. // Key is a string (standard JSON), parse it and store the string.
  581. result = ParseStringHelper();
  582. if (result.ok()) {
  583. key_storage_.clear();
  584. if (!parsed_storage_.empty()) {
  585. parsed_storage_.swap(key_storage_);
  586. key_ = StringPiece(key_storage_);
  587. } else {
  588. key_ = parsed_;
  589. }
  590. parsed_ = StringPiece();
  591. }
  592. } else if (type == BEGIN_KEY) {
  593. // Key is a bare key (back compat), create a StringPiece pointing to it.
  594. result = ParseKey();
  595. } else {
  596. // Unknown key type, report an error.
  597. result = ReportFailure("Expected an object key or }.");
  598. }
  599. // On success we next expect an entry mid ':' then an object mid ',' or '}'
  600. if (result.ok()) {
  601. stack_.push(OBJ_MID);
  602. stack_.push(ENTRY_MID);
  603. }
  604. return result;
  605. }
  606. util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
  607. if (type == UNKNOWN) {
  608. return ReportUnknown("Expected : between key:value pair.");
  609. }
  610. if (type == ENTRY_SEPARATOR) {
  611. Advance();
  612. stack_.push(VALUE);
  613. return util::Status();
  614. }
  615. return ReportFailure("Expected : between key:value pair.");
  616. }
  617. util::Status JsonStreamParser::HandleBeginArray() {
  618. GOOGLE_DCHECK_EQ('[', *p_.data());
  619. Advance();
  620. ow_->StartList(key_);
  621. key_ = StringPiece();
  622. stack_.push(ARRAY_VALUE);
  623. return util::Status();
  624. }
  625. util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
  626. if (type == UNKNOWN) {
  627. return ReportUnknown("Expected a value or ] within an array.");
  628. }
  629. if (type == END_ARRAY) {
  630. ow_->EndList();
  631. Advance();
  632. return util::Status();
  633. }
  634. // The ParseValue call may push something onto the stack so we need to make
  635. // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
  636. // empty-null array value is relying on this ARRAY_MID token.
  637. stack_.push(ARRAY_MID);
  638. util::Status result = ParseValue(type);
  639. if (result == util::Status(error::CANCELLED, "")) {
  640. // If we were cancelled, pop back off the ARRAY_MID so we don't try to
  641. // push it on again when we try over.
  642. stack_.pop();
  643. }
  644. return result;
  645. }
  646. util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
  647. if (type == UNKNOWN) {
  648. return ReportUnknown("Expected , or ] after array value.");
  649. }
  650. if (type == END_ARRAY) {
  651. ow_->EndList();
  652. Advance();
  653. return util::Status();
  654. }
  655. // Found a comma, advance past it and expect an array value next.
  656. if (type == VALUE_SEPARATOR) {
  657. Advance();
  658. stack_.push(ARRAY_VALUE);
  659. return util::Status();
  660. }
  661. // Illegal token after array value.
  662. return ReportFailure("Expected , or ] after array value.");
  663. }
  664. util::Status JsonStreamParser::ParseTrue() {
  665. ow_->RenderBool(key_, true);
  666. key_ = StringPiece();
  667. p_.remove_prefix(true_len);
  668. return util::Status();
  669. }
  670. util::Status JsonStreamParser::ParseFalse() {
  671. ow_->RenderBool(key_, false);
  672. key_ = StringPiece();
  673. p_.remove_prefix(false_len);
  674. return util::Status();
  675. }
  676. util::Status JsonStreamParser::ParseNull() {
  677. ow_->RenderNull(key_);
  678. key_ = StringPiece();
  679. p_.remove_prefix(null_len);
  680. return util::Status();
  681. }
  682. util::Status JsonStreamParser::ParseEmptyNull() {
  683. ow_->RenderNull(key_);
  684. key_ = StringPiece();
  685. return util::Status();
  686. }
  687. bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
  688. if (stack_.empty()) return false;
  689. return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
  690. stack_.top() == OBJ_MID;
  691. }
  692. util::Status JsonStreamParser::ReportFailure(StringPiece message) {
  693. static const int kContextLength = 20;
  694. const char* p_start = p_.data();
  695. const char* json_start = json_.data();
  696. const char* begin = std::max(p_start - kContextLength, json_start);
  697. const char* end =
  698. std::min(p_start + kContextLength, json_start + json_.size());
  699. StringPiece segment(begin, end - begin);
  700. string location(p_start - begin, ' ');
  701. location.push_back('^');
  702. return util::Status(util::error::INVALID_ARGUMENT,
  703. StrCat(message, "\n", segment, "\n", location));
  704. }
  705. util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
  706. // If we aren't finishing the parse, cancel parsing and try later.
  707. if (!finishing_) {
  708. return util::Status(error::CANCELLED, "");
  709. }
  710. if (p_.empty()) {
  711. return ReportFailure(StrCat("Unexpected end of string. ", message));
  712. }
  713. return ReportFailure(message);
  714. }
  715. void JsonStreamParser::SkipWhitespace() {
  716. while (!p_.empty() && ascii_isspace(*p_.data())) {
  717. Advance();
  718. }
  719. }
  720. void JsonStreamParser::Advance() {
  721. // Advance by moving one UTF8 character while making sure we don't go beyond
  722. // the length of StringPiece.
  723. p_.remove_prefix(std::min<int>(
  724. p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
  725. }
  726. util::Status JsonStreamParser::ParseKey() {
  727. StringPiece original = p_;
  728. if (!ConsumeKey(&p_, &key_)) {
  729. return ReportFailure("Invalid key or variable name.");
  730. }
  731. // If we consumed everything but expect more data, reset p_ and cancel since
  732. // we can't know if the key was complete or not.
  733. if (!finishing_ && p_.empty()) {
  734. p_ = original;
  735. return util::Status(error::CANCELLED, "");
  736. }
  737. // Since we aren't using the key storage, clear it out.
  738. key_storage_.clear();
  739. return util::Status();
  740. }
  741. JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
  742. SkipWhitespace();
  743. int size = p_.size();
  744. if (size == 0) {
  745. // If we ran out of data, report unknown and we'll place the previous parse
  746. // type onto the stack and try again when we have more data.
  747. return UNKNOWN;
  748. }
  749. // TODO(sven): Split this method based on context since different contexts
  750. // support different tokens. Would slightly speed up processing?
  751. const char* data = p_.data();
  752. if (*data == '\"' || *data == '\'') return BEGIN_STRING;
  753. if (*data == '-' || ('0' <= *data && *data <= '9')) {
  754. return BEGIN_NUMBER;
  755. }
  756. if (size >= true_len && !strncmp(data, "true", true_len)) {
  757. return BEGIN_TRUE;
  758. }
  759. if (size >= false_len && !strncmp(data, "false", false_len)) {
  760. return BEGIN_FALSE;
  761. }
  762. if (size >= null_len && !strncmp(data, "null", null_len)) {
  763. return BEGIN_NULL;
  764. }
  765. if (*data == '{') return BEGIN_OBJECT;
  766. if (*data == '}') return END_OBJECT;
  767. if (*data == '[') return BEGIN_ARRAY;
  768. if (*data == ']') return END_ARRAY;
  769. if (*data == ':') return ENTRY_SEPARATOR;
  770. if (*data == ',') return VALUE_SEPARATOR;
  771. if (MatchKey(p_)) {
  772. return BEGIN_KEY;
  773. }
  774. // We don't know that we necessarily have an invalid token here, just that we
  775. // can't parse what we have so far. So we don't report an error and just
  776. // return UNKNOWN so we can try again later when we have more data, or if we
  777. // finish and we have leftovers.
  778. return UNKNOWN;
  779. }
  780. } // namespace converter
  781. } // namespace util
  782. } // namespace protobuf
  783. } // namespace google