// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Here we have a hand-written lexer. At first you might ask yourself,
// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
// yes I am crazy, but that's beside the point. There are actually reasons
// why I ended up writing this this way.
//
// The traditional approach to lexing is to use lex to generate a lexer for
// you. Unfortunately, lex's output is ridiculously ugly and difficult to
// integrate cleanly with C++ code, especially abstract code or code meant
// as a library. Better parser-generators exist but would add dependencies
// which most users won't already have, which we'd like to avoid. (GNU flex
// has a C++ output option, but it's still ridiculously ugly, non-abstract,
// and not library-friendly.)
//
// The next approach that any good software engineer should look at is to
// use regular expressions. And, indeed, I did. I have code which
// implements this same class using regular expressions. It's about 200
// lines shorter. However:
//   - Rather than error messages telling you "This string has an invalid
//     escape sequence at line 5, column 45", you get error messages like
//     "Parse error on line 5". Giving more precise errors requires adding
//     a lot of code that ends up basically as complex as the hand-coded
//     version anyway.
//   - The regular expression to match a string literal looks like this:
//       kString = new RE("(\"([^\"\\\\]|"            // non-escaped
//                        "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape
//                        "\\\\x[0-9a-fA-F])*\"|"     // hex escape
//                        "\'([^\'\\\\]|"             // Also support single-quotes.
//                        "\\\\[abfnrtv?\"'\\\\0-7]|"
//                        "\\\\x[0-9a-fA-F])*\')");
//     Verifying the correctness of this line noise is actually harder than
//     verifying the correctness of ConsumeString(), defined below. I'm not
//     even confident that the above is correct, after staring at it for some
//     time.
//   - PCRE is fast, but there's still more overhead involved than the code
//     below.
//   - Sadly, regular expressions are not part of the C standard library, so
//     using them would require depending on some other library. For the
//     open source release, this could be really annoying. Nobody likes
//     downloading one piece of software just to find that they need to
//     download something else to make it work, and in all likelihood
//     people downloading Protocol Buffers will already be doing so just
//     to make something else work. We could include a copy of PCRE with
//     our code, but that obligates us to keep it up-to-date and just seems
//     like a big waste just to save 200 lines of code.
//
// On a similar but unrelated note, I'm even scared to use ctype.h.
// Apparently functions like isalpha() are locale-dependent. So, if we used
// that, then if this code is being called from some program that doesn't
// have its locale set to "C", it would behave strangely. We can't just set
// the locale to "C" ourselves since we might break the calling program that
// way, particularly if it is multi-threaded. WTF? Someone please let me
// (Kenton) know if I'm missing something here...
//
// I'd love to hear about other alternatives, though, as this code isn't
// exactly pretty.

#include <google/protobuf/io/tokenizer.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/stringprintf.h>
#include <google/protobuf/io/strtod.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/stl_util.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// As mentioned above, I don't trust ctype.h due to the presence of "locales".
// So, I have written replacement functions here. Someone please smack me if
// this is a bad idea or if there is some way around this.
//
// These "character classes" are designed to be used in template methods.
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

#define CHARACTER_CLASS(NAME, EXPRESSION)    \
  class NAME {                               \
   public:                                   \
    static inline bool InClass(char c) {     \
      return EXPRESSION;                     \
    }                                        \
  }

CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                            c == '\r' || c == '\v' || c == '\f');

CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
                                     c == '\r' || c == '\v' || c == '\f');

CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
                          ('a' <= c && c <= 'f') ||
                          ('A' <= c && c <= 'F'));

CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
                        ('A' <= c && c <= 'Z') ||
                        (c == '_'));

CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                              ('A' <= c && c <= 'Z') ||
                              ('0' <= c && c <= '9') ||
                              (c == '_'));

CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                        c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                        c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.
inline int DigitValue(char digit) {
  if ('0' <= digit && digit <= '9') return digit - '0';
  if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
  if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
  return -1;
}
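
// For example: DigitValue('0') == 0, DigitValue('7') == 7,
// DigitValue('f') == DigitValue('F') == 15, DigitValue('z') == 35, and
// DigitValue('$') == -1. Range-checking a digit against a particular base is
// the caller's job (see the character classes above and ParseInteger() below).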

// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  switch (c) {
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    case '\\': return '\\';
    case '?':  return '\?';  // Trigraphs = :(
    case '\'': return '\'';
    case '"':  return '\"';

    // We expect escape sequences to have been validated separately.
    default:   return '?';
  }
}

}  // anonymous namespace

ErrorCollector::~ErrorCollector() {}

// ===================================================================

Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
    : input_(input),
      error_collector_(error_collector),
      buffer_(NULL),
      buffer_size_(0),
      buffer_pos_(0),
      read_error_(false),
      line_(0),
      column_(0),
      record_target_(NULL),
      record_start_(-1),
      allow_f_after_float_(false),
      comment_style_(CPP_COMMENT_STYLE),
      require_space_after_number_(true),
      allow_multiline_strings_(false) {
  current_.line = 0;
  current_.column = 0;
  current_.end_column = 0;
  current_.type = TYPE_START;

  Refresh();
}

Tokenizer::~Tokenizer() {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (buffer_size_ > buffer_pos_) {
    input_->BackUp(buffer_size_ - buffer_pos_);
  }
}
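
// Illustrative usage sketch (not from the original file). ArrayInputStream
// comes from <google/protobuf/io/zero_copy_stream_impl_lite.h>, and
// PrintingErrorCollector is a hypothetical ErrorCollector subclass; the
// Tokenizer and ErrorCollector calls themselves mirror the API used in this
// file:
//
//   class PrintingErrorCollector : public ErrorCollector {
//    public:
//     virtual void AddError(int line, int column, const string& message) {
//       // line and column are zero-based, matching line_/column_ above.
//       GOOGLE_LOG(ERROR) << line << ":" << column << ": " << message;
//     }
//   };
//
//   ArrayInputStream input(text.data(), text.size());
//   PrintingErrorCollector errors;
//   Tokenizer tokenizer(&input, &errors);
//   while (tokenizer.Next()) {
//     // tokenizer.current() exposes the token's type, text, line, column,
//     // and end_column.
//   }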

// -------------------------------------------------------------------
// Internal helpers.

void Tokenizer::NextChar() {
  // Update our line and column counters based on the character being
  // consumed.
  if (current_char_ == '\n') {
    ++line_;
    column_ = 0;
  } else if (current_char_ == '\t') {
    column_ += kTabWidth - column_ % kTabWidth;
  } else {
    ++column_;
  }

  // Advance to the next character.
  ++buffer_pos_;
  if (buffer_pos_ < buffer_size_) {
    current_char_ = buffer_[buffer_pos_];
  } else {
    Refresh();
  }
}

void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);

  buffer_ = static_cast<const char*>(data);
  current_char_ = buffer_[0];
}

inline void Tokenizer::RecordTo(string* target) {
  record_target_ = target;
  record_start_ = buffer_pos_;
}

inline void Tokenizer::StopRecording() {
  // Note: The if() is necessary because some STL implementations crash when
  // you call string::append(NULL, 0), presumably because they are trying to
  // be helpful by detecting the NULL pointer, even though there's nothing
  // wrong with reading zero bytes from NULL.
  if (buffer_pos_ != record_start_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_pos_ - record_start_);
  }
  record_target_ = NULL;
  record_start_ = -1;
}

inline void Tokenizer::StartToken() {
  current_.type = TYPE_START;  // Just for the sake of initializing it.
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  RecordTo(&current_.text);
}

inline void Tokenizer::EndToken() {
  StopRecording();
  current_.end_column = column_;
}

// -------------------------------------------------------------------
// Helper methods that consume characters.

template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}

template<typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() {
  if (CharacterClass::InClass(current_char_)) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

inline bool Tokenizer::TryConsume(char c) {
  if (current_char_ == c) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}

template<typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}

// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x') || TryConsume('X')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}

Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
          "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
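
// Taken together, the branches above mean that ConsumeNumber() accepts, for
// example, "0x1f" (hex), "0755" (octal), and "123", "1.5", ".5", "1e10",
// "1.5e-3" (decimal/float), plus a trailing 'f' or 'F' on floats when
// allow_f_after_float_ is set. Malformed forms such as "08" are still
// consumed but reported through AddError(), and a stray second '.' after a
// float triggers an error as well.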

void Tokenizer::ConsumeLineComment(string* content) {
  if (content != NULL) RecordTo(content);

  while (current_char_ != '\0' && current_char_ != '\n') {
    NextChar();
  }
  TryConsume('\n');

  if (content != NULL) StopRecording();
}

void Tokenizer::ConsumeBlockComment(string* content) {
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();
      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note: We didn't consume the '*' because if there is a '/' after it
      // we want to interpret that as the end of the comment.
      AddError(
          "\"/*\" inside block comment. Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
          start_line, start_column, " Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}

Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash. Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}

// -------------------------------------------------------------------

bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too. But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error. We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
                "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set.
        if (current_char_ & 0x80) {
          error_collector_->AddError(line_, column_,
              StringPrintf("Interpreting non ascii codepoint %d.",
                           static_cast<unsigned char>(current_char_)));
        }
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}

namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed. When Flush() is called, the
// current comment goes into either prev_trailing_comments or
// detached_comments. When the CommentCollector is destroyed, the last buffered
// comment goes into next_leading_comments.
class CommentCollector {
 public:
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;
  string comment_buffer_;

  // True if any comments were read into comment_buffer_. This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

}  // namespace

bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Ignore a Unicode byte order mark (BOM) if it appears at the beginning
    // of the file. Only the UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
                 "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line. If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line. There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope. In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}

// -------------------------------------------------------------------
// Token-parsing helpers. Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing. Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard. I hate the C standard library. :(
  //   return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
        << " Tokenizer::ParseInteger() passed text that could not have been"
           " tokenized as an integer: " << CEscape(text);
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }

  *output = result;
  return true;
}
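
// For example (assuming the kuint64max constant from the protobuf stubs
// headers as the limit): ParseInteger("0x1f", kuint64max, &value) sets value
// to 31, ParseInteger("0755", kuint64max, &value) sets it to 493, and
// ParseInteger("18446744073709551616", kuint64max, &value) returns false
// because the value would overflow a uint64.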

double Tokenizer::ParseFloat(const string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token. We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
      << " Tokenizer::ParseFloat() passed text that could not have been"
         " tokenized as a float: " << CEscape(text);
  return result;
}
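
// For example, ParseFloat("1.5") returns 1.5 and ParseFloat("2e3") returns
// 2000.0. The fix-ups above also let error-recovery tokens through:
// ParseFloat("1e") returns 1.0 and, when allow_f_after_float_ was enabled,
// ParseFloat("3.5f") returns 3.5, in both cases without tripping the DFATAL
// check.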

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, string* output) {
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    tmp = 0x0000c080 |
        ((code_point & 0x07c0) << 2) |
        (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    tmp = 0x00e08080 |
        ((code_point & 0xf000) << 4) |
        ((code_point & 0x0fc0) << 2) |
        (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    tmp = 0xf0808080 |
        ((code_point & 0x1c0000) << 6) |
        ((code_point & 0x03f000) << 4) |
        ((code_point & 0x000fc0) << 2) |
        (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}
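
// For example, AppendUTF8(0x24, &s) appends the single byte 0x24 ('$'),
// AppendUTF8(0xe9, &s) appends the two bytes 0xC3 0xA9 (U+00E9), and
// AppendUTF8(0x1f600, &s) appends the four bytes 0xF0 0x9F 0x98 0x80.
// Anything above 0x1fffff falls through to the "\U%08x" escape instead.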

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result. Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a
// trail surrogate. These numbers are in a reserved range of Unicode code
// points, so if we encounter such a pair we know how to parse it and convert
// it into a single code point.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;

static inline bool IsHeadSurrogate(uint32 code_point) {
  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
}

static inline bool IsTrailSurrogate(uint32 code_point) {
  return (code_point >= kMinTrailSurrogate) &&
         (code_point < kMaxTrailSurrogate);
}

// Combine a head and trail surrogate into a single Unicode code point.
static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
                    (trail_surrogate - kMinTrailSurrogate));
}
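
// For example, U+1F600 is written as the escape pair "\ud83d\ude00";
// AssembleUTF16(0xd83d, 0xde00) recovers it:
//   0x10000 + (((0xd83d - 0xd800) << 10) | (0xde00 - 0xdc00))
//     == 0x10000 + ((0x3d << 10) | 0x200)
//     == 0x1f600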

// Convert the escape sequence parameter to a number of expected hex digits.
static inline int UnicodeLength(char key) {
  if (key == 'u') return 4;
  if (key == 'U') return 8;
  return 0;
}

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
// to parse that sequence. On success, returns a pointer to the first char
// beyond that sequence, and fills in *code_point. On failure, returns ptr
// itself.
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point))
    return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point. Trail surrogates may only use \u, not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}

// The text string must begin and end with single or double quote
// characters.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
  // Reminder: text[0] is always a quote character. (If text is
  // empty, it's invalid, so we'll just return).
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL)
        << " Tokenizer::ParseStringAppend() passed text that could not"
           " have been tokenized as a string: " << CEscape(text);
    return;
  }

  // Reserve room for new string. The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences. Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;
      if (OctalDigit::InClass(*ptr)) {
        // An octal escape. May have one, two, or three digits.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));
      } else if (*ptr == 'x') {
        // A hex escape. May have zero, one, or two digits. (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));
      } else if (*ptr == 'u' || *ptr == 'U') {
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
      }
    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      output->push_back(*ptr);
    }
  }
}
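
// For example, given the string token
//   "a\n\x41\u00e9"
// (quotes included, exactly as ConsumeString() recorded it),
// ParseStringAppend() appends the bytes 'a', '\n', 'A', 0xC3, 0xA9 to *output;
// the surrounding quotes are dropped.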

template<typename CharacterClass>
static bool AllInClass(const string& s) {
  for (int i = 0; i < s.size(); ++i) {
    if (!CharacterClass::InClass(s[i]))
      return false;
  }
  return true;
}

bool Tokenizer::IsIdentifier(const string& text) {
  // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
  if (text.size() == 0)
    return false;
  if (!Letter::InClass(text.at(0)))
    return false;
  if (!AllInClass<Alphanumeric>(text.substr(1)))
    return false;
  return true;
}
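
// For example, IsIdentifier("foo_bar1") and IsIdentifier("_x") return true,
// while IsIdentifier(""), IsIdentifier("1foo"), and IsIdentifier("foo bar")
// return false.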

}  // namespace io
}  // namespace protobuf
}  // namespace google