tokenizer.cc
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//   Based on original Protocol Buffers design by
//   Sanjay Ghemawat, Jeff Dean, and others.
//
// Here we have a hand-written lexer. At first you might ask yourself,
// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
// yes I am crazy, but that's beside the point. There are actually reasons
// why I ended up writing it this way.
//
// The traditional approach to lexing is to use lex to generate a lexer for
// you. Unfortunately, lex's output is ridiculously ugly and difficult to
// integrate cleanly with C++ code, especially abstract code or code meant
// as a library. Better parser-generators exist but would add dependencies
// which most users won't already have, which we'd like to avoid. (GNU flex
// has a C++ output option, but it's still ridiculously ugly, non-abstract,
// and not library-friendly.)
//
// The next approach that any good software engineer should look at is to
// use regular expressions. And, indeed, I did. I have code which
// implements this same class using regular expressions. It's about 200
// lines shorter. However:
// - Rather than error messages telling you "This string has an invalid
//   escape sequence at line 5, column 45", you get error messages like
//   "Parse error on line 5". Giving more precise errors requires adding
//   a lot of code that ends up basically as complex as the hand-coded
//   version anyway.
// - The regular expression to match a string literal looks like this:
//     kString = new RE("(\"([^\"\\\\]|"             // non-escaped
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"  // normal escape
//                      "\\\\x[0-9a-fA-F])*\"|"      // hex escape
//                      "\'([^\'\\\\]|"              // Also support single-quotes.
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"
//                      "\\\\x[0-9a-fA-F])*\')");
//   Verifying the correctness of this line noise is actually harder than
//   verifying the correctness of ConsumeString(), defined below. I'm not
//   even confident that the above is correct, after staring at it for some
//   time.
// - PCRE is fast, but there's still more overhead involved than the code
//   below.
// - Sadly, regular expressions are not part of the C standard library, so
//   using them would require depending on some other library. For the
//   open source release, this could be really annoying. Nobody likes
//   downloading one piece of software just to find that they need to
//   download something else to make it work, and in all likelihood
//   people downloading Protocol Buffers will already be doing so just
//   to make something else work. We could include a copy of PCRE with
//   our code, but that obligates us to keep it up-to-date and just seems
//   like a big waste just to save 200 lines of code.
//
// On a similar but unrelated note, I'm even scared to use ctype.h.
// Apparently functions like isalpha() are locale-dependent. So, if we used
// that, then if this code is being called from some program that doesn't
// have its locale set to "C", it would behave strangely. We can't just set
// the locale to "C" ourselves since we might break the calling program that
// way, particularly if it is multi-threaded. WTF? Someone please let me
// (Kenton) know if I'm missing something here...
//
// I'd love to hear about other alternatives, though, as this code isn't
// exactly pretty.
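//
// A minimal usage sketch, for orientation only. SimpleErrorCollector is a
// hypothetical helper written for this comment; Tokenizer, ErrorCollector,
// and ArrayInputStream are the real classes declared in tokenizer.h and
// zero_copy_stream_impl_lite.h, but check those headers for the exact
// signatures rather than trusting this comment:
//
//   class SimpleErrorCollector : public io::ErrorCollector {
//    public:
//     void AddError(int line, int column, const string& message) {
//       // Line and column numbers are zero-based; add 1 for human readers.
//       GOOGLE_LOG(ERROR) << "line " << line + 1 << ", column " << column + 1
//                         << ": " << message;
//     }
//   };
//
//   ArrayInputStream raw_input(text.data(), text.size());
//   SimpleErrorCollector error_collector;
//   Tokenizer tokenizer(&raw_input, &error_collector);
//   while (tokenizer.Next()) {
//     const Tokenizer::Token& token = tokenizer.current();
//     // token.type, token.text, token.line, and token.column describe the
//     // token just read.
//   }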
#include <google/protobuf/io/tokenizer.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/stringprintf.h>
#include <google/protobuf/io/strtod.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/stl_util.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// As mentioned above, I don't trust ctype.h due to the presence of "locales".
// So, I have written replacement functions here. Someone please smack me if
// this is a bad idea or if there is some way around this.
//
// These "character classes" are designed to be used in template methods.
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

#define CHARACTER_CLASS(NAME, EXPRESSION)    \
  class NAME {                               \
   public:                                   \
    static inline bool InClass(char c) {     \
      return EXPRESSION;                     \
    }                                        \
  }

CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                            c == '\r' || c == '\v' || c == '\f');

CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
                                     c == '\r' || c == '\v' || c == '\f');

CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
                          ('a' <= c && c <= 'f') ||
                          ('A' <= c && c <= 'F'));

CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
                        ('A' <= c && c <= 'Z') ||
                        (c == '_'));

CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                              ('A' <= c && c <= 'Z') ||
                              ('0' <= c && c <= '9') ||
                              (c == '_'));

CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                        c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                        c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.
inline int DigitValue(char digit) {
  if ('0' <= digit && digit <= '9') return digit - '0';
  if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
  if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
  return -1;
}
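
// For example: DigitValue('7') == 7, DigitValue('f') == DigitValue('F') == 15,
// DigitValue('z') == 35, and DigitValue('$') == -1. Callers check the result
// against whatever base they are parsing (see ParseInteger() below).
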
// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  switch (c) {
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    case '\\': return '\\';
    case '?':  return '\?';    // Trigraphs = :(
    case '\'': return '\'';
    case '"':  return '\"';

    // We expect escape sequences to have been validated separately.
    default:   return '?';
  }
}

}  // anonymous namespace

ErrorCollector::~ErrorCollector() {}

// ===================================================================

Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
    : input_(input),
      error_collector_(error_collector),
      buffer_(NULL),
      buffer_size_(0),
      buffer_pos_(0),
      read_error_(false),
      line_(0),
      column_(0),
      record_target_(NULL),
      record_start_(-1),
      allow_f_after_float_(false),
      comment_style_(CPP_COMMENT_STYLE),
      require_space_after_number_(true),
      allow_multiline_strings_(false) {
  current_.line = 0;
  current_.column = 0;
  current_.end_column = 0;
  current_.type = TYPE_START;

  Refresh();
}

Tokenizer::~Tokenizer() {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (buffer_size_ > buffer_pos_) {
    input_->BackUp(buffer_size_ - buffer_pos_);
  }
}

// -------------------------------------------------------------------
// Internal helpers.

void Tokenizer::NextChar() {
  // Update our line and column counters based on the character being
  // consumed.
  if (current_char_ == '\n') {
    ++line_;
    column_ = 0;
  } else if (current_char_ == '\t') {
    column_ += kTabWidth - column_ % kTabWidth;
  } else {
    ++column_;
  }

  // Advance to the next character.
  ++buffer_pos_;
  if (buffer_pos_ < buffer_size_) {
    current_char_ = buffer_[buffer_pos_];
  } else {
    Refresh();
  }
}
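
// A worked example of the tab arithmetic in NextChar() above: with
// kTabWidth == 8 (see its definition in tokenizer.h), consuming a '\t' at
// column_ == 3 adds 8 - 3 % 8 == 5, moving the column to 8, i.e. the next
// multiple of kTabWidth.
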
void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);

  buffer_ = static_cast<const char*>(data);
  current_char_ = buffer_[0];
}

inline void Tokenizer::RecordTo(string* target) {
  record_target_ = target;
  record_start_ = buffer_pos_;
}

inline void Tokenizer::StopRecording() {
  // Note: The if() is necessary because some STL implementations crash when
  // you call string::append(NULL, 0), presumably because they are trying to
  // be helpful by detecting the NULL pointer, even though there's nothing
  // wrong with reading zero bytes from NULL.
  if (buffer_pos_ != record_start_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_pos_ - record_start_);
  }
  record_target_ = NULL;
  record_start_ = -1;
}

inline void Tokenizer::StartToken() {
  current_.type = TYPE_START;  // Just for the sake of initializing it.
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  RecordTo(&current_.text);
}

inline void Tokenizer::EndToken() {
  StopRecording();
  current_.end_column = column_;
}

// -------------------------------------------------------------------
// Helper methods that consume characters.

template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}

template<typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() {
  if (CharacterClass::InClass(current_char_)) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

inline bool Tokenizer::TryConsume(char c) {
  if (current_char_ == c) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}

template<typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}

// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}

Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
          "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
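
// Illustrative examples of the numeric token forms ConsumeNumber() accepts,
// under the default option settings: "0x1f" and "012" come back as
// TYPE_INTEGER, while "1.5", ".5", "2e3", and "1.25E-7" come back as
// TYPE_FLOAT. "1f" is only accepted as a float once allow_f_after_float_ has
// been enabled (see set_allow_f_after_float() in tokenizer.h).
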
void Tokenizer::ConsumeLineComment(string* content) {
  if (content != NULL) RecordTo(content);

  while (current_char_ != '\0' && current_char_ != '\n') {
    NextChar();
  }
  TryConsume('\n');

  if (content != NULL) StopRecording();
}

void Tokenizer::ConsumeBlockComment(string* content) {
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note: We didn't consume the '*' because if there is a '/' after it
      // we want to interpret that as the end of the comment.
      AddError(
          "\"/*\" inside block comment. Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
          start_line, start_column, " Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}

Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash. Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}

// -------------------------------------------------------------------

bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too. But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error. We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
                "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set.
        if (current_char_ & 0x80) {
          error_collector_->AddError(line_, column_,
              StringPrintf("Interpreting non ascii codepoint %d.",
                           static_cast<unsigned char>(current_char_)));
        }
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}
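
// As a concrete example, running Next() repeatedly over the text
//   optional int32 foo = 1;
// produces TYPE_IDENTIFIER "optional", TYPE_IDENTIFIER "int32",
// TYPE_IDENTIFIER "foo", TYPE_SYMBOL "=", TYPE_INTEGER "1", TYPE_SYMBOL ";",
// and then Next() returns false with current().type == TYPE_END.
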
namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed. When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
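//
// To make the three destinations concrete, consider this illustrative .proto
// fragment (mirroring the NextWithComments() documentation in tokenizer.h):
//
//   optional int32 foo = 1;  // Ends up in prev_trailing_comments when the
//                            // tokens of the next declaration are read.
//
//   // A detached comment: blank lines separate it from the declarations on
//   // both sides, so it goes into detached_comments.
//
//   // Ends up in next_leading_comments for the following declaration.
//   optional int32 bar = 2;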
class CommentCollector {
 public:
  CommentCollector(string* prev_trailing_comments,
                   std::vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  std::vector<string>* detached_comments_;
  string* next_leading_comments_;
  string comment_buffer_;

  // True if any comments were read into comment_buffer_. This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

}  // namespace

bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 std::vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Ignore a Unicode byte order mark (BOM) if it appears at the file
    // beginning. Only the UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
                 "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a
        // trailing comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line. If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a
        // trailing comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line. There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope. In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}

// -------------------------------------------------------------------
// Token-parsing helpers. Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing. Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard. I hate the C standard library. :(

  // return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    if (digit < 0 || digit >= base) {
      // The token provided by the Tokenizer is invalid, i.e. 099 is an
      // invalid token, but the Tokenizer still thinks it is an integer.
      return false;
    }
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }
  *output = result;
  return true;
}
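
// Illustrative ParseInteger() calls (kuint64max here just stands for whatever
// maximum value the caller passes in):
//   uint64 value;
//   ParseInteger("0x1f", kuint64max, &value);  // true, value == 31
//   ParseInteger("012", kuint64max, &value);   // true, value == 10 (octal)
//   ParseInteger("099", kuint64max, &value);   // false: not valid octal
//   ParseInteger("255", 127, &value);          // false: exceeds max_value
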
double Tokenizer::ParseFloat(const string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token. We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
      << " Tokenizer::ParseFloat() passed text that could not have been"
         " tokenized as a float: " << CEscape(text);
  return result;
}
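
// Illustrative ParseFloat() calls:
//   ParseFloat("1.5")   == 1.5
//   ParseFloat("2e3")   == 2000.0
//   ParseFloat("1.25f") == 1.25  (trailing 'f' skipped, see above)
//   ParseFloat("1e")    == 1.0   (the tokenizer already reported the error)
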
// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, string* output) {
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    tmp = 0x0000c080 |
        ((code_point & 0x07c0) << 2) |
        (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    tmp = 0x00e08080 |
        ((code_point & 0xf000) << 4) |
        ((code_point & 0x0fc0) << 2) |
        (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    tmp = 0xf0808080 |
        ((code_point & 0x1c0000) << 6) |
        ((code_point & 0x03f000) << 4) |
        ((code_point & 0x000fc0) << 2) |
        (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}
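
// A worked example of the 3-byte branch in AppendUTF8() above: for U+20AC
// (the euro sign), tmp becomes 0x00e282ac, so after ghtonl() the last three
// bytes appended are 0xE2 0x82 0xAC, which is the UTF-8 encoding of U+20AC.
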
// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result. Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a
// trail surrogate. These numbers are in a reserved range of Unicode code
// points, so if we encounter such a pair we know how to parse it and convert
// it into a single code point.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;

static inline bool IsHeadSurrogate(uint32 code_point) {
  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
}

static inline bool IsTrailSurrogate(uint32 code_point) {
  return (code_point >= kMinTrailSurrogate) &&
         (code_point < kMaxTrailSurrogate);
}

// Combine a head and trail surrogate into a single Unicode code point.
static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
                    (trail_surrogate - kMinTrailSurrogate));
}
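
// For example, U+1F600 (a code point above 0xFFFF) is written in UTF-16 as
// the surrogate pair 0xD83D 0xDE00; AssembleUTF16(0xd83d, 0xde00) reverses
// that:
//   0x10000 + (((0xd83d - 0xd800) << 10) | (0xde00 - 0xdc00)) == 0x1f600
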
// Convert the escape sequence parameter to a number of expected hex digits.
static inline int UnicodeLength(char key) {
  if (key == 'u') return 4;
  if (key == 'U') return 8;
  return 0;
}

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence,
// attempt to parse that sequence. On success, returns a pointer to the first
// char beyond that sequence, and fills in *code_point. On failure, returns
// ptr itself.
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point))
    return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a
  // valid "trail surrogate," and together they form a UTF-16 pair which
  // decodes into a single Unicode point. Trail surrogates may only use \u,
  // not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}

// The text string must begin and end with single or double quote
// characters.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
  // Reminder: text[0] is always a quote character. (If text is
  // empty, it's invalid, so we'll just return.)
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL)
        << " Tokenizer::ParseStringAppend() passed text that could not"
           " have been tokenized as a string: " << CEscape(text);
    return;
  }

  // Reserve room for new string. The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences. Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;

      if (OctalDigit::InClass(*ptr)) {
        // An octal escape. May be one, two, or three digits.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'x') {
        // A hex escape. May be zero, one, or two digits. (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'u' || *ptr == 'U') {
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }

      } else {
        // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.

    } else {
      output->push_back(*ptr);
    }
  }
}
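
// Illustrative ParseStringAppend() behavior (the input includes the
// surrounding quotes, exactly as the tokenizer produced it):
//   string out;
//   Tokenizer::ParseStringAppend("\"a\\tb\"", &out);     // out == "a\tb"
//   Tokenizer::ParseStringAppend("'\\101\\x42'", &out);  // appends "AB"
//   Tokenizer::ParseStringAppend("\"\\u20ac\"", &out);   // appends the UTF-8
//                                                        // bytes of U+20AC
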
template<typename CharacterClass>
static bool AllInClass(const string& s) {
  for (int i = 0; i < s.size(); ++i) {
    if (!CharacterClass::InClass(s[i]))
      return false;
  }
  return true;
}

bool Tokenizer::IsIdentifier(const string& text) {
  // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
  if (text.size() == 0)
    return false;
  if (!Letter::InClass(text.at(0)))
    return false;
  if (!AllInClass<Alphanumeric>(text.substr(1)))
    return false;
  return true;
}

}  // namespace io
}  // namespace protobuf
}  // namespace google