// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest. This is
// temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
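//
// TEST_2D works the same way over all pairs drawn from two arrays, naming
// each case parameter after its array. As a sketch (kOuterCases and
// kInnerCases are hypothetical arrays, not defined in this file):
//
// int kOuterCases[] = {1, 2};
// int kInnerCases[] = {10, 20};
// TEST_2D(MyFixture, MyPairTest, kOuterCases, kInnerCases) {
//   EXPECT_LT(kOuterCases_case, kInnerCases_case);
// }
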
#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {           \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                    \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                  \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer on every call whose zero-based index is
    // divisible by 3 or 5 (including the very first call).
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

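// For example, TestInputStream(data, size, 3) feeds the tokenizer at most
// three bytes at a time, interleaved with the empty buffers produced above,
// so reads are forced to cross buffer boundaries.
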
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",      Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",        Tokenizer::TYPE_INTEGER },
  { "0xab6",      Tokenizer::TYPE_INTEGER },
  { "0XAB6",      Tokenizer::TYPE_INTEGER },
  { "0X1234567",  Tokenizer::TYPE_INTEGER },
  { "0x89abcdef", Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF", Tokenizer::TYPE_INTEGER },
  { "01234567",   Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",     Tokenizer::TYPE_FLOAT },
  { "1.",         Tokenizer::TYPE_FLOAT },
  { "1e3",        Tokenizer::TYPE_FLOAT },
  { "1E3",        Tokenizer::TYPE_FLOAT },
  { "1e-3",       Tokenizer::TYPE_FLOAT },
  { "1e+3",       Tokenizer::TYPE_FLOAT },
  { "1.e3",       Tokenizer::TYPE_FLOAT },
  { "1.2e3",      Tokenizer::TYPE_FLOAT },
  { ".1",         Tokenizer::TYPE_FLOAT },
  { ".1e3",       Tokenizer::TYPE_FLOAT },
  { ".1e-3",      Tokenizer::TYPE_FLOAT },
  { ".1e+3",      Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",    Tokenizer::TYPE_STRING },
  { "\"foo\"",    Tokenizer::TYPE_STRING },
  { "'a\"b'",     Tokenizer::TYPE_STRING },
  { "\"a'b\"",    Tokenizer::TYPE_STRING },
  { "'a\\'b'",    Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"", Tokenizer::TYPE_STRING },
  { "'\\xf'",     Tokenizer::TYPE_STRING },
  { "'\\0'",      Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",          Tokenizer::TYPE_SYMBOL },
  { ".",          Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work. There
                                // is no reason this can't be increased if
                                // needed.
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

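// Each expected token below is written as
//   { type, text, line, column, end_column },
// in the same order the test checks the fields of Tokenizer::Token.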
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

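// Each case below lists, in order: the input, the expected
// prev_trailing_comments, the expected detached_comments (the
// zero-initialized tail of the array marks the end of the list), and the
// expected next_leading_comments.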
DocCommentCase kDocCommentCases[] = {
  {
    "prev next",
    "",
    {},
    ""
  },
  {
    "prev /* ignored */ next",
    "",
    {},
    ""
  },
  {
    "prev // trailing comment\n"
    "next",
    " trailing comment\n",
    {},
    ""
  },
  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",
    "",
    {},
    " leading comment\n"
    " line 2\n"
  },
  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",
    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },
  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",
    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },
  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",
    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },
  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",
    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },
  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",
    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },
  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",
    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },
  {
    "prev /**/\n"
    "// leading comment\n"
    "next",
    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  std::vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Guard against running off the end of the fixed-size expected array.
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes. Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error. Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

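// Each case below is written as { input, recoverable, expected error text }.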
ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\X' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  752. { "'\\u01' foo", true,
  753. "0:5: Expected four hex digits for \\u escape sequence.\n" },
  754. { "'\\uXYZ' foo", true,
  755. "0:3: Expected four hex digits for \\u escape sequence.\n" },
  756. // Integer errors.
  757. { "123foo", true,
  758. "0:3: Need space between number and identifier.\n" },
  759. // Hex/octal errors.
  760. { "0x foo", true,
  761. "0:2: \"0x\" must be followed by hex digits.\n" },
  762. { "0541823 foo", true,
  763. "0:4: Numbers starting with leading zero must be in octal.\n" },
  764. { "0x123z foo", true,
  765. "0:5: Need space between number and identifier.\n" },
  766. { "0x123.4 foo", true,
  767. "0:5: Hex and octal numbers must be integers.\n" },
  768. { "0123.4 foo", true,
  769. "0:4: Hex and octal numbers must be integers.\n" },
  770. // Float errors.
  771. { "1e foo", true,
  772. "0:2: \"e\" must be followed by exponent.\n" },
  773. { "1e- foo", true,
  774. "0:3: \"e\" must be followed by exponent.\n" },
  775. { "1.2.3 foo", true,
  776. "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  777. { "1e2.3 foo", true,
  778. "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  779. { "a.1 foo", true,
  780. "0:1: Need space between identifier and decimal point.\n" },
  781. // allow_f_after_float not enabled, so this should be an error.
  782. { "1.0f foo", true,
  783. "0:3: Need space between number and identifier.\n" },
  784. // Block comment errors.
  785. { "/*", false,
  786. "0:2: End-of-file inside block comment.\n"
  787. "0:0: Comment started here.\n"},
  788. { "/*/*/ foo", true,
  789. "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},
  790. // Control characters. Multiple consecutive control characters should only
  791. // produce one error.
  792. { "\b foo", true,
  793. "0:0: Invalid control characters encountered in text.\n" },
  794. { "\b\b foo", true,
  795. "0:0: Invalid control characters encountered in text.\n" },
  796. // Check that control characters at end of input don't result in an
  797. // infinite loop.
  798. { "\b", false,
  799. "0:0: Invalid control characters encountered in text.\n" },
  800. // Check recovery from '\0'. We have to explicitly specify the length of
  801. // these strings because otherwise the string constructor will just call
  802. // strlen() which will see the first '\0' and think that is the end of the
  803. // string.
  804. { string("\0foo", 4), true,
  805. "0:0: Invalid control characters encountered in text.\n" },
  806. { string("\0\0foo", 5), true,
  807. "0:0: Invalid control characters encountered in text.\n" },
  808. // Check error from high order bits set
  809. { "\300foo", true,
  810. "0:0: Interpreting non ascii codepoint 192.\n" },
  811. };
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);
    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google