strutil.cc 77 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235
  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc. All rights reserved.
  3. // https://developers.google.com/protocol-buffers/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are
  7. // met:
  8. //
  9. // * Redistributions of source code must retain the above copyright
  10. // notice, this list of conditions and the following disclaimer.
  11. // * Redistributions in binary form must reproduce the above
  12. // copyright notice, this list of conditions and the following disclaimer
  13. // in the documentation and/or other materials provided with the
  14. // distribution.
  15. // * Neither the name of Google Inc. nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. // from google3/strings/strutil.cc
  31. #include <google/protobuf/stubs/strutil.h>
  32. #include <google/protobuf/stubs/mathlimits.h>
  33. #include <errno.h>
  34. #include <float.h> // FLT_DIG and DBL_DIG
  35. #include <limits>
  36. #include <limits.h>
  37. #include <stdio.h>
  38. #include <iterator>
  39. #include <google/protobuf/stubs/stl_util.h>
  40. #ifdef _WIN32
  41. // MSVC has only _snprintf, not snprintf.
  42. //
  43. // MinGW has both snprintf and _snprintf, but they appear to be different
  44. // functions. The former is buggy. When invoked like so:
  45. // char buffer[32];
  46. // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
  47. // it prints "1.23000e+10". This is plainly wrong: %g should never print
  48. // trailing zeros after the decimal point. For some reason this bug only
  49. // occurs with some input values, not all. In any case, _snprintf does the
  50. // right thing, so we use it.
  51. #define snprintf _snprintf
  52. #endif
  53. namespace google {
  54. namespace protobuf {
  55. // These are defined as macros on some platforms. #undef them so that we can
  56. // redefine them.
  57. #undef isxdigit
  58. #undef isprint
  59. // The definitions of these in ctype.h change based on locale. Since our
  60. // string manipulation is all in relation to the protocol buffer and C++
  61. // languages, we always want to use the C locale. So, we re-define these
  62. // exactly as we want them.
  63. inline bool isxdigit(char c) {
  64. return ('0' <= c && c <= '9') ||
  65. ('a' <= c && c <= 'f') ||
  66. ('A' <= c && c <= 'F');
  67. }
  68. inline bool isprint(char c) {
  69. return c >= 0x20 && c <= 0x7E;
  70. }
  71. // ----------------------------------------------------------------------
  72. // StripString
  73. // Replaces any occurrence of the character 'remove' (or the characters
  74. // in 'remove') with the character 'replacewith'.
  75. // ----------------------------------------------------------------------
  76. void StripString(string* s, const char* remove, char replacewith) {
  77. const char * str_start = s->c_str();
  78. const char * str = str_start;
  79. for (str = strpbrk(str, remove);
  80. str != NULL;
  81. str = strpbrk(str + 1, remove)) {
  82. (*s)[str - str_start] = replacewith;
  83. }
  84. }
  85. void StripWhitespace(string* str) {
  86. int str_length = str->length();
  87. // Strip off leading whitespace.
  88. int first = 0;
  89. while (first < str_length && ascii_isspace(str->at(first))) {
  90. ++first;
  91. }
  92. // If entire string is white space.
  93. if (first == str_length) {
  94. str->clear();
  95. return;
  96. }
  97. if (first > 0) {
  98. str->erase(0, first);
  99. str_length -= first;
  100. }
  101. // Strip off trailing whitespace.
  102. int last = str_length - 1;
  103. while (last >= 0 && ascii_isspace(str->at(last))) {
  104. --last;
  105. }
  106. if (last != (str_length - 1) && last >= 0) {
  107. str->erase(last + 1, string::npos);
  108. }
  109. }
  110. // ----------------------------------------------------------------------
  111. // StringReplace()
  112. // Replace the "old" pattern with the "new" pattern in a string,
  113. // and append the result to "res". If replace_all is false,
  114. // it only replaces the first instance of "old."
  115. // ----------------------------------------------------------------------
  116. void StringReplace(const string& s, const string& oldsub,
  117. const string& newsub, bool replace_all,
  118. string* res) {
  119. if (oldsub.empty()) {
  120. res->append(s); // if empty, append the given string.
  121. return;
  122. }
  123. string::size_type start_pos = 0;
  124. string::size_type pos;
  125. do {
  126. pos = s.find(oldsub, start_pos);
  127. if (pos == string::npos) {
  128. break;
  129. }
  130. res->append(s, start_pos, pos - start_pos);
  131. res->append(newsub);
  132. start_pos = pos + oldsub.size(); // start searching again after the "old"
  133. } while (replace_all);
  134. res->append(s, start_pos, s.length() - start_pos);
  135. }
  136. // ----------------------------------------------------------------------
  137. // StringReplace()
  138. // Give me a string and two patterns "old" and "new", and I replace
  139. // the first instance of "old" in the string with "new", if it
  140. // exists. If "global" is true; call this repeatedly until it
  141. // fails. RETURN a new string, regardless of whether the replacement
  142. // happened or not.
  143. // ----------------------------------------------------------------------
  144. string StringReplace(const string& s, const string& oldsub,
  145. const string& newsub, bool replace_all) {
  146. string ret;
  147. StringReplace(s, oldsub, newsub, replace_all, &ret);
  148. return ret;
  149. }
  150. // ----------------------------------------------------------------------
  151. // SplitStringUsing()
  152. // Split a string using a character delimiter. Append the components
  153. // to 'result'.
  154. //
  155. // Note: For multi-character delimiters, this routine will split on *ANY* of
  156. // the characters in the string, not the entire string as a single delimiter.
  157. // ----------------------------------------------------------------------
  158. template <typename ITR>
  159. static inline
  160. void SplitStringToIteratorUsing(const string& full,
  161. const char* delim,
  162. ITR& result) {
  163. // Optimize the common case where delim is a single character.
  164. if (delim[0] != '\0' && delim[1] == '\0') {
  165. char c = delim[0];
  166. const char* p = full.data();
  167. const char* end = p + full.size();
  168. while (p != end) {
  169. if (*p == c) {
  170. ++p;
  171. } else {
  172. const char* start = p;
  173. while (++p != end && *p != c);
  174. *result++ = string(start, p - start);
  175. }
  176. }
  177. return;
  178. }
  179. string::size_type begin_index, end_index;
  180. begin_index = full.find_first_not_of(delim);
  181. while (begin_index != string::npos) {
  182. end_index = full.find_first_of(delim, begin_index);
  183. if (end_index == string::npos) {
  184. *result++ = full.substr(begin_index);
  185. return;
  186. }
  187. *result++ = full.substr(begin_index, (end_index - begin_index));
  188. begin_index = full.find_first_not_of(delim, end_index);
  189. }
  190. }
  191. void SplitStringUsing(const string& full,
  192. const char* delim,
  193. vector<string>* result) {
  194. back_insert_iterator< vector<string> > it(*result);
  195. SplitStringToIteratorUsing(full, delim, it);
  196. }
  197. // Split a string using a character delimiter. Append the components
  198. // to 'result'. If there are consecutive delimiters, this function
  199. // will return corresponding empty strings. The string is split into
  200. // at most the specified number of pieces greedily. This means that the
  201. // last piece may possibly be split further. To split into as many pieces
  202. // as possible, specify 0 as the number of pieces.
  203. //
  204. // If "full" is the empty string, yields an empty string as the only value.
  205. //
  206. // If "pieces" is negative for some reason, it returns the whole string
  207. // ----------------------------------------------------------------------
  208. template <typename StringType, typename ITR>
  209. static inline
  210. void SplitStringToIteratorAllowEmpty(const StringType& full,
  211. const char* delim,
  212. int pieces,
  213. ITR& result) {
  214. string::size_type begin_index, end_index;
  215. begin_index = 0;
  216. for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
  217. end_index = full.find_first_of(delim, begin_index);
  218. if (end_index == string::npos) {
  219. *result++ = full.substr(begin_index);
  220. return;
  221. }
  222. *result++ = full.substr(begin_index, (end_index - begin_index));
  223. begin_index = end_index + 1;
  224. }
  225. *result++ = full.substr(begin_index);
  226. }
  227. void SplitStringAllowEmpty(const string& full, const char* delim,
  228. vector<string>* result) {
  229. back_insert_iterator<vector<string> > it(*result);
  230. SplitStringToIteratorAllowEmpty(full, delim, 0, it);
  231. }
  232. // ----------------------------------------------------------------------
  233. // JoinStrings()
  234. // This merges a vector of string components with delim inserted
  235. // as separaters between components.
  236. //
  237. // ----------------------------------------------------------------------
  238. template <class ITERATOR>
  239. static void JoinStringsIterator(const ITERATOR& start,
  240. const ITERATOR& end,
  241. const char* delim,
  242. string* result) {
  243. GOOGLE_CHECK(result != NULL);
  244. result->clear();
  245. int delim_length = strlen(delim);
  246. // Precompute resulting length so we can reserve() memory in one shot.
  247. int length = 0;
  248. for (ITERATOR iter = start; iter != end; ++iter) {
  249. if (iter != start) {
  250. length += delim_length;
  251. }
  252. length += iter->size();
  253. }
  254. result->reserve(length);
  255. // Now combine everything.
  256. for (ITERATOR iter = start; iter != end; ++iter) {
  257. if (iter != start) {
  258. result->append(delim, delim_length);
  259. }
  260. result->append(iter->data(), iter->size());
  261. }
  262. }
  263. void JoinStrings(const vector<string>& components,
  264. const char* delim,
  265. string * result) {
  266. JoinStringsIterator(components.begin(), components.end(), delim, result);
  267. }
  268. // ----------------------------------------------------------------------
  269. // UnescapeCEscapeSequences()
  270. // This does all the unescaping that C does: \ooo, \r, \n, etc
  271. // Returns length of resulting string.
  272. // The implementation of \x parses any positive number of hex digits,
  273. // but it is an error if the value requires more than 8 bits, and the
  274. // result is truncated to 8 bits.
  275. //
  276. // The second call stores its errors in a supplied string vector.
  277. // If the string vector pointer is NULL, it reports the errors with LOG().
  278. // ----------------------------------------------------------------------
  279. #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
  280. // Protocol buffers doesn't ever care about errors, but I don't want to remove
  281. // the code.
  282. #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
  283. int UnescapeCEscapeSequences(const char* source, char* dest) {
  284. return UnescapeCEscapeSequences(source, dest, NULL);
  285. }
  286. int UnescapeCEscapeSequences(const char* source, char* dest,
  287. vector<string> *errors) {
  288. GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
  289. char* d = dest;
  290. const char* p = source;
  291. // Small optimization for case where source = dest and there's no escaping
  292. while ( p == d && *p != '\0' && *p != '\\' )
  293. p++, d++;
  294. while (*p != '\0') {
  295. if (*p != '\\') {
  296. *d++ = *p++;
  297. } else {
  298. switch ( *++p ) { // skip past the '\\'
  299. case '\0':
  300. LOG_STRING(ERROR, errors) << "String cannot end with \\";
  301. *d = '\0';
  302. return d - dest; // we're done with p
  303. case 'a': *d++ = '\a'; break;
  304. case 'b': *d++ = '\b'; break;
  305. case 'f': *d++ = '\f'; break;
  306. case 'n': *d++ = '\n'; break;
  307. case 'r': *d++ = '\r'; break;
  308. case 't': *d++ = '\t'; break;
  309. case 'v': *d++ = '\v'; break;
  310. case '\\': *d++ = '\\'; break;
  311. case '?': *d++ = '\?'; break; // \? Who knew?
  312. case '\'': *d++ = '\''; break;
  313. case '"': *d++ = '\"'; break;
  314. case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
  315. case '4': case '5': case '6': case '7': {
  316. char ch = *p - '0';
  317. if ( IS_OCTAL_DIGIT(p[1]) )
  318. ch = ch * 8 + *++p - '0';
  319. if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
  320. ch = ch * 8 + *++p - '0'; // now points at last digit
  321. *d++ = ch;
  322. break;
  323. }
  324. case 'x': case 'X': {
  325. if (!isxdigit(p[1])) {
  326. if (p[1] == '\0') {
  327. LOG_STRING(ERROR, errors) << "String cannot end with \\x";
  328. } else {
  329. LOG_STRING(ERROR, errors) <<
  330. "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
  331. }
  332. break;
  333. }
  334. unsigned int ch = 0;
  335. const char *hex_start = p;
  336. while (isxdigit(p[1])) // arbitrarily many hex digits
  337. ch = (ch << 4) + hex_digit_to_int(*++p);
  338. if (ch > 0xFF)
  339. LOG_STRING(ERROR, errors) << "Value of " <<
  340. "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
  341. *d++ = ch;
  342. break;
  343. }
  344. #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
  345. case 'u': {
  346. // \uhhhh => convert 4 hex digits to UTF-8
  347. char32 rune = 0;
  348. const char *hex_start = p;
  349. for (int i = 0; i < 4; ++i) {
  350. if (isxdigit(p[1])) { // Look one char ahead.
  351. rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
  352. } else {
  353. LOG_STRING(ERROR, errors)
  354. << "\\u must be followed by 4 hex digits: \\"
  355. << string(hex_start, p+1-hex_start);
  356. break;
  357. }
  358. }
  359. d += runetochar(d, &rune);
  360. break;
  361. }
  362. case 'U': {
  363. // \Uhhhhhhhh => convert 8 hex digits to UTF-8
  364. char32 rune = 0;
  365. const char *hex_start = p;
  366. for (int i = 0; i < 8; ++i) {
  367. if (isxdigit(p[1])) { // Look one char ahead.
  368. // Don't change rune until we're sure this
  369. // is within the Unicode limit, but do advance p.
  370. char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
  371. if (newrune > 0x10FFFF) {
  372. LOG_STRING(ERROR, errors)
  373. << "Value of \\"
  374. << string(hex_start, p + 1 - hex_start)
  375. << " exceeds Unicode limit (0x10FFFF)";
  376. break;
  377. } else {
  378. rune = newrune;
  379. }
  380. } else {
  381. LOG_STRING(ERROR, errors)
  382. << "\\U must be followed by 8 hex digits: \\"
  383. << string(hex_start, p+1-hex_start);
  384. break;
  385. }
  386. }
  387. d += runetochar(d, &rune);
  388. break;
  389. }
  390. #endif
  391. default:
  392. LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
  393. }
  394. p++; // read past letter we escaped
  395. }
  396. }
  397. *d = '\0';
  398. return d - dest;
  399. }
  400. // ----------------------------------------------------------------------
  401. // UnescapeCEscapeString()
  402. // This does the same thing as UnescapeCEscapeSequences, but creates
  403. // a new string. The caller does not need to worry about allocating
  404. // a dest buffer. This should be used for non performance critical
  405. // tasks such as printing debug messages. It is safe for src and dest
  406. // to be the same.
  407. //
  408. // The second call stores its errors in a supplied string vector.
  409. // If the string vector pointer is NULL, it reports the errors with LOG().
  410. //
  411. // In the first and second calls, the length of dest is returned. In the
  412. // the third call, the new string is returned.
  413. // ----------------------------------------------------------------------
  414. int UnescapeCEscapeString(const string& src, string* dest) {
  415. return UnescapeCEscapeString(src, dest, NULL);
  416. }
  417. int UnescapeCEscapeString(const string& src, string* dest,
  418. vector<string> *errors) {
  419. scoped_array<char> unescaped(new char[src.size() + 1]);
  420. int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
  421. GOOGLE_CHECK(dest);
  422. dest->assign(unescaped.get(), len);
  423. return len;
  424. }
  425. string UnescapeCEscapeString(const string& src) {
  426. scoped_array<char> unescaped(new char[src.size() + 1]);
  427. int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
  428. return string(unescaped.get(), len);
  429. }
  430. // ----------------------------------------------------------------------
  431. // CEscapeString()
  432. // CHexEscapeString()
  433. // Copies 'src' to 'dest', escaping dangerous characters using
  434. // C-style escape sequences. This is very useful for preparing query
  435. // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
  436. // hexadecimal rather than octal sequences.
  437. // Returns the number of bytes written to 'dest' (not including the \0)
  438. // or -1 if there was insufficient space.
  439. //
  440. // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
  441. // ----------------------------------------------------------------------
  442. int CEscapeInternal(const char* src, int src_len, char* dest,
  443. int dest_len, bool use_hex, bool utf8_safe) {
  444. const char* src_end = src + src_len;
  445. int used = 0;
  446. bool last_hex_escape = false; // true if last output char was \xNN
  447. for (; src < src_end; src++) {
  448. if (dest_len - used < 2) // Need space for two letter escape
  449. return -1;
  450. bool is_hex_escape = false;
  451. switch (*src) {
  452. case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
  453. case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
  454. case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
  455. case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
  456. case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
  457. case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
  458. default:
  459. // Note that if we emit \xNN and the src character after that is a hex
  460. // digit then that digit must be escaped too to prevent it being
  461. // interpreted as part of the character code by C.
  462. if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
  463. (!isprint(*src) ||
  464. (last_hex_escape && isxdigit(*src)))) {
  465. if (dest_len - used < 4) // need space for 4 letter escape
  466. return -1;
  467. sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
  468. static_cast<uint8>(*src));
  469. is_hex_escape = use_hex;
  470. used += 4;
  471. } else {
  472. dest[used++] = *src; break;
  473. }
  474. }
  475. last_hex_escape = is_hex_escape;
  476. }
  477. if (dest_len - used < 1) // make sure that there is room for \0
  478. return -1;
  479. dest[used] = '\0'; // doesn't count towards return value though
  480. return used;
  481. }
  482. int CEscapeString(const char* src, int src_len, char* dest, int dest_len) {
  483. return CEscapeInternal(src, src_len, dest, dest_len, false, false);
  484. }
  485. // ----------------------------------------------------------------------
  486. // CEscape()
  487. // CHexEscape()
  488. // Copies 'src' to result, escaping dangerous characters using
  489. // C-style escape sequences. This is very useful for preparing query
  490. // flags. 'src' and 'dest' should not overlap. The 'Hex' version
  491. // hexadecimal rather than octal sequences.
  492. //
  493. // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
  494. // ----------------------------------------------------------------------
  495. string CEscape(const string& src) {
  496. const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
  497. scoped_array<char> dest(new char[dest_length]);
  498. const int len = CEscapeInternal(src.data(), src.size(),
  499. dest.get(), dest_length, false, false);
  500. GOOGLE_DCHECK_GE(len, 0);
  501. return string(dest.get(), len);
  502. }
  503. namespace strings {
  504. string Utf8SafeCEscape(const string& src) {
  505. const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
  506. scoped_array<char> dest(new char[dest_length]);
  507. const int len = CEscapeInternal(src.data(), src.size(),
  508. dest.get(), dest_length, false, true);
  509. GOOGLE_DCHECK_GE(len, 0);
  510. return string(dest.get(), len);
  511. }
  512. string CHexEscape(const string& src) {
  513. const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
  514. scoped_array<char> dest(new char[dest_length]);
  515. const int len = CEscapeInternal(src.data(), src.size(),
  516. dest.get(), dest_length, true, false);
  517. GOOGLE_DCHECK_GE(len, 0);
  518. return string(dest.get(), len);
  519. }
  520. } // namespace strings
  521. // ----------------------------------------------------------------------
  522. // strto32_adaptor()
  523. // strtou32_adaptor()
  524. // Implementation of strto[u]l replacements that have identical
  525. // overflow and underflow characteristics for both ILP-32 and LP-64
  526. // platforms, including errno preservation in error-free calls.
  527. // ----------------------------------------------------------------------
  528. int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
  529. const int saved_errno = errno;
  530. errno = 0;
  531. const long result = strtol(nptr, endptr, base);
  532. if (errno == ERANGE && result == LONG_MIN) {
  533. return kint32min;
  534. } else if (errno == ERANGE && result == LONG_MAX) {
  535. return kint32max;
  536. } else if (errno == 0 && result < kint32min) {
  537. errno = ERANGE;
  538. return kint32min;
  539. } else if (errno == 0 && result > kint32max) {
  540. errno = ERANGE;
  541. return kint32max;
  542. }
  543. if (errno == 0)
  544. errno = saved_errno;
  545. return static_cast<int32>(result);
  546. }
  547. uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
  548. const int saved_errno = errno;
  549. errno = 0;
  550. const unsigned long result = strtoul(nptr, endptr, base);
  551. if (errno == ERANGE && result == ULONG_MAX) {
  552. return kuint32max;
  553. } else if (errno == 0 && result > kuint32max) {
  554. errno = ERANGE;
  555. return kuint32max;
  556. }
  557. if (errno == 0)
  558. errno = saved_errno;
  559. return static_cast<uint32>(result);
  560. }
  561. inline bool safe_parse_sign(string* text /*inout*/,
  562. bool* negative_ptr /*output*/) {
  563. const char* start = text->data();
  564. const char* end = start + text->size();
  565. // Consume whitespace.
  566. while (start < end && (start[0] == ' ')) {
  567. ++start;
  568. }
  569. while (start < end && (end[-1] == ' ')) {
  570. --end;
  571. }
  572. if (start >= end) {
  573. return false;
  574. }
  575. // Consume sign.
  576. *negative_ptr = (start[0] == '-');
  577. if (*negative_ptr || start[0] == '+') {
  578. ++start;
  579. if (start >= end) {
  580. return false;
  581. }
  582. }
  583. *text = text->substr(start - text->data(), end - start);
  584. return true;
  585. }
  586. template<typename IntType>
  587. bool safe_parse_positive_int(
  588. string text, IntType* value_p) {
  589. int base = 10;
  590. IntType value = 0;
  591. const IntType vmax = std::numeric_limits<IntType>::max();
  592. assert(vmax > 0);
  593. assert(vmax >= base);
  594. const IntType vmax_over_base = vmax / base;
  595. const char* start = text.data();
  596. const char* end = start + text.size();
  597. // loop over digits
  598. for (; start < end; ++start) {
  599. unsigned char c = static_cast<unsigned char>(start[0]);
  600. int digit = c - '0';
  601. if (digit >= base || digit < 0) {
  602. *value_p = value;
  603. return false;
  604. }
  605. if (value > vmax_over_base) {
  606. *value_p = vmax;
  607. return false;
  608. }
  609. value *= base;
  610. if (value > vmax - digit) {
  611. *value_p = vmax;
  612. return false;
  613. }
  614. value += digit;
  615. }
  616. *value_p = value;
  617. return true;
  618. }
  619. template<typename IntType>
  620. bool safe_parse_negative_int(
  621. const string& text, IntType* value_p) {
  622. int base = 10;
  623. IntType value = 0;
  624. const IntType vmin = std::numeric_limits<IntType>::min();
  625. assert(vmin < 0);
  626. assert(vmin <= 0 - base);
  627. IntType vmin_over_base = vmin / base;
  628. // 2003 c++ standard [expr.mul]
  629. // "... the sign of the remainder is implementation-defined."
  630. // Although (vmin/base)*base + vmin%base is always vmin.
  631. // 2011 c++ standard tightens the spec but we cannot rely on it.
  632. if (vmin % base > 0) {
  633. vmin_over_base += 1;
  634. }
  635. const char* start = text.data();
  636. const char* end = start + text.size();
  637. // loop over digits
  638. for (; start < end; ++start) {
  639. unsigned char c = static_cast<unsigned char>(start[0]);
  640. int digit = c - '0';
  641. if (digit >= base || digit < 0) {
  642. *value_p = value;
  643. return false;
  644. }
  645. if (value < vmin_over_base) {
  646. *value_p = vmin;
  647. return false;
  648. }
  649. value *= base;
  650. if (value < vmin + digit) {
  651. *value_p = vmin;
  652. return false;
  653. }
  654. value -= digit;
  655. }
  656. *value_p = value;
  657. return true;
  658. }
  659. template<typename IntType>
  660. bool safe_int_internal(string text, IntType* value_p) {
  661. *value_p = 0;
  662. bool negative;
  663. if (!safe_parse_sign(&text, &negative)) {
  664. return false;
  665. }
  666. if (!negative) {
  667. return safe_parse_positive_int(text, value_p);
  668. } else {
  669. return safe_parse_negative_int(text, value_p);
  670. }
  671. }
  672. template<typename IntType>
  673. bool safe_uint_internal(string text, IntType* value_p) {
  674. *value_p = 0;
  675. bool negative;
  676. if (!safe_parse_sign(&text, &negative) || negative) {
  677. return false;
  678. }
  679. return safe_parse_positive_int(text, value_p);
  680. }
  681. // ----------------------------------------------------------------------
  682. // FastIntToBuffer()
  683. // FastInt64ToBuffer()
  684. // FastHexToBuffer()
  685. // FastHex64ToBuffer()
  686. // FastHex32ToBuffer()
  687. // ----------------------------------------------------------------------
  688. // Offset into buffer where FastInt64ToBuffer places the end of string
  689. // null character. Also used by FastInt64ToBufferLeft.
  690. static const int kFastInt64ToBufferOffset = 21;
  691. char *FastInt64ToBuffer(int64 i, char* buffer) {
  692. // We could collapse the positive and negative sections, but that
  693. // would be slightly slower for positive numbers...
  694. // 22 bytes is enough to store -2**64, -18446744073709551616.
  695. char* p = buffer + kFastInt64ToBufferOffset;
  696. *p-- = '\0';
  697. if (i >= 0) {
  698. do {
  699. *p-- = '0' + i % 10;
  700. i /= 10;
  701. } while (i > 0);
  702. return p + 1;
  703. } else {
  704. // On different platforms, % and / have different behaviors for
  705. // negative numbers, so we need to jump through hoops to make sure
  706. // we don't divide negative numbers.
  707. if (i > -10) {
  708. i = -i;
  709. *p-- = '0' + i;
  710. *p = '-';
  711. return p;
  712. } else {
  713. // Make sure we aren't at MIN_INT, in which case we can't say i = -i
  714. i = i + 10;
  715. i = -i;
  716. *p-- = '0' + i % 10;
  717. // Undo what we did a moment ago
  718. i = i / 10 + 1;
  719. do {
  720. *p-- = '0' + i % 10;
  721. i /= 10;
  722. } while (i > 0);
  723. *p = '-';
  724. return p;
  725. }
  726. }
  727. }
  728. // Offset into buffer where FastInt32ToBuffer places the end of string
  729. // null character. Also used by FastInt32ToBufferLeft
  730. static const int kFastInt32ToBufferOffset = 11;
  731. // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
  732. // compiler to generate 32 bit arithmetic instructions. It's much faster, at
  733. // least with 32 bit binaries.
  734. char *FastInt32ToBuffer(int32 i, char* buffer) {
  735. // We could collapse the positive and negative sections, but that
  736. // would be slightly slower for positive numbers...
  737. // 12 bytes is enough to store -2**32, -4294967296.
  738. char* p = buffer + kFastInt32ToBufferOffset;
  739. *p-- = '\0';
  740. if (i >= 0) {
  741. do {
  742. *p-- = '0' + i % 10;
  743. i /= 10;
  744. } while (i > 0);
  745. return p + 1;
  746. } else {
  747. // On different platforms, % and / have different behaviors for
  748. // negative numbers, so we need to jump through hoops to make sure
  749. // we don't divide negative numbers.
  750. if (i > -10) {
  751. i = -i;
  752. *p-- = '0' + i;
  753. *p = '-';
  754. return p;
  755. } else {
  756. // Make sure we aren't at MIN_INT, in which case we can't say i = -i
  757. i = i + 10;
  758. i = -i;
  759. *p-- = '0' + i % 10;
  760. // Undo what we did a moment ago
  761. i = i / 10 + 1;
  762. do {
  763. *p-- = '0' + i % 10;
  764. i /= 10;
  765. } while (i > 0);
  766. *p = '-';
  767. return p;
  768. }
  769. }
  770. }
  771. char *FastHexToBuffer(int i, char* buffer) {
  772. GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
  773. static const char *hexdigits = "0123456789abcdef";
  774. char *p = buffer + 21;
  775. *p-- = '\0';
  776. do {
  777. *p-- = hexdigits[i & 15]; // mod by 16
  778. i >>= 4; // divide by 16
  779. } while (i > 0);
  780. return p + 1;
  781. }
  782. char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
  783. static const char *hexdigits = "0123456789abcdef";
  784. buffer[num_byte] = '\0';
  785. for (int i = num_byte - 1; i >= 0; i--) {
  786. #ifdef _M_X64
  787. // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
  788. // block. Given that the uint32 cast was to improve performance on 32-bit
  789. // platforms, we use 64-bit '&' directly.
  790. buffer[i] = hexdigits[value & 0xf];
  791. #else
  792. buffer[i] = hexdigits[uint32(value) & 0xf];
  793. #endif
  794. value >>= 4;
  795. }
  796. return buffer;
  797. }
  798. char *FastHex64ToBuffer(uint64 value, char* buffer) {
  799. return InternalFastHexToBuffer(value, buffer, 16);
  800. }
  801. char *FastHex32ToBuffer(uint32 value, char* buffer) {
  802. return InternalFastHexToBuffer(value, buffer, 8);
  803. }
  804. // ----------------------------------------------------------------------
  805. // FastInt32ToBufferLeft()
  806. // FastUInt32ToBufferLeft()
  807. // FastInt64ToBufferLeft()
  808. // FastUInt64ToBufferLeft()
  809. //
  810. // Like the Fast*ToBuffer() functions above, these are intended for speed.
  811. // Unlike the Fast*ToBuffer() functions, however, these functions write
  812. // their output to the beginning of the buffer (hence the name, as the
  813. // output is left-aligned). The caller is responsible for ensuring that
  814. // the buffer has enough space to hold the output.
  815. //
  816. // Returns a pointer to the end of the string (i.e. the null character
  817. // terminating the string).
  818. // ----------------------------------------------------------------------
  819. static const char two_ASCII_digits[100][2] = {
  820. {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
  821. {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
  822. {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
  823. {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
  824. {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
  825. {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
  826. {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
  827. {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
  828. {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
  829. {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
  830. {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
  831. {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
  832. {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
  833. {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
  834. {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
  835. {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
  836. {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
  837. {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
  838. {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
  839. {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
  840. };
  841. char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
  842. int digits;
  843. const char *ASCII_digits = NULL;
  844. // The idea of this implementation is to trim the number of divides to as few
  845. // as possible by using multiplication and subtraction rather than mod (%),
  846. // and by outputting two digits at a time rather than one.
  847. // The huge-number case is first, in the hopes that the compiler will output
  848. // that case in one branch-free block of code, and only output conditional
  849. // branches into it from below.
  850. if (u >= 1000000000) { // >= 1,000,000,000
  851. digits = u / 100000000; // 100,000,000
  852. ASCII_digits = two_ASCII_digits[digits];
  853. buffer[0] = ASCII_digits[0];
  854. buffer[1] = ASCII_digits[1];
  855. buffer += 2;
  856. sublt100_000_000:
  857. u -= digits * 100000000; // 100,000,000
  858. lt100_000_000:
  859. digits = u / 1000000; // 1,000,000
  860. ASCII_digits = two_ASCII_digits[digits];
  861. buffer[0] = ASCII_digits[0];
  862. buffer[1] = ASCII_digits[1];
  863. buffer += 2;
  864. sublt1_000_000:
  865. u -= digits * 1000000; // 1,000,000
  866. lt1_000_000:
  867. digits = u / 10000; // 10,000
  868. ASCII_digits = two_ASCII_digits[digits];
  869. buffer[0] = ASCII_digits[0];
  870. buffer[1] = ASCII_digits[1];
  871. buffer += 2;
  872. sublt10_000:
  873. u -= digits * 10000; // 10,000
  874. lt10_000:
  875. digits = u / 100;
  876. ASCII_digits = two_ASCII_digits[digits];
  877. buffer[0] = ASCII_digits[0];
  878. buffer[1] = ASCII_digits[1];
  879. buffer += 2;
  880. sublt100:
  881. u -= digits * 100;
  882. lt100:
  883. digits = u;
  884. ASCII_digits = two_ASCII_digits[digits];
  885. buffer[0] = ASCII_digits[0];
  886. buffer[1] = ASCII_digits[1];
  887. buffer += 2;
  888. done:
  889. *buffer = 0;
  890. return buffer;
  891. }
  892. if (u < 100) {
  893. digits = u;
  894. if (u >= 10) goto lt100;
  895. *buffer++ = '0' + digits;
  896. goto done;
  897. }
  898. if (u < 10000) { // 10,000
  899. if (u >= 1000) goto lt10_000;
  900. digits = u / 100;
  901. *buffer++ = '0' + digits;
  902. goto sublt100;
  903. }
  904. if (u < 1000000) { // 1,000,000
  905. if (u >= 100000) goto lt1_000_000;
  906. digits = u / 10000; // 10,000
  907. *buffer++ = '0' + digits;
  908. goto sublt10_000;
  909. }
  910. if (u < 100000000) { // 100,000,000
  911. if (u >= 10000000) goto lt100_000_000;
  912. digits = u / 1000000; // 1,000,000
  913. *buffer++ = '0' + digits;
  914. goto sublt1_000_000;
  915. }
  916. // we already know that u < 1,000,000,000
  917. digits = u / 100000000; // 100,000,000
  918. *buffer++ = '0' + digits;
  919. goto sublt100_000_000;
  920. }
  921. char* FastInt32ToBufferLeft(int32 i, char* buffer) {
  922. uint32 u = i;
  923. if (i < 0) {
  924. *buffer++ = '-';
  925. u = -i;
  926. }
  927. return FastUInt32ToBufferLeft(u, buffer);
  928. }
  929. char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
  930. int digits;
  931. const char *ASCII_digits = NULL;
  932. uint32 u = static_cast<uint32>(u64);
  933. if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
  934. uint64 top_11_digits = u64 / 1000000000;
  935. buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
  936. u = u64 - (top_11_digits * 1000000000);
  937. digits = u / 10000000; // 10,000,000
  938. GOOGLE_DCHECK_LT(digits, 100);
  939. ASCII_digits = two_ASCII_digits[digits];
  940. buffer[0] = ASCII_digits[0];
  941. buffer[1] = ASCII_digits[1];
  942. buffer += 2;
  943. u -= digits * 10000000; // 10,000,000
  944. digits = u / 100000; // 100,000
  945. ASCII_digits = two_ASCII_digits[digits];
  946. buffer[0] = ASCII_digits[0];
  947. buffer[1] = ASCII_digits[1];
  948. buffer += 2;
  949. u -= digits * 100000; // 100,000
  950. digits = u / 1000; // 1,000
  951. ASCII_digits = two_ASCII_digits[digits];
  952. buffer[0] = ASCII_digits[0];
  953. buffer[1] = ASCII_digits[1];
  954. buffer += 2;
  955. u -= digits * 1000; // 1,000
  956. digits = u / 10;
  957. ASCII_digits = two_ASCII_digits[digits];
  958. buffer[0] = ASCII_digits[0];
  959. buffer[1] = ASCII_digits[1];
  960. buffer += 2;
  961. u -= digits * 10;
  962. digits = u;
  963. *buffer++ = '0' + digits;
  964. *buffer = 0;
  965. return buffer;
  966. }
  967. char* FastInt64ToBufferLeft(int64 i, char* buffer) {
  968. uint64 u = i;
  969. if (i < 0) {
  970. *buffer++ = '-';
  971. u = -i;
  972. }
  973. return FastUInt64ToBufferLeft(u, buffer);
  974. }
  975. // ----------------------------------------------------------------------
  976. // SimpleItoa()
  977. // Description: converts an integer to a string.
  978. //
  979. // Return value: string
  980. // ----------------------------------------------------------------------
  981. string SimpleItoa(int i) {
  982. char buffer[kFastToBufferSize];
  983. return (sizeof(i) == 4) ?
  984. FastInt32ToBuffer(i, buffer) :
  985. FastInt64ToBuffer(i, buffer);
  986. }
  987. string SimpleItoa(unsigned int i) {
  988. char buffer[kFastToBufferSize];
  989. return string(buffer, (sizeof(i) == 4) ?
  990. FastUInt32ToBufferLeft(i, buffer) :
  991. FastUInt64ToBufferLeft(i, buffer));
  992. }
  993. string SimpleItoa(long i) {
  994. char buffer[kFastToBufferSize];
  995. return (sizeof(i) == 4) ?
  996. FastInt32ToBuffer(i, buffer) :
  997. FastInt64ToBuffer(i, buffer);
  998. }
  999. string SimpleItoa(unsigned long i) {
  1000. char buffer[kFastToBufferSize];
  1001. return string(buffer, (sizeof(i) == 4) ?
  1002. FastUInt32ToBufferLeft(i, buffer) :
  1003. FastUInt64ToBufferLeft(i, buffer));
  1004. }
  1005. string SimpleItoa(long long i) {
  1006. char buffer[kFastToBufferSize];
  1007. return (sizeof(i) == 4) ?
  1008. FastInt32ToBuffer(i, buffer) :
  1009. FastInt64ToBuffer(i, buffer);
  1010. }
  1011. string SimpleItoa(unsigned long long i) {
  1012. char buffer[kFastToBufferSize];
  1013. return string(buffer, (sizeof(i) == 4) ?
  1014. FastUInt32ToBufferLeft(i, buffer) :
  1015. FastUInt64ToBufferLeft(i, buffer));
  1016. }
  1017. // ----------------------------------------------------------------------
  1018. // SimpleDtoa()
  1019. // SimpleFtoa()
  1020. // DoubleToBuffer()
  1021. // FloatToBuffer()
  1022. // We want to print the value without losing precision, but we also do
  1023. // not want to print more digits than necessary. This turns out to be
  1024. // trickier than it sounds. Numbers like 0.2 cannot be represented
  1025. // exactly in binary. If we print 0.2 with a very large precision,
  1026. // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
  1027. // On the other hand, if we set the precision too low, we lose
  1028. // significant digits when printing numbers that actually need them.
  1029. // It turns out there is no precision value that does the right thing
  1030. // for all numbers.
  1031. //
  1032. // Our strategy is to first try printing with a precision that is never
  1033. // over-precise, then parse the result with strtod() to see if it
  1034. // matches. If not, we print again with a precision that will always
  1035. // give a precise result, but may use more digits than necessary.
  1036. //
  1037. // An arguably better strategy would be to use the algorithm described
  1038. // in "How to Print Floating-Point Numbers Accurately" by Steele &
  1039. // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
  1040. // however, that the following implementation is about as fast as
  1041. // DMG's code. Furthermore, DMG's code locks mutexes, which means it
  1042. // will not scale well on multi-core machines. DMG's code is slightly
  1043. // more accurate (in that it will never use more digits than
  1044. // necessary), but this is probably irrelevant for most users.
  1045. //
  1046. // Rob Pike and Ken Thompson also have an implementation of dtoa() in
  1047. // third_party/fmt/fltfmt.cc. Their implementation is similar to this
  1048. // one in that it makes guesses and then uses strtod() to check them.
  1049. // Their implementation is faster because they use their own code to
  1050. // generate the digits in the first place rather than use snprintf(),
  1051. // thus avoiding format string parsing overhead. However, this makes
  1052. // it considerably more complicated than the following implementation,
  1053. // and it is embedded in a larger library. If speed turns out to be
  1054. // an issue, we could re-implement this in terms of their
  1055. // implementation.
  1056. // ----------------------------------------------------------------------
  1057. string SimpleDtoa(double value) {
  1058. char buffer[kDoubleToBufferSize];
  1059. return DoubleToBuffer(value, buffer);
  1060. }
  1061. string SimpleFtoa(float value) {
  1062. char buffer[kFloatToBufferSize];
  1063. return FloatToBuffer(value, buffer);
  1064. }
  1065. static inline bool IsValidFloatChar(char c) {
  1066. return ('0' <= c && c <= '9') ||
  1067. c == 'e' || c == 'E' ||
  1068. c == '+' || c == '-';
  1069. }
  1070. void DelocalizeRadix(char* buffer) {
  1071. // Fast check: if the buffer has a normal decimal point, assume no
  1072. // translation is needed.
  1073. if (strchr(buffer, '.') != NULL) return;
  1074. // Find the first unknown character.
  1075. while (IsValidFloatChar(*buffer)) ++buffer;
  1076. if (*buffer == '\0') {
  1077. // No radix character found.
  1078. return;
  1079. }
  1080. // We are now pointing at the locale-specific radix character. Replace it
  1081. // with '.'.
  1082. *buffer = '.';
  1083. ++buffer;
  1084. if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
  1085. // It appears the radix was a multi-byte character. We need to remove the
  1086. // extra bytes.
  1087. char* target = buffer;
  1088. do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
  1089. memmove(target, buffer, strlen(buffer) + 1);
  1090. }
  1091. }
  1092. char* DoubleToBuffer(double value, char* buffer) {
  1093. // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
  1094. // platforms these days. Just in case some system exists where DBL_DIG
  1095. // is significantly larger -- and risks overflowing our buffer -- we have
  1096. // this assert.
  1097. GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
  1098. if (value == numeric_limits<double>::infinity()) {
  1099. strcpy(buffer, "inf");
  1100. return buffer;
  1101. } else if (value == -numeric_limits<double>::infinity()) {
  1102. strcpy(buffer, "-inf");
  1103. return buffer;
  1104. } else if (MathLimits<double>::IsNaN(value)) {
  1105. strcpy(buffer, "nan");
  1106. return buffer;
  1107. }
  1108. int snprintf_result =
  1109. snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
  1110. // The snprintf should never overflow because the buffer is significantly
  1111. // larger than the precision we asked for.
  1112. GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
  1113. // We need to make parsed_value volatile in order to force the compiler to
  1114. // write it out to the stack. Otherwise, it may keep the value in a
  1115. // register, and if it does that, it may keep it as a long double instead
  1116. // of a double. This long double may have extra bits that make it compare
  1117. // unequal to "value" even though it would be exactly equal if it were
  1118. // truncated to a double.
  1119. volatile double parsed_value = strtod(buffer, NULL);
  1120. if (parsed_value != value) {
  1121. int snprintf_result =
  1122. snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
  1123. // Should never overflow; see above.
  1124. GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
  1125. }
  1126. DelocalizeRadix(buffer);
  1127. return buffer;
  1128. }
  1129. static int memcasecmp(const char *s1, const char *s2, size_t len) {
  1130. const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
  1131. const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
  1132. for ( int i = 0; i < len; i++ ) {
  1133. const int diff =
  1134. static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
  1135. static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
  1136. if (diff != 0) return diff;
  1137. }
  1138. return 0;
  1139. }
  1140. inline bool CaseEqual(StringPiece s1, StringPiece s2) {
  1141. if (s1.size() != s2.size()) return false;
  1142. return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
  1143. }
  1144. bool safe_strtob(StringPiece str, bool* value) {
  1145. GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
  1146. if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
  1147. CaseEqual(str, "yes") || CaseEqual(str, "y") ||
  1148. CaseEqual(str, "1")) {
  1149. *value = true;
  1150. return true;
  1151. }
  1152. if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
  1153. CaseEqual(str, "no") || CaseEqual(str, "n") ||
  1154. CaseEqual(str, "0")) {
  1155. *value = false;
  1156. return true;
  1157. }
  1158. return false;
  1159. }
  1160. bool safe_strtof(const char* str, float* value) {
  1161. char* endptr;
  1162. errno = 0; // errno only gets set on errors
  1163. #if defined(_WIN32) || defined (__hpux) // has no strtof()
  1164. *value = strtod(str, &endptr);
  1165. #else
  1166. *value = strtof(str, &endptr);
  1167. #endif
  1168. return *str != 0 && *endptr == 0 && errno == 0;
  1169. }
  1170. bool safe_strtod(const char* str, double* value) {
  1171. char* endptr;
  1172. *value = strtod(str, &endptr);
  1173. if (endptr != str) {
  1174. while (ascii_isspace(*endptr)) ++endptr;
  1175. }
  1176. // Ignore range errors from strtod. The values it
  1177. // returns on underflow and overflow are the right
  1178. // fallback in a robust setting.
  1179. return *str != '\0' && *endptr == '\0';
  1180. }
  1181. bool safe_strto32(const string& str, int32* value) {
  1182. return safe_int_internal(str, value);
  1183. }
  1184. bool safe_strtou32(const string& str, uint32* value) {
  1185. return safe_uint_internal(str, value);
  1186. }
  1187. bool safe_strto64(const string& str, int64* value) {
  1188. return safe_int_internal(str, value);
  1189. }
  1190. bool safe_strtou64(const string& str, uint64* value) {
  1191. return safe_uint_internal(str, value);
  1192. }
  1193. char* FloatToBuffer(float value, char* buffer) {
  1194. // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
  1195. // platforms these days. Just in case some system exists where FLT_DIG
  1196. // is significantly larger -- and risks overflowing our buffer -- we have
  1197. // this assert.
  1198. GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
  1199. if (value == numeric_limits<double>::infinity()) {
  1200. strcpy(buffer, "inf");
  1201. return buffer;
  1202. } else if (value == -numeric_limits<double>::infinity()) {
  1203. strcpy(buffer, "-inf");
  1204. return buffer;
  1205. } else if (MathLimits<float>::IsNaN(value)) {
  1206. strcpy(buffer, "nan");
  1207. return buffer;
  1208. }
  1209. int snprintf_result =
  1210. snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
  1211. // The snprintf should never overflow because the buffer is significantly
  1212. // larger than the precision we asked for.
  1213. GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
  1214. float parsed_value;
  1215. if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
  1216. int snprintf_result =
  1217. snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value);
  1218. // Should never overflow; see above.
  1219. GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
  1220. }
  1221. DelocalizeRadix(buffer);
  1222. return buffer;
  1223. }
  1224. namespace strings {
  1225. AlphaNum::AlphaNum(strings::Hex hex) {
  1226. char *const end = &digits[kFastToBufferSize];
  1227. char *writer = end;
  1228. uint64 value = hex.value;
  1229. uint64 width = hex.spec;
  1230. // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
  1231. // where 0x10000 is the smallest hex number that is as wide as the user
  1232. // asked for.
  1233. uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
  1234. static const char hexdigits[] = "0123456789abcdef";
  1235. do {
  1236. *--writer = hexdigits[value & 0xF];
  1237. value >>= 4;
  1238. mask >>= 4;
  1239. } while (mask != 0);
  1240. piece_data_ = writer;
  1241. piece_size_ = end - writer;
  1242. }
  1243. } // namespace strings
  1244. // ----------------------------------------------------------------------
  1245. // StrCat()
  1246. // This merges the given strings or integers, with no delimiter. This
  1247. // is designed to be the fastest possible way to construct a string out
  1248. // of a mix of raw C strings, C++ strings, and integer values.
  1249. // ----------------------------------------------------------------------
  1250. // Append is merely a version of memcpy that returns the address of the byte
  1251. // after the area just overwritten. It comes in multiple flavors to minimize
  1252. // call overhead.
  1253. static char *Append1(char *out, const AlphaNum &x) {
  1254. memcpy(out, x.data(), x.size());
  1255. return out + x.size();
  1256. }
  1257. static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
  1258. memcpy(out, x1.data(), x1.size());
  1259. out += x1.size();
  1260. memcpy(out, x2.data(), x2.size());
  1261. return out + x2.size();
  1262. }
  1263. static char *Append4(char *out,
  1264. const AlphaNum &x1, const AlphaNum &x2,
  1265. const AlphaNum &x3, const AlphaNum &x4) {
  1266. memcpy(out, x1.data(), x1.size());
  1267. out += x1.size();
  1268. memcpy(out, x2.data(), x2.size());
  1269. out += x2.size();
  1270. memcpy(out, x3.data(), x3.size());
  1271. out += x3.size();
  1272. memcpy(out, x4.data(), x4.size());
  1273. return out + x4.size();
  1274. }
  1275. string StrCat(const AlphaNum &a, const AlphaNum &b) {
  1276. string result;
  1277. result.resize(a.size() + b.size());
  1278. char *const begin = &*result.begin();
  1279. char *out = Append2(begin, a, b);
  1280. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1281. return result;
  1282. }
  1283. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
  1284. string result;
  1285. result.resize(a.size() + b.size() + c.size());
  1286. char *const begin = &*result.begin();
  1287. char *out = Append2(begin, a, b);
  1288. out = Append1(out, c);
  1289. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1290. return result;
  1291. }
  1292. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1293. const AlphaNum &d) {
  1294. string result;
  1295. result.resize(a.size() + b.size() + c.size() + d.size());
  1296. char *const begin = &*result.begin();
  1297. char *out = Append4(begin, a, b, c, d);
  1298. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1299. return result;
  1300. }
  1301. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1302. const AlphaNum &d, const AlphaNum &e) {
  1303. string result;
  1304. result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
  1305. char *const begin = &*result.begin();
  1306. char *out = Append4(begin, a, b, c, d);
  1307. out = Append1(out, e);
  1308. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1309. return result;
  1310. }
  1311. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1312. const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
  1313. string result;
  1314. result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
  1315. f.size());
  1316. char *const begin = &*result.begin();
  1317. char *out = Append4(begin, a, b, c, d);
  1318. out = Append2(out, e, f);
  1319. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1320. return result;
  1321. }
  1322. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1323. const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
  1324. const AlphaNum &g) {
  1325. string result;
  1326. result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
  1327. f.size() + g.size());
  1328. char *const begin = &*result.begin();
  1329. char *out = Append4(begin, a, b, c, d);
  1330. out = Append2(out, e, f);
  1331. out = Append1(out, g);
  1332. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1333. return result;
  1334. }
  1335. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1336. const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
  1337. const AlphaNum &g, const AlphaNum &h) {
  1338. string result;
  1339. result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
  1340. f.size() + g.size() + h.size());
  1341. char *const begin = &*result.begin();
  1342. char *out = Append4(begin, a, b, c, d);
  1343. out = Append4(out, e, f, g, h);
  1344. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1345. return result;
  1346. }
  1347. string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
  1348. const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
  1349. const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
  1350. string result;
  1351. result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
  1352. f.size() + g.size() + h.size() + i.size());
  1353. char *const begin = &*result.begin();
  1354. char *out = Append4(begin, a, b, c, d);
  1355. out = Append4(out, e, f, g, h);
  1356. out = Append1(out, i);
  1357. GOOGLE_DCHECK_EQ(out, begin + result.size());
  1358. return result;
  1359. }
  1360. // It's possible to call StrAppend with a char * pointer that is partway into
  1361. // the string we're appending to. However the results of this are random.
  1362. // Therefore, check for this in debug mode. Use unsigned math so we only have
  1363. // to do one comparison.
  1364. #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
  1365. GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
  1366. uintptr_t((dest).size()))
  1367. void StrAppend(string *result, const AlphaNum &a) {
  1368. GOOGLE_DCHECK_NO_OVERLAP(*result, a);
  1369. result->append(a.data(), a.size());
  1370. }
  1371. void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
  1372. GOOGLE_DCHECK_NO_OVERLAP(*result, a);
  1373. GOOGLE_DCHECK_NO_OVERLAP(*result, b);
  1374. string::size_type old_size = result->size();
  1375. result->resize(old_size + a.size() + b.size());
  1376. char *const begin = &*result->begin();
  1377. char *out = Append2(begin + old_size, a, b);
  1378. GOOGLE_DCHECK_EQ(out, begin + result->size());
  1379. }
  1380. void StrAppend(string *result,
  1381. const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
  1382. GOOGLE_DCHECK_NO_OVERLAP(*result, a);
  1383. GOOGLE_DCHECK_NO_OVERLAP(*result, b);
  1384. GOOGLE_DCHECK_NO_OVERLAP(*result, c);
  1385. string::size_type old_size = result->size();
  1386. result->resize(old_size + a.size() + b.size() + c.size());
  1387. char *const begin = &*result->begin();
  1388. char *out = Append2(begin + old_size, a, b);
  1389. out = Append1(out, c);
  1390. GOOGLE_DCHECK_EQ(out, begin + result->size());
  1391. }
  1392. void StrAppend(string *result,
  1393. const AlphaNum &a, const AlphaNum &b,
  1394. const AlphaNum &c, const AlphaNum &d) {
  1395. GOOGLE_DCHECK_NO_OVERLAP(*result, a);
  1396. GOOGLE_DCHECK_NO_OVERLAP(*result, b);
  1397. GOOGLE_DCHECK_NO_OVERLAP(*result, c);
  1398. GOOGLE_DCHECK_NO_OVERLAP(*result, d);
  1399. string::size_type old_size = result->size();
  1400. result->resize(old_size + a.size() + b.size() + c.size() + d.size());
  1401. char *const begin = &*result->begin();
  1402. char *out = Append4(begin + old_size, a, b, c, d);
  1403. GOOGLE_DCHECK_EQ(out, begin + result->size());
  1404. }
  1405. int GlobalReplaceSubstring(const string& substring,
  1406. const string& replacement,
  1407. string* s) {
  1408. GOOGLE_CHECK(s != NULL);
  1409. if (s->empty() || substring.empty())
  1410. return 0;
  1411. string tmp;
  1412. int num_replacements = 0;
  1413. int pos = 0;
  1414. for (int match_pos = s->find(substring.data(), pos, substring.length());
  1415. match_pos != string::npos;
  1416. pos = match_pos + substring.length(),
  1417. match_pos = s->find(substring.data(), pos, substring.length())) {
  1418. ++num_replacements;
  1419. // Append the original content before the match.
  1420. tmp.append(*s, pos, match_pos - pos);
  1421. // Append the replacement for the match.
  1422. tmp.append(replacement.begin(), replacement.end());
  1423. }
  1424. // Append the content after the last match. If no replacements were made, the
  1425. // original string is left untouched.
  1426. if (num_replacements > 0) {
  1427. tmp.append(*s, pos, s->length() - pos);
  1428. s->swap(tmp);
  1429. }
  1430. return num_replacements;
  1431. }
  1432. int CalculateBase64EscapedLen(int input_len, bool do_padding) {
  1433. // Base64 encodes three bytes of input at a time. If the input is not
  1434. // divisible by three, we pad as appropriate.
  1435. //
  1436. // (from http://tools.ietf.org/html/rfc3548)
  1437. // Special processing is performed if fewer than 24 bits are available
  1438. // at the end of the data being encoded. A full encoding quantum is
  1439. // always completed at the end of a quantity. When fewer than 24 input
  1440. // bits are available in an input group, zero bits are added (on the
  1441. // right) to form an integral number of 6-bit groups. Padding at the
  1442. // end of the data is performed using the '=' character. Since all base
  1443. // 64 input is an integral number of octets, only the following cases
  1444. // can arise:
  1445. // Base64 encodes each three bytes of input into four bytes of output.
  1446. int len = (input_len / 3) * 4;
  1447. if (input_len % 3 == 0) {
  1448. // (from http://tools.ietf.org/html/rfc3548)
  1449. // (1) the final quantum of encoding input is an integral multiple of 24
  1450. // bits; here, the final unit of encoded output will be an integral
  1451. // multiple of 4 characters with no "=" padding,
  1452. } else if (input_len % 3 == 1) {
  1453. // (from http://tools.ietf.org/html/rfc3548)
  1454. // (2) the final quantum of encoding input is exactly 8 bits; here, the
  1455. // final unit of encoded output will be two characters followed by two
  1456. // "=" padding characters, or
  1457. len += 2;
  1458. if (do_padding) {
  1459. len += 2;
  1460. }
  1461. } else { // (input_len % 3 == 2)
  1462. // (from http://tools.ietf.org/html/rfc3548)
  1463. // (3) the final quantum of encoding input is exactly 16 bits; here, the
  1464. // final unit of encoded output will be three characters followed by one
  1465. // "=" padding character.
  1466. len += 3;
  1467. if (do_padding) {
  1468. len += 1;
  1469. }
  1470. }
  1471. assert(len >= input_len); // make sure we didn't overflow
  1472. return len;
  1473. }
  1474. // Base64Escape does padding, so this calculation includes padding.
  1475. int CalculateBase64EscapedLen(int input_len) {
  1476. return CalculateBase64EscapedLen(input_len, true);
  1477. }
  1478. // ----------------------------------------------------------------------
  1479. // int Base64Unescape() - base64 decoder
  1480. // int Base64Escape() - base64 encoder
  1481. // int WebSafeBase64Unescape() - Google's variation of base64 decoder
  1482. // int WebSafeBase64Escape() - Google's variation of base64 encoder
  1483. //
  1484. // Check out
  1485. // http://tools.ietf.org/html/rfc2045 for formal description, but what we
  1486. // care about is that...
  1487. // Take the encoded stuff in groups of 4 characters and turn each
  1488. // character into a code 0 to 63 thus:
  1489. // A-Z map to 0 to 25
  1490. // a-z map to 26 to 51
  1491. // 0-9 map to 52 to 61
  1492. // +(- for WebSafe) maps to 62
  1493. // /(_ for WebSafe) maps to 63
  1494. // There will be four numbers, all less than 64 which can be represented
  1495. // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
  1496. // Arrange the 6 digit binary numbers into three bytes as such:
  1497. // aaaaaabb bbbbcccc ccdddddd
  1498. // Equals signs (one or two) are used at the end of the encoded block to
  1499. // indicate that the text was not an integer multiple of three bytes long.
  1500. // ----------------------------------------------------------------------
  1501. int Base64UnescapeInternal(const char *src_param, int szsrc,
  1502. char *dest, int szdest,
  1503. const signed char* unbase64) {
  1504. static const char kPad64Equals = '=';
  1505. static const char kPad64Dot = '.';
  1506. int decode = 0;
  1507. int destidx = 0;
  1508. int state = 0;
  1509. unsigned int ch = 0;
  1510. unsigned int temp = 0;
  1511. // If "char" is signed by default, using *src as an array index results in
  1512. // accessing negative array elements. Treat the input as a pointer to
  1513. // unsigned char to avoid this.
  1514. const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
  1515. // The GET_INPUT macro gets the next input character, skipping
  1516. // over any whitespace, and stopping when we reach the end of the
  1517. // string or when we read any non-data character. The arguments are
  1518. // an arbitrary identifier (used as a label for goto) and the number
  1519. // of data bytes that must remain in the input to avoid aborting the
  1520. // loop.
  1521. #define GET_INPUT(label, remain) \
  1522. label: \
  1523. --szsrc; \
  1524. ch = *src++; \
  1525. decode = unbase64[ch]; \
  1526. if (decode < 0) { \
  1527. if (ascii_isspace(ch) && szsrc >= remain) \
  1528. goto label; \
  1529. state = 4 - remain; \
  1530. break; \
  1531. }
  1532. // if dest is null, we're just checking to see if it's legal input
  1533. // rather than producing output. (I suspect this could just be done
  1534. // with a regexp...). We duplicate the loop so this test can be
  1535. // outside it instead of in every iteration.
  1536. if (dest) {
  1537. // This loop consumes 4 input bytes and produces 3 output bytes
  1538. // per iteration. We can't know at the start that there is enough
  1539. // data left in the string for a full iteration, so the loop may
  1540. // break out in the middle; if so 'state' will be set to the
  1541. // number of input bytes read.
  1542. while (szsrc >= 4) {
  1543. // We'll start by optimistically assuming that the next four
  1544. // bytes of the string (src[0..3]) are four good data bytes
  1545. // (that is, no nulls, whitespace, padding chars, or illegal
  1546. // chars). We need to test src[0..2] for nulls individually
  1547. // before constructing temp to preserve the property that we
  1548. // never read past a null in the string (no matter how long
  1549. // szsrc claims the string is).
  1550. if (!src[0] || !src[1] || !src[2] ||
  1551. (temp = ((unsigned(unbase64[src[0]]) << 18) |
  1552. (unsigned(unbase64[src[1]]) << 12) |
  1553. (unsigned(unbase64[src[2]]) << 6) |
  1554. (unsigned(unbase64[src[3]])))) & 0x80000000) {
  1555. // Iff any of those four characters was bad (null, illegal,
  1556. // whitespace, padding), then temp's high bit will be set
  1557. // (because unbase64[] is -1 for all bad characters).
  1558. //
  1559. // We'll back up and resort to the slower decoder, which knows
  1560. // how to handle those cases.
  1561. GET_INPUT(first, 4);
  1562. temp = decode;
  1563. GET_INPUT(second, 3);
  1564. temp = (temp << 6) | decode;
  1565. GET_INPUT(third, 2);
  1566. temp = (temp << 6) | decode;
  1567. GET_INPUT(fourth, 1);
  1568. temp = (temp << 6) | decode;
  1569. } else {
  1570. // We really did have four good data bytes, so advance four
  1571. // characters in the string.
  1572. szsrc -= 4;
  1573. src += 4;
  1574. decode = -1;
  1575. ch = '\0';
  1576. }
  1577. // temp has 24 bits of input, so write that out as three bytes.
  1578. if (destidx+3 > szdest) return -1;
  1579. dest[destidx+2] = temp;
  1580. temp >>= 8;
  1581. dest[destidx+1] = temp;
  1582. temp >>= 8;
  1583. dest[destidx] = temp;
  1584. destidx += 3;
  1585. }
  1586. } else {
  1587. while (szsrc >= 4) {
  1588. if (!src[0] || !src[1] || !src[2] ||
  1589. (temp = ((unsigned(unbase64[src[0]]) << 18) |
  1590. (unsigned(unbase64[src[1]]) << 12) |
  1591. (unsigned(unbase64[src[2]]) << 6) |
  1592. (unsigned(unbase64[src[3]])))) & 0x80000000) {
  1593. GET_INPUT(first_no_dest, 4);
  1594. GET_INPUT(second_no_dest, 3);
  1595. GET_INPUT(third_no_dest, 2);
  1596. GET_INPUT(fourth_no_dest, 1);
  1597. } else {
  1598. szsrc -= 4;
  1599. src += 4;
  1600. decode = -1;
  1601. ch = '\0';
  1602. }
  1603. destidx += 3;
  1604. }
  1605. }
  1606. #undef GET_INPUT
  1607. // if the loop terminated because we read a bad character, return
  1608. // now.
  1609. if (decode < 0 && ch != '\0' &&
  1610. ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
  1611. return -1;
  1612. if (ch == kPad64Equals || ch == kPad64Dot) {
  1613. // if we stopped by hitting an '=' or '.', un-read that character -- we'll
  1614. // look at it again when we count to check for the proper number of
  1615. // equals signs at the end.
  1616. ++szsrc;
  1617. --src;
  1618. } else {
  1619. // This loop consumes 1 input byte per iteration. It's used to
  1620. // clean up the 0-3 input bytes remaining when the first, faster
  1621. // loop finishes. 'temp' contains the data from 'state' input
  1622. // characters read by the first loop.
  1623. while (szsrc > 0) {
  1624. --szsrc;
  1625. ch = *src++;
  1626. decode = unbase64[ch];
  1627. if (decode < 0) {
  1628. if (ascii_isspace(ch)) {
  1629. continue;
  1630. } else if (ch == '\0') {
  1631. break;
  1632. } else if (ch == kPad64Equals || ch == kPad64Dot) {
  1633. // back up one character; we'll read it again when we check
  1634. // for the correct number of pad characters at the end.
  1635. ++szsrc;
  1636. --src;
  1637. break;
  1638. } else {
  1639. return -1;
  1640. }
  1641. }
  1642. // Each input character gives us six bits of output.
  1643. temp = (temp << 6) | decode;
  1644. ++state;
  1645. if (state == 4) {
  1646. // If we've accumulated 24 bits of output, write that out as
  1647. // three bytes.
  1648. if (dest) {
  1649. if (destidx+3 > szdest) return -1;
  1650. dest[destidx+2] = temp;
  1651. temp >>= 8;
  1652. dest[destidx+1] = temp;
  1653. temp >>= 8;
  1654. dest[destidx] = temp;
  1655. }
  1656. destidx += 3;
  1657. state = 0;
  1658. temp = 0;
  1659. }
  1660. }
  1661. }
  1662. // Process the leftover data contained in 'temp' at the end of the input.
  1663. int expected_equals = 0;
  1664. switch (state) {
  1665. case 0:
  1666. // Nothing left over; output is a multiple of 3 bytes.
  1667. break;
  1668. case 1:
  1669. // Bad input; we have 6 bits left over.
  1670. return -1;
  1671. case 2:
  1672. // Produce one more output byte from the 12 input bits we have left.
  1673. if (dest) {
  1674. if (destidx+1 > szdest) return -1;
  1675. temp >>= 4;
  1676. dest[destidx] = temp;
  1677. }
  1678. ++destidx;
  1679. expected_equals = 2;
  1680. break;
  1681. case 3:
  1682. // Produce two more output bytes from the 18 input bits we have left.
  1683. if (dest) {
  1684. if (destidx+2 > szdest) return -1;
  1685. temp >>= 2;
  1686. dest[destidx+1] = temp;
  1687. temp >>= 8;
  1688. dest[destidx] = temp;
  1689. }
  1690. destidx += 2;
  1691. expected_equals = 1;
  1692. break;
  1693. default:
  1694. // state should have no other values at this point.
  1695. GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
  1696. }
  1697. // The remainder of the string should be all whitespace, mixed with
  1698. // exactly 0 equals signs, or exactly 'expected_equals' equals
  1699. // signs. (Always accepting 0 equals signs is a google extension
  1700. // not covered in the RFC, as is accepting dot as the pad character.)
  1701. int equals = 0;
  1702. while (szsrc > 0 && *src) {
  1703. if (*src == kPad64Equals || *src == kPad64Dot)
  1704. ++equals;
  1705. else if (!ascii_isspace(*src))
  1706. return -1;
  1707. --szsrc;
  1708. ++src;
  1709. }
  1710. return (equals == 0 || equals == expected_equals) ? destidx : -1;
  1711. }
  1712. // The arrays below were generated by the following code
  1713. // #include <sys/time.h>
  1714. // #include <stdlib.h>
  1715. // #include <string.h>
  1716. // main()
  1717. // {
  1718. // static const char Base64[] =
  1719. // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  1720. // char *pos;
  1721. // int idx, i, j;
  1722. // printf(" ");
  1723. // for (i = 0; i < 255; i += 8) {
  1724. // for (j = i; j < i + 8; j++) {
  1725. // pos = strchr(Base64, j);
  1726. // if ((pos == NULL) || (j == 0))
  1727. // idx = -1;
  1728. // else
  1729. // idx = pos - Base64;
  1730. // if (idx == -1)
  1731. // printf(" %2d, ", idx);
  1732. // else
  1733. // printf(" %2d/*%c*/,", idx, j);
  1734. // }
  1735. // printf("\n ");
  1736. // }
  1737. // }
  1738. //
  1739. // where the value of "Base64[]" was replaced by one of the base-64 conversion
  1740. // tables from the functions below.
  1741. static const signed char kUnBase64[] = {
  1742. -1, -1, -1, -1, -1, -1, -1, -1,
  1743. -1, -1, -1, -1, -1, -1, -1, -1,
  1744. -1, -1, -1, -1, -1, -1, -1, -1,
  1745. -1, -1, -1, -1, -1, -1, -1, -1,
  1746. -1, -1, -1, -1, -1, -1, -1, -1,
  1747. -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
  1748. 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  1749. 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
  1750. -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
  1751. 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  1752. 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  1753. 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
  1754. -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  1755. 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  1756. 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  1757. 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
  1758. -1, -1, -1, -1, -1, -1, -1, -1,
  1759. -1, -1, -1, -1, -1, -1, -1, -1,
  1760. -1, -1, -1, -1, -1, -1, -1, -1,
  1761. -1, -1, -1, -1, -1, -1, -1, -1,
  1762. -1, -1, -1, -1, -1, -1, -1, -1,
  1763. -1, -1, -1, -1, -1, -1, -1, -1,
  1764. -1, -1, -1, -1, -1, -1, -1, -1,
  1765. -1, -1, -1, -1, -1, -1, -1, -1,
  1766. -1, -1, -1, -1, -1, -1, -1, -1,
  1767. -1, -1, -1, -1, -1, -1, -1, -1,
  1768. -1, -1, -1, -1, -1, -1, -1, -1,
  1769. -1, -1, -1, -1, -1, -1, -1, -1,
  1770. -1, -1, -1, -1, -1, -1, -1, -1,
  1771. -1, -1, -1, -1, -1, -1, -1, -1,
  1772. -1, -1, -1, -1, -1, -1, -1, -1,
  1773. -1, -1, -1, -1, -1, -1, -1, -1
  1774. };
  1775. static const signed char kUnWebSafeBase64[] = {
  1776. -1, -1, -1, -1, -1, -1, -1, -1,
  1777. -1, -1, -1, -1, -1, -1, -1, -1,
  1778. -1, -1, -1, -1, -1, -1, -1, -1,
  1779. -1, -1, -1, -1, -1, -1, -1, -1,
  1780. -1, -1, -1, -1, -1, -1, -1, -1,
  1781. -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
  1782. 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  1783. 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
  1784. -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
  1785. 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  1786. 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  1787. 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
  1788. -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  1789. 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  1790. 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  1791. 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
  1792. -1, -1, -1, -1, -1, -1, -1, -1,
  1793. -1, -1, -1, -1, -1, -1, -1, -1,
  1794. -1, -1, -1, -1, -1, -1, -1, -1,
  1795. -1, -1, -1, -1, -1, -1, -1, -1,
  1796. -1, -1, -1, -1, -1, -1, -1, -1,
  1797. -1, -1, -1, -1, -1, -1, -1, -1,
  1798. -1, -1, -1, -1, -1, -1, -1, -1,
  1799. -1, -1, -1, -1, -1, -1, -1, -1,
  1800. -1, -1, -1, -1, -1, -1, -1, -1,
  1801. -1, -1, -1, -1, -1, -1, -1, -1,
  1802. -1, -1, -1, -1, -1, -1, -1, -1,
  1803. -1, -1, -1, -1, -1, -1, -1, -1,
  1804. -1, -1, -1, -1, -1, -1, -1, -1,
  1805. -1, -1, -1, -1, -1, -1, -1, -1,
  1806. -1, -1, -1, -1, -1, -1, -1, -1,
  1807. -1, -1, -1, -1, -1, -1, -1, -1
  1808. };
  1809. int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
  1810. return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
  1811. }
  1812. static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
  1813. const signed char* unbase64) {
  1814. // Determine the size of the output string. Base64 encodes every 3 bytes into
  1815. // 4 characters. any leftover chars are added directly for good measure.
  1816. // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
  1817. const int dest_len = 3 * (slen / 4) + (slen % 4);
  1818. dest->resize(dest_len);
  1819. // We are getting the destination buffer by getting the beginning of the
  1820. // string and converting it into a char *.
  1821. const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
  1822. dest_len, unbase64);
  1823. if (len < 0) {
  1824. dest->clear();
  1825. return false;
  1826. }
  1827. // could be shorter if there was padding
  1828. GOOGLE_DCHECK_LE(len, dest_len);
  1829. dest->erase(len);
  1830. return true;
  1831. }
  1832. bool Base64Unescape(StringPiece src, string* dest) {
  1833. return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
  1834. }
  1835. bool WebSafeBase64Unescape(StringPiece src, string* dest) {
  1836. return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
  1837. }
  1838. int Base64EscapeInternal(const unsigned char *src, int szsrc,
  1839. char *dest, int szdest, const char *base64,
  1840. bool do_padding) {
  1841. static const char kPad64 = '=';
  1842. if (szsrc <= 0) return 0;
  1843. if (szsrc * 4 > szdest * 3) return 0;
  1844. char *cur_dest = dest;
  1845. const unsigned char *cur_src = src;
  1846. char *limit_dest = dest + szdest;
  1847. const unsigned char *limit_src = src + szsrc;
  1848. // Three bytes of data encodes to four characters of cyphertext.
  1849. // So we can pump through three-byte chunks atomically.
  1850. while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
  1851. uint32 in = BigEndian::Load32(cur_src) >> 8;
  1852. cur_dest[0] = base64[in >> 18];
  1853. in &= 0x3FFFF;
  1854. cur_dest[1] = base64[in >> 12];
  1855. in &= 0xFFF;
  1856. cur_dest[2] = base64[in >> 6];
  1857. in &= 0x3F;
  1858. cur_dest[3] = base64[in];
  1859. cur_dest += 4;
  1860. cur_src += 3;
  1861. }
  1862. // To save time, we didn't update szdest or szsrc in the loop. So do it now.
  1863. szdest = limit_dest - cur_dest;
  1864. szsrc = limit_src - cur_src;
  1865. /* now deal with the tail (<=3 bytes) */
  1866. switch (szsrc) {
  1867. case 0:
  1868. // Nothing left; nothing more to do.
  1869. break;
  1870. case 1: {
  1871. // One byte left: this encodes to two characters, and (optionally)
  1872. // two pad characters to round out the four-character cypherblock.
  1873. if ((szdest -= 2) < 0) return 0;
  1874. uint32 in = cur_src[0];
  1875. cur_dest[0] = base64[in >> 2];
  1876. in &= 0x3;
  1877. cur_dest[1] = base64[in << 4];
  1878. cur_dest += 2;
  1879. if (do_padding) {
  1880. if ((szdest -= 2) < 0) return 0;
  1881. cur_dest[0] = kPad64;
  1882. cur_dest[1] = kPad64;
  1883. cur_dest += 2;
  1884. }
  1885. break;
  1886. }
  1887. case 2: {
  1888. // Two bytes left: this encodes to three characters, and (optionally)
  1889. // one pad character to round out the four-character cypherblock.
  1890. if ((szdest -= 3) < 0) return 0;
  1891. uint32 in = BigEndian::Load16(cur_src);
  1892. cur_dest[0] = base64[in >> 10];
  1893. in &= 0x3FF;
  1894. cur_dest[1] = base64[in >> 4];
  1895. in &= 0x00F;
  1896. cur_dest[2] = base64[in << 2];
  1897. cur_dest += 3;
  1898. if (do_padding) {
  1899. if ((szdest -= 1) < 0) return 0;
  1900. cur_dest[0] = kPad64;
  1901. cur_dest += 1;
  1902. }
  1903. break;
  1904. }
  1905. case 3: {
  1906. // Three bytes left: same as in the big loop above. We can't do this in
  1907. // the loop because the loop above always reads 4 bytes, and the fourth
  1908. // byte is past the end of the input.
  1909. if ((szdest -= 4) < 0) return 0;
  1910. uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
  1911. cur_dest[0] = base64[in >> 18];
  1912. in &= 0x3FFFF;
  1913. cur_dest[1] = base64[in >> 12];
  1914. in &= 0xFFF;
  1915. cur_dest[2] = base64[in >> 6];
  1916. in &= 0x3F;
  1917. cur_dest[3] = base64[in];
  1918. cur_dest += 4;
  1919. break;
  1920. }
  1921. default:
  1922. // Should not be reached: blocks of 4 bytes are handled
  1923. // in the while loop before this switch statement.
  1924. GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
  1925. break;
  1926. }
  1927. return (cur_dest - dest);
  1928. }
  1929. static const char kBase64Chars[] =
  1930. "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  1931. static const char kWebSafeBase64Chars[] =
  1932. "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  1933. int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
  1934. return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
  1935. }
  1936. int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
  1937. int szdest, bool do_padding) {
  1938. return Base64EscapeInternal(src, szsrc, dest, szdest,
  1939. kWebSafeBase64Chars, do_padding);
  1940. }
  1941. void Base64EscapeInternal(const unsigned char* src, int szsrc,
  1942. string* dest, bool do_padding,
  1943. const char* base64_chars) {
  1944. const int calc_escaped_size =
  1945. CalculateBase64EscapedLen(szsrc, do_padding);
  1946. dest->resize(calc_escaped_size);
  1947. const int escaped_len = Base64EscapeInternal(src, szsrc,
  1948. string_as_array(dest),
  1949. dest->size(),
  1950. base64_chars,
  1951. do_padding);
  1952. GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
  1953. dest->erase(escaped_len);
  1954. }
  1955. void Base64Escape(const unsigned char *src, int szsrc,
  1956. string* dest, bool do_padding) {
  1957. Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
  1958. }
  1959. void WebSafeBase64Escape(const unsigned char *src, int szsrc,
  1960. string *dest, bool do_padding) {
  1961. Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
  1962. }
  1963. void Base64Escape(StringPiece src, string* dest) {
  1964. Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
  1965. src.size(), dest, true);
  1966. }
  1967. void WebSafeBase64Escape(StringPiece src, string* dest) {
  1968. WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
  1969. src.size(), dest, false);
  1970. }
  1971. void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
  1972. WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
  1973. src.size(), dest, true);
  1974. }
  1975. // Helper to append a Unicode code point to a string as UTF8, without bringing
  1976. // in any external dependencies.
  1977. int EncodeAsUTF8Char(uint32 code_point, char* output) {
  1978. uint32 tmp = 0;
  1979. int len = 0;
  1980. if (code_point <= 0x7f) {
  1981. tmp = code_point;
  1982. len = 1;
  1983. } else if (code_point <= 0x07ff) {
  1984. tmp = 0x0000c080 |
  1985. ((code_point & 0x07c0) << 2) |
  1986. (code_point & 0x003f);
  1987. len = 2;
  1988. } else if (code_point <= 0xffff) {
  1989. tmp = 0x00e08080 |
  1990. ((code_point & 0xf000) << 4) |
  1991. ((code_point & 0x0fc0) << 2) |
  1992. (code_point & 0x003f);
  1993. len = 3;
  1994. } else {
  1995. // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
  1996. // normally only defined up to there as well.
  1997. tmp = 0xf0808080 |
  1998. ((code_point & 0x1c0000) << 6) |
  1999. ((code_point & 0x03f000) << 4) |
  2000. ((code_point & 0x000fc0) << 2) |
  2001. (code_point & 0x003f);
  2002. len = 4;
  2003. }
  2004. tmp = ghtonl(tmp);
  2005. memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
  2006. return len;
  2007. }
  2008. // Table of UTF-8 character lengths, based on first byte
  2009. static const unsigned char kUTF8LenTbl[256] = {
  2010. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2011. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2012. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2013. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2014. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2015. 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2016. 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2017. 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
  2018. };
  2019. // Return length of a single UTF-8 source character
  2020. int UTF8FirstLetterNumBytes(const char* src, int len) {
  2021. if (len == 0) {
  2022. return 0;
  2023. }
  2024. return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
  2025. }
  2026. } // namespace protobuf
  2027. } // namespace google