/* xmltok.c */
  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. #include <stddef.h>
  5. #include <xml/details/expat/config.h>
  6. #include <xml/details/expat/expat_external.h>
  7. #include <xml/details/expat/internal.h>
  8. #include <xml/details/expat/xmltok.h>
  9. #include <xml/details/expat/nametab.h>
/* Extra entry appended to the first brace group of VTABLE1: present
   only when XML_DTD is defined, otherwise it expands to nothing. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Positional initializer list used at the start of every encoding
   table in this file.  Each name is formed with the PREFIX() macro in
   effect at the point of use, so the same list selects a different set
   of functions for each instantiation of the tokenizer. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two converters named through PREFIX(). */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  29. #define UCS2_GET_NAMING(pages, hi, lo) \
  30. (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  31. /* A 2 byte UTF-8 representation splits the characters 11 bits between
  32. the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
  33. pages, 3 bits to add to that index and 5 bits to generate the mask.
  34. */
  35. #define UTF8_GET_NAMING2(pages, byte) \
  36. (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  37. + ((((byte)[0]) & 3) << 1) \
  38. + ((((byte)[1]) >> 5) & 1)] \
  39. & (1 << (((byte)[1]) & 0x1F)))
  40. /* A 3 byte UTF-8 representation splits the characters 16 bits between
  41. the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
  42. into pages, 3 bits to add to that index and 5 bits to generate the
  43. mask.
  44. */
  45. #define UTF8_GET_NAMING3(pages, byte) \
  46. (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  47. + ((((byte)[1]) >> 2) & 0xF)] \
  48. << 3) \
  49. + ((((byte)[1]) & 3) << 1) \
  50. + ((((byte)[2]) >> 5) & 1)] \
  51. & (1 << (((byte)[2]) & 0x1F)))
  52. #define UTF8_GET_NAMING(pages, p, n) \
  53. ((n) == 2 \
  54. ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  55. : ((n) == 3 \
  56. ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
  57. : 0))
  58. /* Detection of invalid UTF-8 sequences is based on Table 3.1B
  59. of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
  60. with the additional restriction of not allowing the Unicode
  61. code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
  62. Implementation details:
  63. (A & 0x80) == 0 means A < 0x80
  64. and
  65. (A & 0xC0) == 0xC0 means A > 0xBF
  66. */
  67. #define UTF8_INVALID2(p) \
  68. ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
  69. #define UTF8_INVALID3(p) \
  70. (((p)[2] & 0x80) == 0 \
  71. || \
  72. ((*p) == 0xEF && (p)[1] == 0xBF \
  73. ? \
  74. (p)[2] > 0xBD \
  75. : \
  76. ((p)[2] & 0xC0) == 0xC0) \
  77. || \
  78. ((*p) == 0xE0 \
  79. ? \
  80. (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
  81. : \
  82. ((p)[1] & 0x80) == 0 \
  83. || \
  84. ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
  85. #define UTF8_INVALID4(p) \
  86. (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  87. || \
  88. ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  89. || \
  90. ((*p) == 0xF0 \
  91. ? \
  92. (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
  93. : \
  94. ((p)[1] & 0x80) == 0 \
  95. || \
  96. ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  97. static int PTRFASTCALL
  98. isNever(const ENCODING *enc, const char *p)
  99. {
  100. UNUSED(enc);
  101. UNUSED(p);
  102. return 0;
  103. }
  104. static int PTRFASTCALL
  105. utf8_isName2(const ENCODING *enc, const char *p)
  106. {
  107. UNUSED(enc);
  108. return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
  109. }
  110. static int PTRFASTCALL
  111. utf8_isName3(const ENCODING *enc, const char *p)
  112. {
  113. UNUSED(enc);
  114. return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  115. }
  116. #define utf8_isName4 isNever
  117. static int PTRFASTCALL
  118. utf8_isNmstrt2(const ENCODING *enc, const char *p)
  119. {
  120. UNUSED(enc);
  121. return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  122. }
  123. static int PTRFASTCALL
  124. utf8_isNmstrt3(const ENCODING *enc, const char *p)
  125. {
  126. UNUSED(enc);
  127. return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  128. }
  129. #define utf8_isNmstrt4 isNever
  130. static int PTRFASTCALL
  131. utf8_isInvalid2(const ENCODING *enc, const char *p)
  132. {
  133. UNUSED(enc);
  134. return UTF8_INVALID2((const unsigned char *)p);
  135. }
  136. static int PTRFASTCALL
  137. utf8_isInvalid3(const ENCODING *enc, const char *p)
  138. {
  139. UNUSED(enc);
  140. return UTF8_INVALID3((const unsigned char *)p);
  141. }
  142. static int PTRFASTCALL
  143. utf8_isInvalid4(const ENCODING *enc, const char *p)
  144. {
  145. UNUSED(enc);
  146. return UTF8_INVALID4((const unsigned char *)p);
  147. }
/* An ENCODING extended with a per-byte type table and the callbacks
   reached through the IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR
   macros of this file.  enc is the first member; AS_NORMAL_ENCODING()
   casts an ENCODING pointer to this struct.  The tables in this file
   initialize the members positionally, so their order must not change. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each of the 256 byte values (see SB_BYTE_TYPE). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Indirect accessors that exist only in the XML_MIN_SIZE build;
     initialized by STANDARD_VTABLE(). */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name, name-start and invalid-sequence tests for 2-, 3- and 4-byte
     sequences; initialized by NORMAL_VTABLE() or ZERO_VTABLE. */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
/* View an ENCODING pointer as a pointer to struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))

#ifdef XML_MIN_SIZE
/* Initializers for the five XML_MIN_SIZE-only members.  E is an
   identifier prefix (for example sb_) pasted onto each name.  The
   expansion ends with a comma so another initializer list can follow. */
#define STANDARD_VTABLE(E) \
  E ## byteType, \
  E ## isNameMin, \
  E ## isNmstrtMin, \
  E ## byteToAscii, \
  E ## charMatches,
#define ZERO_VTABLE /* as nothing */
#else
#define STANDARD_VTABLE(E) /* as nothing */
/* Nine zero entries, the same count as NORMAL_VTABLE() supplies. */
#define ZERO_VTABLE \
  0, \
  0, \
  0, \
  0, \
  0, \
  0, \
  0, \
  0, \
  0
#endif

/* Initializers for the nine multi-byte callbacks, in member order. */
#define NORMAL_VTABLE(E) \
  E ## isName2, \
  E ## isName3, \
  E ## isName4, \
  E ## isNmstrt2, \
  E ## isNmstrt3, \
  E ## isNmstrt4, \
  E ## isInvalid2, \
  E ## isInvalid3, \
  E ## isInvalid4
/* Forward declaration, placed ahead of the implementation headers. */
static int FASTCALL checkCharRefNumber(int);

#include <xml/details/expat/xmltok_impl.h>
#include <xml/details/expat/ascii.h>

#ifdef XML_MIN_SIZE
/* The sb_ prefix has no name tests of its own for these two slots;
   both resolve to the always-false predicate. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* Read the value from the encoding at run time. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
  213. #define SB_BYTE_TYPE(enc, p) \
  214. (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  215. #ifdef XML_MIN_SIZE
  216. static int PTRFASTCALL
  217. sb_byteType(const ENCODING *enc, const char *p)
  218. {
  219. return SB_BYTE_TYPE(enc, p);
  220. }
  221. #define BYTE_TYPE(enc, p) \
  222. (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
  223. #else
  224. #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  225. #endif
  226. #ifdef XML_MIN_SIZE
  227. #define BYTE_TO_ASCII(enc, p) \
  228. (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
  229. static int PTRFASTCALL
  230. sb_byteToAscii(const ENCODING *enc, const char *p)
  231. {
  232. return *p;
  233. }
  234. #else
  235. #define BYTE_TO_ASCII(enc, p) (*(p))
  236. #endif
/* Multi-byte character tests.  n (2, 3 or 4) is pasted onto the member
   name, so the call goes through isName2, isName3 or isName4 (and the
   matching isNmstrt / isInvalid members) of the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE build: dispatch through the encoding's callbacks. */
#define IS_NAME_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
/* Regular build: constant false for the instantiation that follows. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
  252. #ifdef XML_MIN_SIZE
  253. #define CHAR_MATCHES(enc, p, c) \
  254. (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
  255. static int PTRCALL
  256. sb_charMatches(const ENCODING *enc, const char *p, int c)
  257. {
  258. return *p == c;
  259. }
  260. #else
  261. /* c is an ASCII character */
  262. #define CHAR_MATCHES(enc, p, c) (*(p) == c)
  263. #endif
/* First instantiation of the tokenizer implementation: with this
   PREFIX() every PREFIX(name) becomes normal_name, and the included
   file is compiled against the accessor macros defined above. */
#define PREFIX(ident) normal_ ## ident
#define XML_TOK_IMPL_C
#include <xml/details/expat/xmltok_impl.c>
#undef XML_TOK_IMPL_C

/* Remove the accessor macros; later sections of this file define
   their own versions before including the implementation again. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
/* Marker bits OR-ed into the first byte of an encoded sequence by the
   converters in this file (UTF8_cval2, UTF8_cval3 and UTF8_cval4 are
   used that way below). */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
  283. static void PTRCALL
  284. utf8_toUtf8(const ENCODING *enc,
  285. const char **fromP, const char *fromLim,
  286. char **toP, const char *toLim)
  287. {
  288. char *to;
  289. const char *from;
  290. UNUSED(enc);
  291. if (fromLim - *fromP > toLim - *toP) {
  292. /* Avoid copying partial characters. */
  293. for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  294. if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  295. break;
  296. }
  297. for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  298. *to = *from;
  299. *fromP = from;
  300. *toP = to;
  301. }
  302. static void PTRCALL
  303. utf8_toUtf16(const ENCODING *enc,
  304. const char **fromP, const char *fromLim,
  305. unsigned short **toP, const unsigned short *toLim)
  306. {
  307. unsigned short *to = *toP;
  308. const char *from = *fromP;
  309. while (from != fromLim && to != toLim) {
  310. switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  311. case BT_LEAD2:
  312. *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
  313. from += 2;
  314. break;
  315. case BT_LEAD3:
  316. *to++ = (unsigned short)(((from[0] & 0xf) << 12)
  317. | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
  318. from += 3;
  319. break;
  320. case BT_LEAD4:
  321. {
  322. unsigned long n;
  323. if (to + 1 == toLim)
  324. goto after;
  325. n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
  326. | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  327. n -= 0x10000;
  328. to[0] = (unsigned short)((n >> 10) | 0xD800);
  329. to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  330. to += 2;
  331. from += 4;
  332. }
  333. break;
  334. default:
  335. *to++ = *from++;
  336. break;
  337. }
  338. }
  339. after:
  340. *fromP = from;
  341. *toP = to;
  342. }
/* The four UTF-8 encoding tables.  Each one pairs the normal_ tokenizer
   functions (VTABLE1 under the PREFIX in effect here) with the UTF-8
   converters, a 256-entry byte type table assembled from two included
   headers, and the utf8_ multi-byte callbacks.
   NOTE(review): the trailing "1, 1, 0" values presumably initialize
   minBytesPerChar and two flag members of ENCODING -- confirm against
   the ENCODING declaration in xmltok.h.
   The _ns variants are compiled only when XML_NS is defined and include
   the ASCII table as-is; the other variants define BT_COLON as
   BT_NMSTRT while including it.  The internal_ variants use
   iasciitab.h in place of asciitab.h. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include <xml/details/expat/asciitab.h>
#include <xml/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/asciitab.h>
#undef BT_COLON
#include <xml/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include <xml/details/expat/iasciitab.h>
#include <xml/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/iasciitab.h>
#undef BT_COLON
#include <xml/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
  383. static void PTRCALL
  384. latin1_toUtf8(const ENCODING *enc,
  385. const char **fromP, const char *fromLim,
  386. char **toP, const char *toLim)
  387. {
  388. UNUSED(enc);
  389. for (;;) {
  390. unsigned char c;
  391. if (*fromP == fromLim)
  392. break;
  393. c = (unsigned char)**fromP;
  394. if (c & 0x80) {
  395. if (toLim - *toP < 2)
  396. break;
  397. *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
  398. *(*toP)++ = (char)((c & 0x3f) | 0x80);
  399. (*fromP)++;
  400. }
  401. else {
  402. if (*toP == toLim)
  403. break;
  404. *(*toP)++ = *(*fromP)++;
  405. }
  406. }
  407. }
  408. static void PTRCALL
  409. latin1_toUtf16(const ENCODING *enc,
  410. const char **fromP, const char *fromLim,
  411. unsigned short **toP, const unsigned short *toLim)
  412. {
  413. UNUSED(enc);
  414. while (*fromP != fromLim && *toP != toLim)
  415. *(*toP)++ = (unsigned char)*(*fromP)++;
  416. }
/* Latin-1 encoding tables: normal_ tokenizer functions, the latin1_
   converters, a byte type table built from asciitab.h followed by
   latin1tab.h, and zeroed multi-byte callbacks (ZERO_VTABLE).
   NOTE(review): the trailing "1, 0, 0" values presumably initialize
   minBytesPerChar and two flag members of ENCODING -- confirm against
   xmltok.h.
   The _ns variant is compiled only when XML_NS is defined; the other
   variant defines BT_COLON as BT_NMSTRT while including asciitab.h. */
#ifdef XML_NS
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include <xml/details/expat/asciitab.h>
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
#endif

static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/asciitab.h>
#undef BT_COLON
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
  437. static void PTRCALL
  438. ascii_toUtf8(const ENCODING *enc,
  439. const char **fromP, const char *fromLim,
  440. char **toP, const char *toLim)
  441. {
  442. UNUSED(enc);
  443. while (*fromP != fromLim && *toP != toLim)
  444. *(*toP)++ = *(*fromP)++;
  445. }
/* ASCII encoding tables: normal_ tokenizer functions, ascii_toUtf8 and
   latin1_toUtf16 as converters, and a byte type table that lists only
   the entries from asciitab.h.  The remaining entries of type[] are
   left to default initialization, i.e. 0, which the comment inside the
   initializer identifies as BT_NONXML.
   The _ns variant is compiled only when XML_NS is defined; the other
   variant defines BT_COLON as BT_NMSTRT while including asciitab.h. */
#ifdef XML_NS
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include <xml/details/expat/asciitab.h>
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
#endif

static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/asciitab.h>
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
  466. static int PTRFASTCALL
  467. unicode_byte_type(char hi, char lo)
  468. {
  469. switch ((unsigned char)hi) {
  470. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  471. return BT_LEAD4;
  472. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  473. return BT_TRAIL;
  474. case 0xFF:
  475. switch ((unsigned char)lo) {
  476. case 0xFF:
  477. case 0xFE:
  478. return BT_NONXML;
  479. }
  480. break;
  481. }
  482. return BT_NONASCII;
  483. }
/* Generates E##toUtf8: a converter from 16-bit units in
   [*fromP, fromLim) to UTF-8 in [*toP, toLim).  The byte order comes
   from the GET_LO / GET_HI macros in effect where the macro is
   expanded.  Before each character is written the remaining output
   space is checked; if it does not fit, *fromP is set to the start of
   that character and the function returns.  *toP is advanced as bytes
   are written.
   NOTE(review): the loop steps by 2 and compares with != fromLim, and
   the surrogate branch reads the following unit without comparing
   against fromLim -- presumably callers pass only whole units and
   complete pairs; confirm. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from; \
  UNUSED(enc); \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      /* below 0x80: a single output byte */ \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      /* 0x80..0x7FF: two output bytes */ \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      /* every other high byte: three output bytes */ \
      if (toLim - *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* this unit plus the next one: four output bytes */ \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* the 4 bits above the low 6 of this unit, plus one */ \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
/* Generates E##toUtf16: copies 16-bit units from [*fromP, fromLim) to
   [*toP, toLim), assembling each unit from GET_HI / GET_LO so the
   output is in host order.  *fromP advances by 2 bytes and *toP by one
   unit per copy, until either range is exhausted.
   When the input holds more units than the output can take, the final
   input unit (at fromLim - 2) is inspected and dropped from this call
   if its high byte is in 0xD8..0xDF. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  UNUSED(enc); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
/* Little-endian accessors: byte 0 holds the low half and byte 1 the
   high half of each 16-bit unit.  With these in effect the two
   generator macros produce little2_toUtf8 and little2_toUtf16. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian accessors: byte 0 holds the high half and byte 1 the low
   half.  With these in effect the generator macros produce big2_toUtf8
   and big2_toUtf16. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
  579. #define LITTLE2_BYTE_TYPE(enc, p) \
  580. ((p)[1] == 0 \
  581. ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  582. : unicode_byte_type((p)[1], (p)[0]))
  583. #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  584. #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
  585. #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  586. UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
  587. #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  588. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: function forms of the LITTLE2_* macros, matching
   the names that STANDARD_VTABLE(little2_) expands to.  PREFIX() is
   not redefined in this branch and the tokenizer implementation is not
   included again here. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* Only the converters are specific to little2 in this build. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate the tokenizer implementation again with
   the little2_ prefix, two bytes per character, and the LITTLE2_*
   accessors.  The multi-byte tests are constant 0 here. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include <xml/details/expat/xmltok_impl.c>
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
/* Little-endian 16-bit encoding tables.  VTABLE supplies the tokenizer
   functions and converters selected by the preceding section; the byte
   type table is asciitab.h (or iasciitab.h for the internal_ variants)
   followed by latin1tab.h; the multi-byte callbacks are zeroed.
   The last value of the first brace group is 1 when BYTEORDER is 1234
   and 0 otherwise; the internal_ variants, which are compiled only
   when BYTEORDER is not 4321, always use 1.
   NOTE(review): the "2, 0, <flag>" values presumably initialize
   minBytesPerChar and two flag members of ENCODING -- confirm against
   xmltok.h.
   The _ns variants are compiled only when XML_NS is defined; the other
   variants define BT_COLON as BT_NMSTRT while including the ASCII
   table. */
#ifdef XML_NS
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include <xml/details/expat/asciitab.h>
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
#endif

static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/asciitab.h>
#undef BT_COLON
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};

#if BYTEORDER != 4321

#ifdef XML_NS
static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include <xml/details/expat/iasciitab.h>
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/iasciitab.h>
#undef BT_COLON
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};

#endif
  696. #define BIG2_BYTE_TYPE(enc, p) \
  697. ((p)[0] == 0 \
  698. ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  699. : unicode_byte_type((p)[0], (p)[1]))
  700. #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  701. #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
  702. #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  703. UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
  704. #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  705. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: function forms of the BIG2_* macros, matching
   the names that STANDARD_VTABLE(big2_) expands to.  PREFIX() is not
   redefined in this branch and the tokenizer implementation is not
   included again here. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* Only the converters are specific to big2 in this build. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate the tokenizer implementation again with
   the big2_ prefix, two bytes per character, and the BIG2_* accessors.
   The multi-byte tests are constant 0 here. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include <xml/details/expat/xmltok_impl.c>
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
  759. #ifdef XML_NS
  760. static const struct normal_encoding big2_encoding_ns = {
  761. { VTABLE, 2, 0,
  762. #if BYTEORDER == 4321
  763. 1
  764. #else
  765. 0
  766. #endif
  767. },
  768. {
  769. #include <xml/details/expat/asciitab.h>
  770. #include <xml/details/expat/latin1tab.h>
  771. },
  772. STANDARD_VTABLE(big2_) ZERO_VTABLE
  773. };
  774. #endif
/* big2_ encoding object. BT_COLON is temporarily defined as BT_NMSTRT
   while asciitab.h is included, so any BT_COLON entry in that table becomes
   BT_NMSTRT here.
   NOTE(review): the last value of the leading initializer (1 only when
   BYTEORDER is 4321) is presumably the isUtf16 flag -- confirm against
   struct encoding. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/asciitab.h>
#undef BT_COLON
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
/* "Internal" big2_ encoding objects, built unless BYTEORDER is 1234.
   They differ from big2_encoding[_ns] in two visible ways: the last value
   of the leading initializer is always 1, and the first half of the
   byte-type table comes from iasciitab.h instead of asciitab.h. */
#if BYTEORDER != 1234
#ifdef XML_NS
/* Variant built only with XML_NS; BT_COLON is not remapped. */
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include <xml/details/expat/iasciitab.h>
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
#endif
/* Variant in which BT_COLON is defined as BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include <xml/details/expat/iasciitab.h>
#undef BT_COLON
#include <xml/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
#endif
  813. #undef PREFIX
  814. static int FASTCALL
  815. streqci(const char *s1, const char *s2)
  816. {
  817. for (;;) {
  818. char c1 = *s1++;
  819. char c2 = *s2++;
  820. if (ASCII_a <= c1 && c1 <= ASCII_z)
  821. c1 += ASCII_A - ASCII_a;
  822. if (ASCII_a <= c2 && c2 <= ASCII_z)
  823. c2 += ASCII_A - ASCII_a;
  824. if (c1 != c2)
  825. return 0;
  826. if (!c1)
  827. break;
  828. }
  829. return 1;
  830. }
/* Position-update callback that ignores the encoding it is handed and
   always delegates to normal_updatePosition with the UTF-8 encoding. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr,
                   const char *end, POSITION *pos)
{
  UNUSED(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
  838. static int
  839. toAscii(const ENCODING *enc, const char *ptr, const char *end)
  840. {
  841. char buf[1];
  842. char *p = buf;
  843. XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  844. if (p == buf)
  845. return -1;
  846. else
  847. return buf[0];
  848. }
  849. static int FASTCALL
  850. isSpace(int c)
  851. {
  852. switch (c) {
  853. case 0x20:
  854. case 0xD:
  855. case 0xA:
  856. case 0x9:
  857. return 1;
  858. }
  859. return 0;
  860. }
  861. /* Return 1 if there's just optional white space or there's an S
  862. followed by name=val.
  863. */
  864. static int
  865. parsePseudoAttribute(const ENCODING *enc,
  866. const char *ptr,
  867. const char *end,
  868. const char **namePtr,
  869. const char **nameEndPtr,
  870. const char **valPtr,
  871. const char **nextTokPtr)
  872. {
  873. int c;
  874. char open;
  875. if (ptr == end) {
  876. *namePtr = NULL;
  877. return 1;
  878. }
  879. if (!isSpace(toAscii(enc, ptr, end))) {
  880. *nextTokPtr = ptr;
  881. return 0;
  882. }
  883. do {
  884. ptr += enc->minBytesPerChar;
  885. } while (isSpace(toAscii(enc, ptr, end)));
  886. if (ptr == end) {
  887. *namePtr = NULL;
  888. return 1;
  889. }
  890. *namePtr = ptr;
  891. for (;;) {
  892. c = toAscii(enc, ptr, end);
  893. if (c == -1) {
  894. *nextTokPtr = ptr;
  895. return 0;
  896. }
  897. if (c == ASCII_EQUALS) {
  898. *nameEndPtr = ptr;
  899. break;
  900. }
  901. if (isSpace(c)) {
  902. *nameEndPtr = ptr;
  903. do {
  904. ptr += enc->minBytesPerChar;
  905. } while (isSpace(c = toAscii(enc, ptr, end)));
  906. if (c != ASCII_EQUALS) {
  907. *nextTokPtr = ptr;
  908. return 0;
  909. }
  910. break;
  911. }
  912. ptr += enc->minBytesPerChar;
  913. }
  914. if (ptr == *namePtr) {
  915. *nextTokPtr = ptr;
  916. return 0;
  917. }
  918. ptr += enc->minBytesPerChar;
  919. c = toAscii(enc, ptr, end);
  920. while (isSpace(c)) {
  921. ptr += enc->minBytesPerChar;
  922. c = toAscii(enc, ptr, end);
  923. }
  924. if (c != ASCII_QUOT && c != ASCII_APOS) {
  925. *nextTokPtr = ptr;
  926. return 0;
  927. }
  928. open = (char)c;
  929. ptr += enc->minBytesPerChar;
  930. *valPtr = ptr;
  931. for (;; ptr += enc->minBytesPerChar) {
  932. c = toAscii(enc, ptr, end);
  933. if (c == open)
  934. break;
  935. if (!(ASCII_a <= c && c <= ASCII_z)
  936. && !(ASCII_A <= c && c <= ASCII_Z)
  937. && !(ASCII_0 <= c && c <= ASCII_9)
  938. && c != ASCII_PERIOD
  939. && c != ASCII_MINUS
  940. && c != ASCII_UNDERSCORE) {
  941. *nextTokPtr = ptr;
  942. return 0;
  943. }
  944. }
  945. *nextTokPtr = ptr + enc->minBytesPerChar;
  946. return 1;
  947. }
/* Keywords recognised in an XML/text declaration, spelled with the ASCII_*
   constants and NUL-terminated. doParseXmlDecl compares them against the
   input via XmlNameMatchesAscii. */

/* pseudo-attribute name "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};
/* pseudo-attribute name "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};
/* pseudo-attribute name "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};
/* standalone value "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s, '\0'
};
/* standalone value "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o, '\0'
};
/* Parse the pseudo-attributes (version, encoding, standalone) of an XML
   declaration or text declaration held in [ptr, end).

   encodingFinder       called as encodingFinder(enc, val, valEnd) to map
                        the encoding name to an ENCODING; its result is
                        stored in *encoding.
   isGeneralTextEntity  non-zero selects text-declaration rules: version
                        may be omitted, encoding is required when version
                        is present, standalone is rejected.
   enc                  encoding used to read the declaration.
   badPtr               on failure receives the position of the error.
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs (each may be NULL); written only
                        when the corresponding pseudo-attribute is seen.
                        *versionEndPtr is the position just after the
                        closing quote of the version value.

   Returns 1 on success, 0 on failure.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Skip the first 5 and the last 2 characters of the range
     (presumably the "<?xml" and "?>" delimiters -- confirm with callers). */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is mandatory. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may start with something other than
       version; the same name is then re-examined as "encoding" below. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* An encoding name has to begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so step back one character to
       get the end of the value. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Whatever is left must be "standalone", which is not permitted in a
     text declaration. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only trailing white space may follow the standalone value. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
  1055. static int FASTCALL
  1056. checkCharRefNumber(int result)
  1057. {
  1058. switch (result >> 8) {
  1059. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1060. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1061. return -1;
  1062. case 0:
  1063. if (latin1_encoding.type[result] == BT_NONXML)
  1064. return -1;
  1065. break;
  1066. case 0xFF:
  1067. if (result == 0xFFFE || result == 0xFFFF)
  1068. return -1;
  1069. break;
  1070. }
  1071. return result;
  1072. }
  1073. int FASTCALL
  1074. XmlUtf8Encode(int c, char *buf)
  1075. {
  1076. enum {
  1077. /* minN is minimum legal resulting value for N byte sequence */
  1078. min2 = 0x80,
  1079. min3 = 0x800,
  1080. min4 = 0x10000
  1081. };
  1082. if (c < 0)
  1083. return 0;
  1084. if (c < min2) {
  1085. buf[0] = (char)(c | UTF8_cval1);
  1086. return 1;
  1087. }
  1088. if (c < min3) {
  1089. buf[0] = (char)((c >> 6) | UTF8_cval2);
  1090. buf[1] = (char)((c & 0x3f) | 0x80);
  1091. return 2;
  1092. }
  1093. if (c < min4) {
  1094. buf[0] = (char)((c >> 12) | UTF8_cval3);
  1095. buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1096. buf[2] = (char)((c & 0x3f) | 0x80);
  1097. return 3;
  1098. }
  1099. if (c < 0x110000) {
  1100. buf[0] = (char)((c >> 18) | UTF8_cval4);
  1101. buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1102. buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1103. buf[3] = (char)((c & 0x3f) | 0x80);
  1104. return 4;
  1105. }
  1106. return 0;
  1107. }
  1108. int FASTCALL
  1109. XmlUtf16Encode(int charNum, unsigned short *buf)
  1110. {
  1111. if (charNum < 0)
  1112. return 0;
  1113. if (charNum < 0x10000) {
  1114. buf[0] = (unsigned short)charNum;
  1115. return 1;
  1116. }
  1117. if (charNum < 0x110000) {
  1118. charNum -= 0x10000;
  1119. buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1120. buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1121. return 2;
  1122. }
  1123. return 0;
  1124. }
/* Encoding object for a caller-described encoding, filled in by
   XmlInitUnknownEncoding. */
struct unknown_encoding {
  /* Kept first: AS_UNKNOWN_ENCODING casts an ENCODING pointer straight
     back to this struct. */
  struct normal_encoding normal;
  /* Called with (userData, pointer to the sequence) for bytes whose
     utf8/utf16 table entry is 0; its result is used as a character
     number. */
  CONVERTER convert;
  void *userData;
  /* Per-byte UTF-16 unit; 0 means "ask convert". */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: utf8[b][0] is the byte count and the bytes
     follow; a count of 0 means "ask convert". */
  char utf8[256][4];
};
/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
/* Return the size in bytes of struct unknown_encoding, i.e. how much
   memory XmlInitUnknownEncoding treats its mem argument as. */
int
XmlSizeOfUnknownEncoding(void)
{
  return sizeof(struct unknown_encoding);
}
  1138. static int PTRFASTCALL
  1139. unknown_isName(const ENCODING *enc, const char *p)
  1140. {
  1141. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1142. int c = uenc->convert(uenc->userData, p);
  1143. if (c & ~0xFFFF)
  1144. return 0;
  1145. return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1146. }
  1147. static int PTRFASTCALL
  1148. unknown_isNmstrt(const ENCODING *enc, const char *p)
  1149. {
  1150. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1151. int c = uenc->convert(uenc->userData, p);
  1152. if (c & ~0xFFFF)
  1153. return 0;
  1154. return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1155. }
  1156. static int PTRFASTCALL
  1157. unknown_isInvalid(const ENCODING *enc, const char *p)
  1158. {
  1159. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1160. int c = uenc->convert(uenc->userData, p);
  1161. return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1162. }
  1163. static void PTRCALL
  1164. unknown_toUtf8(const ENCODING *enc,
  1165. const char **fromP, const char *fromLim,
  1166. char **toP, const char *toLim)
  1167. {
  1168. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1169. char buf[XML_UTF8_ENCODE_MAX];
  1170. for (;;) {
  1171. const char *utf8;
  1172. int n;
  1173. if (*fromP == fromLim)
  1174. break;
  1175. utf8 = uenc->utf8[(unsigned char)**fromP];
  1176. n = *utf8++;
  1177. if (n == 0) {
  1178. int c = uenc->convert(uenc->userData, *fromP);
  1179. n = XmlUtf8Encode(c, buf);
  1180. if (n > toLim - *toP)
  1181. break;
  1182. utf8 = buf;
  1183. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1184. - (BT_LEAD2 - 2));
  1185. }
  1186. else {
  1187. if (n > toLim - *toP)
  1188. break;
  1189. (*fromP)++;
  1190. }
  1191. do {
  1192. *(*toP)++ = *utf8++;
  1193. } while (--n != 0);
  1194. }
  1195. }
  1196. static void PTRCALL
  1197. unknown_toUtf16(const ENCODING *enc,
  1198. const char **fromP, const char *fromLim,
  1199. unsigned short **toP, const unsigned short *toLim)
  1200. {
  1201. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1202. while (*fromP != fromLim && *toP != toLim) {
  1203. unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1204. if (c == 0) {
  1205. c = (unsigned short)
  1206. uenc->convert(uenc->userData, *fromP);
  1207. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1208. - (BT_LEAD2 - 2));
  1209. }
  1210. else
  1211. (*fromP)++;
  1212. *(*toP)++ = c;
  1213. }
  1214. }
  1215. ENCODING *
  1216. XmlInitUnknownEncoding(void *mem,
  1217. int *table,
  1218. CONVERTER convert,
  1219. void *userData)
  1220. {
  1221. int i;
  1222. struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1223. for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1224. ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1225. for (i = 0; i < 128; i++)
  1226. if (latin1_encoding.type[i] != BT_OTHER
  1227. && latin1_encoding.type[i] != BT_NONXML
  1228. && table[i] != i)
  1229. return 0;
  1230. for (i = 0; i < 256; i++) {
  1231. int c = table[i];
  1232. if (c == -1) {
  1233. e->normal.type[i] = BT_MALFORM;
  1234. /* This shouldn't really get used. */
  1235. e->utf16[i] = 0xFFFF;
  1236. e->utf8[i][0] = 1;
  1237. e->utf8[i][1] = 0;
  1238. }
  1239. else if (c < 0) {
  1240. if (c < -4)
  1241. return 0;
  1242. e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1243. e->utf8[i][0] = 0;
  1244. e->utf16[i] = 0;
  1245. }
  1246. else if (c < 0x80) {
  1247. if (latin1_encoding.type[c] != BT_OTHER
  1248. && latin1_encoding.type[c] != BT_NONXML
  1249. && c != i)
  1250. return 0;
  1251. e->normal.type[i] = latin1_encoding.type[c];
  1252. e->utf8[i][0] = 1;
  1253. e->utf8[i][1] = (char)c;
  1254. e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1255. }
  1256. else if (checkCharRefNumber(c) < 0) {
  1257. e->normal.type[i] = BT_NONXML;
  1258. /* This shouldn't really get used. */
  1259. e->utf16[i] = 0xFFFF;
  1260. e->utf8[i][0] = 1;
  1261. e->utf8[i][1] = 0;
  1262. }
  1263. else {
  1264. if (c > 0xFFFF)
  1265. return 0;
  1266. if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1267. e->normal.type[i] = BT_NMSTRT;
  1268. else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1269. e->normal.type[i] = BT_NAME;
  1270. else
  1271. e->normal.type[i] = BT_OTHER;
  1272. e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1273. e->utf16[i] = (unsigned short)c;
  1274. }
  1275. }
  1276. e->userData = userData;
  1277. e->convert = convert;
  1278. if (convert) {
  1279. e->normal.isName2 = unknown_isName;
  1280. e->normal.isName3 = unknown_isName;
  1281. e->normal.isName4 = unknown_isName;
  1282. e->normal.isNmstrt2 = unknown_isNmstrt;
  1283. e->normal.isNmstrt3 = unknown_isNmstrt;
  1284. e->normal.isNmstrt4 = unknown_isNmstrt;
  1285. e->normal.isInvalid2 = unknown_isInvalid;
  1286. e->normal.isInvalid3 = unknown_isInvalid;
  1287. e->normal.isInvalid4 = unknown_isInvalid;
  1288. }
  1289. e->normal.enc.utf8Convert = unknown_toUtf8;
  1290. e->normal.enc.utf16Convert = unknown_toUtf16;
  1291. return &(e->normal.enc);
  1292. }
/* If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,   /* getEncodingIndex: no known name matched */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC              /* getEncodingIndex: name was NULL */
};
/* Names of the built-in encodings, spelled with the ASCII_* constants and
   NUL-terminated. getEncodingIndex matches them case-insensitively. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
  1328. static int FASTCALL
  1329. getEncodingIndex(const char *name)
  1330. {
  1331. static const char * const encodingNames[] = {
  1332. KW_ISO_8859_1,
  1333. KW_US_ASCII,
  1334. KW_UTF_8,
  1335. KW_UTF_16,
  1336. KW_UTF_16BE,
  1337. KW_UTF_16LE,
  1338. };
  1339. int i;
  1340. if (name == NULL)
  1341. return NO_ENC;
  1342. for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1343. if (streqci(name, encodingNames[i]))
  1344. return i;
  1345. return UNKNOWN_ENC;
  1346. }
  1347. /* For binary compatibility, we store the index of the encoding
  1348. specified at initialization in the isUtf16 member.
  1349. */
  1350. #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  1351. #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes of [ptr, end) are inspected. Once an
   encoding has been chosen it is stored through enc->encPtr. The function
   then either returns XML_TOK_BOM with *nextTokPtr just past the byte
   order mark, or hands the input to XmlTok with the chosen encoding and
   returns its result. XML_TOK_NONE is returned for empty input and
   XML_TOK_PARTIAL when more bytes are needed to decide.
*/
static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;
  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* with an external ISO-8859-1 label these bytes are ordinary data */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* at least two bytes: examine them as one big-endian 16-bit value */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* big-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 without BOM */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* little-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* the third byte is needed to confirm the BOM */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing detected: use the externally specified encoding */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
/* First inclusion of xmltok_ns.c: NS() and ns() leave the names they wrap
   unchanged. Both macros are removed again afterwards. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include <xml/details/expat/xmltok_ns.c>
#undef XML_TOK_NS_C
#undef NS
#undef ns
  1485. #ifdef XML_NS
/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends
   "_ns" to the names they wrap. Both macros are removed again
   afterwards. */
#define NS(x) x ## NS
#define ns(x) x ## _ns
#define XML_TOK_NS_C
#include <xml/details/expat/xmltok_ns.c>
#undef XML_TOK_NS_C
#undef NS
#undef ns
  1493. ENCODING *
  1494. XmlInitUnknownEncodingNS(void *mem,
  1495. int *table,
  1496. CONVERTER convert,
  1497. void *userData)
  1498. {
  1499. ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1500. if (enc)
  1501. ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1502. return enc;
  1503. }
  1504. #endif /* XML_NS */