xmltok_impl.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796
  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. /* This file is included! */
  5. #ifdef XML_TOK_IMPL_C
  /* Fallback used when the including file has not supplied its own
     IS_INVALID_CHAR: no multi-byte sequence is ever reported as invalid. */
  #if !defined(IS_INVALID_CHAR)
  #define IS_INVALID_CHAR(enc, ptr, n) (0)
  #endif

  /* Switch arm for an n-byte lead byte; n is pasted onto BT_LEAD to form the
     case label.  `enc` and `end` are not parameters: they must be visible
     wherever the macro is expanded.
       - fewer than n bytes left  -> return XML_TOK_PARTIAL_CHAR
       - IS_INVALID_CHAR is true  -> store ptr in *nextTokPtr and return
                                     XML_TOK_INVALID
       - otherwise                -> move ptr past the n bytes
     n and ptr are forwarded bare to IS_INVALID_CHAR on purpose, because an
     implementation of that macro may paste n onto an identifier. */
  #define INVALID_LEAD_CASE(n, ptr, nextTokPtr)  \
    case BT_LEAD ## n:                           \
      if (end - (ptr) < n)                       \
        return XML_TOK_PARTIAL_CHAR;             \
      if (IS_INVALID_CHAR(enc, ptr, n)) {        \
        *(nextTokPtr) = (ptr);                   \
        return XML_TOK_INVALID;                  \
      }                                          \
      (ptr) += n;                                \
      break;

  /* Switch arms shared by the scanners that walk over arbitrary character
     data: the 2-, 3- and 4-byte lead arms above, followed by BT_NONXML,
     BT_MALFORM and BT_TRAIL, which store ptr in *nextTokPtr and return
     XML_TOK_INVALID. */
  #define INVALID_CASES(ptr, nextTokPtr)         \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)        \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)        \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)        \
    case BT_NONXML:                              \
    case BT_MALFORM:                             \
    case BT_TRAIL:                               \
      *(nextTokPtr) = (ptr);                     \
      return XML_TOK_INVALID;
  /* Switch arm for an n-byte lead byte inside a name; n is pasted onto
     BT_LEAD to form the case label.  Too few bytes left yields
     XML_TOK_PARTIAL_CHAR; a character rejected by IS_NAME_CHAR yields
     XML_TOK_INVALID with ptr stored in *nextTokPtr; otherwise ptr is moved
     past the n bytes.  enc, ptr and n are forwarded bare to IS_NAME_CHAR
     because an implementation of that macro may paste n onto an identifier. */
  #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)  \
    case BT_LEAD ## n:                                   \
      if ((end) - (ptr) < n)                             \
        return XML_TOK_PARTIAL_CHAR;                     \
      if (!IS_NAME_CHAR(enc, ptr, n)) {                  \
        *(nextTokPtr) = (ptr);                           \
        return XML_TOK_INVALID;                          \
      }                                                  \
      (ptr) += n;                                        \
      break;

  /* Switch arms that consume one name character.
       - BT_NONASCII must pass IS_NAME_CHAR_MINBPC, else XML_TOK_INVALID;
         on success it shares the single-unit advance below.
       - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
         MINBPC(enc).
       - BT_LEAD2/3/4 are handled by CHECK_NAME_CASE. */
  #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)    \
    case BT_NONASCII:                                    \
      if (!IS_NAME_CHAR_MINBPC(enc, ptr)) {              \
        *(nextTokPtr) = (ptr);                           \
        return XML_TOK_INVALID;                          \
      }                                                  \
      /* fall through */                                 \
    case BT_NMSTRT:                                      \
    case BT_HEX:                                         \
    case BT_DIGIT:                                       \
    case BT_NAME:                                        \
    case BT_MINUS:                                       \
      (ptr) += MINBPC(enc);                              \
      break;                                             \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)        \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)        \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  /* Switch arm for an n-byte lead byte at the start of a name; n is pasted
     onto BT_LEAD to form the case label.  Too few bytes left yields
     XML_TOK_PARTIAL_CHAR; a character rejected by IS_NMSTRT_CHAR yields
     XML_TOK_INVALID with ptr stored in *nextTokPtr; otherwise ptr is moved
     past the n bytes.  enc, ptr and n are forwarded bare to IS_NMSTRT_CHAR
     because an implementation of that macro may paste n onto an identifier. */
  #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)  \
    case BT_LEAD ## n:                                     \
      if ((end) - (ptr) < n)                               \
        return XML_TOK_PARTIAL_CHAR;                       \
      if (!IS_NMSTRT_CHAR(enc, ptr, n)) {                  \
        *(nextTokPtr) = (ptr);                             \
        return XML_TOK_INVALID;                            \
      }                                                    \
      (ptr) += n;                                          \
      break;

  /* Switch arms that consume one name-start character.
       - BT_NONASCII must pass IS_NMSTRT_CHAR_MINBPC, else XML_TOK_INVALID;
         on success it shares the single-unit advance below.
       - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc).
       - BT_LEAD2/3/4 are handled by CHECK_NMSTRT_CASE. */
  #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)    \
    case BT_NONASCII:                                      \
      if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {              \
        *(nextTokPtr) = (ptr);                             \
        return XML_TOK_INVALID;                            \
      }                                                    \
      /* fall through */                                   \
    case BT_NMSTRT:                                        \
    case BT_HEX:                                           \
      (ptr) += MINBPC(enc);                                \
      break;                                               \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)        \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)        \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  /* Name-decoration hook: unless the including file defines PREFIX itself,
     identifiers wrapped in PREFIX() are used unchanged. */
  #if !defined(PREFIX)
  #define PREFIX(ident) ident
  #endif
  80. /* ptr points to character following "<!-" */
  81. static int PTRCALL
  82. PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
  83. const char *end, const char **nextTokPtr)
  84. {
  85. if (ptr != end) {
  86. if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  87. *nextTokPtr = ptr;
  88. return XML_TOK_INVALID;
  89. }
  90. ptr += MINBPC(enc);
  91. while (ptr != end) {
  92. switch (BYTE_TYPE(enc, ptr)) {
  93. INVALID_CASES(ptr, nextTokPtr)
  94. case BT_MINUS:
  95. if ((ptr += MINBPC(enc)) == end)
  96. return XML_TOK_PARTIAL;
  97. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  98. if ((ptr += MINBPC(enc)) == end)
  99. return XML_TOK_PARTIAL;
  100. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  101. *nextTokPtr = ptr;
  102. return XML_TOK_INVALID;
  103. }
  104. *nextTokPtr = ptr + MINBPC(enc);
  105. return XML_TOK_COMMENT;
  106. }
  107. break;
  108. default:
  109. ptr += MINBPC(enc);
  110. break;
  111. }
  112. }
  113. }
  114. return XML_TOK_PARTIAL;
  115. }
  116. /* ptr points to character following "<!" */
  117. static int PTRCALL
  118. PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
  119. const char *end, const char **nextTokPtr)
  120. {
  121. if (ptr == end)
  122. return XML_TOK_PARTIAL;
  123. switch (BYTE_TYPE(enc, ptr)) {
  124. case BT_MINUS:
  125. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  126. case BT_LSQB:
  127. *nextTokPtr = ptr + MINBPC(enc);
  128. return XML_TOK_COND_SECT_OPEN;
  129. case BT_NMSTRT:
  130. case BT_HEX:
  131. ptr += MINBPC(enc);
  132. break;
  133. default:
  134. *nextTokPtr = ptr;
  135. return XML_TOK_INVALID;
  136. }
  137. while (ptr != end) {
  138. switch (BYTE_TYPE(enc, ptr)) {
  139. case BT_PERCNT:
  140. if (ptr + MINBPC(enc) == end)
  141. return XML_TOK_PARTIAL;
  142. /* don't allow <!ENTITY% foo "whatever"> */
  143. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  144. case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  145. *nextTokPtr = ptr;
  146. return XML_TOK_INVALID;
  147. }
  148. /* fall through */
  149. case BT_S: case BT_CR: case BT_LF:
  150. *nextTokPtr = ptr;
  151. return XML_TOK_DECL_OPEN;
  152. case BT_NMSTRT:
  153. case BT_HEX:
  154. ptr += MINBPC(enc);
  155. break;
  156. default:
  157. *nextTokPtr = ptr;
  158. return XML_TOK_INVALID;
  159. }
  160. }
  161. return XML_TOK_PARTIAL;
  162. }
  163. static int PTRCALL
  164. PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
  165. const char *end, int *tokPtr)
  166. {
  167. int upper = 0;
  168. UNUSED(enc);
  169. *tokPtr = XML_TOK_PI;
  170. if (end - ptr != MINBPC(enc)*3)
  171. return 1;
  172. switch (BYTE_TO_ASCII(enc, ptr)) {
  173. case ASCII_x:
  174. break;
  175. case ASCII_X:
  176. upper = 1;
  177. break;
  178. default:
  179. return 1;
  180. }
  181. ptr += MINBPC(enc);
  182. switch (BYTE_TO_ASCII(enc, ptr)) {
  183. case ASCII_m:
  184. break;
  185. case ASCII_M:
  186. upper = 1;
  187. break;
  188. default:
  189. return 1;
  190. }
  191. ptr += MINBPC(enc);
  192. switch (BYTE_TO_ASCII(enc, ptr)) {
  193. case ASCII_l:
  194. break;
  195. case ASCII_L:
  196. upper = 1;
  197. break;
  198. default:
  199. return 1;
  200. }
  201. if (upper)
  202. return 0;
  203. *tokPtr = XML_TOK_XML_DECL;
  204. return 1;
  205. }
  206. /* ptr points to character following "<?" */
  207. static int PTRCALL
  208. PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
  209. const char *end, const char **nextTokPtr)
  210. {
  211. int tok;
  212. const char *target = ptr;
  213. if (ptr == end)
  214. return XML_TOK_PARTIAL;
  215. switch (BYTE_TYPE(enc, ptr)) {
  216. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  217. default:
  218. *nextTokPtr = ptr;
  219. return XML_TOK_INVALID;
  220. }
  221. while (ptr != end) {
  222. switch (BYTE_TYPE(enc, ptr)) {
  223. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  224. case BT_S: case BT_CR: case BT_LF:
  225. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  226. *nextTokPtr = ptr;
  227. return XML_TOK_INVALID;
  228. }
  229. ptr += MINBPC(enc);
  230. while (ptr != end) {
  231. switch (BYTE_TYPE(enc, ptr)) {
  232. INVALID_CASES(ptr, nextTokPtr)
  233. case BT_QUEST:
  234. ptr += MINBPC(enc);
  235. if (ptr == end)
  236. return XML_TOK_PARTIAL;
  237. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  238. *nextTokPtr = ptr + MINBPC(enc);
  239. return tok;
  240. }
  241. break;
  242. default:
  243. ptr += MINBPC(enc);
  244. break;
  245. }
  246. }
  247. return XML_TOK_PARTIAL;
  248. case BT_QUEST:
  249. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  250. *nextTokPtr = ptr;
  251. return XML_TOK_INVALID;
  252. }
  253. ptr += MINBPC(enc);
  254. if (ptr == end)
  255. return XML_TOK_PARTIAL;
  256. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  257. *nextTokPtr = ptr + MINBPC(enc);
  258. return tok;
  259. }
  260. /* fall through */
  261. default:
  262. *nextTokPtr = ptr;
  263. return XML_TOK_INVALID;
  264. }
  265. }
  266. return XML_TOK_PARTIAL;
  267. }
  268. static int PTRCALL
  269. PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
  270. const char *end, const char **nextTokPtr)
  271. {
  272. static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
  273. ASCII_T, ASCII_A, ASCII_LSQB };
  274. int i;
  275. UNUSED(enc);
  276. /* CDATA[ */
  277. if (end - ptr < 6 * MINBPC(enc))
  278. return XML_TOK_PARTIAL;
  279. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  280. if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  281. *nextTokPtr = ptr;
  282. return XML_TOK_INVALID;
  283. }
  284. }
  285. *nextTokPtr = ptr;
  286. return XML_TOK_CDATA_SECT_OPEN;
  287. }
  288. static int PTRCALL
  289. PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
  290. const char *end, const char **nextTokPtr)
  291. {
  292. if (ptr == end)
  293. return XML_TOK_NONE;
  294. if (MINBPC(enc) > 1) {
  295. size_t n = end - ptr;
  296. if (n & (MINBPC(enc) - 1)) {
  297. n &= ~(MINBPC(enc) - 1);
  298. if (n == 0)
  299. return XML_TOK_PARTIAL;
  300. end = ptr + n;
  301. }
  302. }
  303. switch (BYTE_TYPE(enc, ptr)) {
  304. case BT_RSQB:
  305. ptr += MINBPC(enc);
  306. if (ptr == end)
  307. return XML_TOK_PARTIAL;
  308. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  309. break;
  310. ptr += MINBPC(enc);
  311. if (ptr == end)
  312. return XML_TOK_PARTIAL;
  313. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  314. ptr -= MINBPC(enc);
  315. break;
  316. }
  317. *nextTokPtr = ptr + MINBPC(enc);
  318. return XML_TOK_CDATA_SECT_CLOSE;
  319. case BT_CR:
  320. ptr += MINBPC(enc);
  321. if (ptr == end)
  322. return XML_TOK_PARTIAL;
  323. if (BYTE_TYPE(enc, ptr) == BT_LF)
  324. ptr += MINBPC(enc);
  325. *nextTokPtr = ptr;
  326. return XML_TOK_DATA_NEWLINE;
  327. case BT_LF:
  328. *nextTokPtr = ptr + MINBPC(enc);
  329. return XML_TOK_DATA_NEWLINE;
  330. INVALID_CASES(ptr, nextTokPtr)
  331. default:
  332. ptr += MINBPC(enc);
  333. break;
  334. }
  335. while (ptr != end) {
  336. switch (BYTE_TYPE(enc, ptr)) {
  337. #define LEAD_CASE(n) \
  338. case BT_LEAD ## n: \
  339. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  340. *nextTokPtr = ptr; \
  341. return XML_TOK_DATA_CHARS; \
  342. } \
  343. ptr += n; \
  344. break;
  345. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  346. #undef LEAD_CASE
  347. case BT_NONXML:
  348. case BT_MALFORM:
  349. case BT_TRAIL:
  350. case BT_CR:
  351. case BT_LF:
  352. case BT_RSQB:
  353. *nextTokPtr = ptr;
  354. return XML_TOK_DATA_CHARS;
  355. default:
  356. ptr += MINBPC(enc);
  357. break;
  358. }
  359. }
  360. *nextTokPtr = ptr;
  361. return XML_TOK_DATA_CHARS;
  362. }
  363. /* ptr points to character following "</" */
  364. static int PTRCALL
  365. PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
  366. const char *end, const char **nextTokPtr)
  367. {
  368. if (ptr == end)
  369. return XML_TOK_PARTIAL;
  370. switch (BYTE_TYPE(enc, ptr)) {
  371. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  372. default:
  373. *nextTokPtr = ptr;
  374. return XML_TOK_INVALID;
  375. }
  376. while (ptr != end) {
  377. switch (BYTE_TYPE(enc, ptr)) {
  378. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  379. case BT_S: case BT_CR: case BT_LF:
  380. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  381. switch (BYTE_TYPE(enc, ptr)) {
  382. case BT_S: case BT_CR: case BT_LF:
  383. break;
  384. case BT_GT:
  385. *nextTokPtr = ptr + MINBPC(enc);
  386. return XML_TOK_END_TAG;
  387. default:
  388. *nextTokPtr = ptr;
  389. return XML_TOK_INVALID;
  390. }
  391. }
  392. return XML_TOK_PARTIAL;
  393. #ifdef XML_NS
  394. case BT_COLON:
  395. /* no need to check qname syntax here,
  396. since end-tag must match exactly */
  397. ptr += MINBPC(enc);
  398. break;
  399. #endif
  400. case BT_GT:
  401. *nextTokPtr = ptr + MINBPC(enc);
  402. return XML_TOK_END_TAG;
  403. default:
  404. *nextTokPtr = ptr;
  405. return XML_TOK_INVALID;
  406. }
  407. }
  408. return XML_TOK_PARTIAL;
  409. }
  410. /* ptr points to character following "&#X" */
  411. static int PTRCALL
  412. PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
  413. const char *end, const char **nextTokPtr)
  414. {
  415. if (ptr != end) {
  416. switch (BYTE_TYPE(enc, ptr)) {
  417. case BT_DIGIT:
  418. case BT_HEX:
  419. break;
  420. default:
  421. *nextTokPtr = ptr;
  422. return XML_TOK_INVALID;
  423. }
  424. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  425. switch (BYTE_TYPE(enc, ptr)) {
  426. case BT_DIGIT:
  427. case BT_HEX:
  428. break;
  429. case BT_SEMI:
  430. *nextTokPtr = ptr + MINBPC(enc);
  431. return XML_TOK_CHAR_REF;
  432. default:
  433. *nextTokPtr = ptr;
  434. return XML_TOK_INVALID;
  435. }
  436. }
  437. }
  438. return XML_TOK_PARTIAL;
  439. }
  440. /* ptr points to character following "&#" */
  441. static int PTRCALL
  442. PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
  443. const char *end, const char **nextTokPtr)
  444. {
  445. if (ptr != end) {
  446. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  447. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  448. switch (BYTE_TYPE(enc, ptr)) {
  449. case BT_DIGIT:
  450. break;
  451. default:
  452. *nextTokPtr = ptr;
  453. return XML_TOK_INVALID;
  454. }
  455. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  456. switch (BYTE_TYPE(enc, ptr)) {
  457. case BT_DIGIT:
  458. break;
  459. case BT_SEMI:
  460. *nextTokPtr = ptr + MINBPC(enc);
  461. return XML_TOK_CHAR_REF;
  462. default:
  463. *nextTokPtr = ptr;
  464. return XML_TOK_INVALID;
  465. }
  466. }
  467. }
  468. return XML_TOK_PARTIAL;
  469. }
  470. /* ptr points to character following "&" */
  471. static int PTRCALL
  472. PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  473. const char **nextTokPtr)
  474. {
  475. if (ptr == end)
  476. return XML_TOK_PARTIAL;
  477. switch (BYTE_TYPE(enc, ptr)) {
  478. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  479. case BT_NUM:
  480. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  481. default:
  482. *nextTokPtr = ptr;
  483. return XML_TOK_INVALID;
  484. }
  485. while (ptr != end) {
  486. switch (BYTE_TYPE(enc, ptr)) {
  487. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  488. case BT_SEMI:
  489. *nextTokPtr = ptr + MINBPC(enc);
  490. return XML_TOK_ENTITY_REF;
  491. default:
  492. *nextTokPtr = ptr;
  493. return XML_TOK_INVALID;
  494. }
  495. }
  496. return XML_TOK_PARTIAL;
  497. }
  498. /* ptr points to character following first character of attribute name */
  499. static int PTRCALL
  500. PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  501. const char **nextTokPtr)
  502. {
  503. #ifdef XML_NS
  504. int hadColon = 0;
  505. #endif
  506. while (ptr != end) {
  507. switch (BYTE_TYPE(enc, ptr)) {
  508. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  509. #ifdef XML_NS
  510. case BT_COLON:
  511. if (hadColon) {
  512. *nextTokPtr = ptr;
  513. return XML_TOK_INVALID;
  514. }
  515. hadColon = 1;
  516. ptr += MINBPC(enc);
  517. if (ptr == end)
  518. return XML_TOK_PARTIAL;
  519. switch (BYTE_TYPE(enc, ptr)) {
  520. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  521. default:
  522. *nextTokPtr = ptr;
  523. return XML_TOK_INVALID;
  524. }
  525. break;
  526. #endif
  527. case BT_S: case BT_CR: case BT_LF:
  528. for (;;) {
  529. int t;
  530. ptr += MINBPC(enc);
  531. if (ptr == end)
  532. return XML_TOK_PARTIAL;
  533. t = BYTE_TYPE(enc, ptr);
  534. if (t == BT_EQUALS)
  535. break;
  536. switch (t) {
  537. case BT_S:
  538. case BT_LF:
  539. case BT_CR:
  540. break;
  541. default:
  542. *nextTokPtr = ptr;
  543. return XML_TOK_INVALID;
  544. }
  545. }
  546. /* fall through */
  547. case BT_EQUALS:
  548. {
  549. int open;
  550. #ifdef XML_NS
  551. hadColon = 0;
  552. #endif
  553. for (;;) {
  554. ptr += MINBPC(enc);
  555. if (ptr == end)
  556. return XML_TOK_PARTIAL;
  557. open = BYTE_TYPE(enc, ptr);
  558. if (open == BT_QUOT || open == BT_APOS)
  559. break;
  560. switch (open) {
  561. case BT_S:
  562. case BT_LF:
  563. case BT_CR:
  564. break;
  565. default:
  566. *nextTokPtr = ptr;
  567. return XML_TOK_INVALID;
  568. }
  569. }
  570. ptr += MINBPC(enc);
  571. /* in attribute value */
  572. for (;;) {
  573. int t;
  574. if (ptr == end)
  575. return XML_TOK_PARTIAL;
  576. t = BYTE_TYPE(enc, ptr);
  577. if (t == open)
  578. break;
  579. switch (t) {
  580. INVALID_CASES(ptr, nextTokPtr)
  581. case BT_AMP:
  582. {
  583. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  584. if (tok <= 0) {
  585. if (tok == XML_TOK_INVALID)
  586. *nextTokPtr = ptr;
  587. return tok;
  588. }
  589. break;
  590. }
  591. case BT_LT:
  592. *nextTokPtr = ptr;
  593. return XML_TOK_INVALID;
  594. default:
  595. ptr += MINBPC(enc);
  596. break;
  597. }
  598. }
  599. ptr += MINBPC(enc);
  600. if (ptr == end)
  601. return XML_TOK_PARTIAL;
  602. switch (BYTE_TYPE(enc, ptr)) {
  603. case BT_S:
  604. case BT_CR:
  605. case BT_LF:
  606. break;
  607. case BT_SOL:
  608. goto sol;
  609. case BT_GT:
  610. goto gt;
  611. default:
  612. *nextTokPtr = ptr;
  613. return XML_TOK_INVALID;
  614. }
  615. /* ptr points to closing quote */
  616. for (;;) {
  617. ptr += MINBPC(enc);
  618. if (ptr == end)
  619. return XML_TOK_PARTIAL;
  620. switch (BYTE_TYPE(enc, ptr)) {
  621. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  622. case BT_S: case BT_CR: case BT_LF:
  623. continue;
  624. case BT_GT:
  625. gt:
  626. *nextTokPtr = ptr + MINBPC(enc);
  627. return XML_TOK_START_TAG_WITH_ATTS;
  628. case BT_SOL:
  629. sol:
  630. ptr += MINBPC(enc);
  631. if (ptr == end)
  632. return XML_TOK_PARTIAL;
  633. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  634. *nextTokPtr = ptr;
  635. return XML_TOK_INVALID;
  636. }
  637. *nextTokPtr = ptr + MINBPC(enc);
  638. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  639. default:
  640. *nextTokPtr = ptr;
  641. return XML_TOK_INVALID;
  642. }
  643. break;
  644. }
  645. break;
  646. }
  647. default:
  648. *nextTokPtr = ptr;
  649. return XML_TOK_INVALID;
  650. }
  651. }
  652. return XML_TOK_PARTIAL;
  653. }
  654. /* ptr points to character following "<" */
  655. static int PTRCALL
  656. PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  657. const char **nextTokPtr)
  658. {
  659. #ifdef XML_NS
  660. int hadColon;
  661. #endif
  662. if (ptr == end)
  663. return XML_TOK_PARTIAL;
  664. switch (BYTE_TYPE(enc, ptr)) {
  665. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  666. case BT_EXCL:
  667. if ((ptr += MINBPC(enc)) == end)
  668. return XML_TOK_PARTIAL;
  669. switch (BYTE_TYPE(enc, ptr)) {
  670. case BT_MINUS:
  671. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  672. case BT_LSQB:
  673. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
  674. end, nextTokPtr);
  675. }
  676. *nextTokPtr = ptr;
  677. return XML_TOK_INVALID;
  678. case BT_QUEST:
  679. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  680. case BT_SOL:
  681. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  682. default:
  683. *nextTokPtr = ptr;
  684. return XML_TOK_INVALID;
  685. }
  686. #ifdef XML_NS
  687. hadColon = 0;
  688. #endif
  689. /* we have a start-tag */
  690. while (ptr != end) {
  691. switch (BYTE_TYPE(enc, ptr)) {
  692. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  693. #ifdef XML_NS
  694. case BT_COLON:
  695. if (hadColon) {
  696. *nextTokPtr = ptr;
  697. return XML_TOK_INVALID;
  698. }
  699. hadColon = 1;
  700. ptr += MINBPC(enc);
  701. if (ptr == end)
  702. return XML_TOK_PARTIAL;
  703. switch (BYTE_TYPE(enc, ptr)) {
  704. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  705. default:
  706. *nextTokPtr = ptr;
  707. return XML_TOK_INVALID;
  708. }
  709. break;
  710. #endif
  711. case BT_S: case BT_CR: case BT_LF:
  712. {
  713. ptr += MINBPC(enc);
  714. while (ptr != end) {
  715. switch (BYTE_TYPE(enc, ptr)) {
  716. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  717. case BT_GT:
  718. goto gt;
  719. case BT_SOL:
  720. goto sol;
  721. case BT_S: case BT_CR: case BT_LF:
  722. ptr += MINBPC(enc);
  723. continue;
  724. default:
  725. *nextTokPtr = ptr;
  726. return XML_TOK_INVALID;
  727. }
  728. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  729. }
  730. return XML_TOK_PARTIAL;
  731. }
  732. case BT_GT:
  733. gt:
  734. *nextTokPtr = ptr + MINBPC(enc);
  735. return XML_TOK_START_TAG_NO_ATTS;
  736. case BT_SOL:
  737. sol:
  738. ptr += MINBPC(enc);
  739. if (ptr == end)
  740. return XML_TOK_PARTIAL;
  741. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  742. *nextTokPtr = ptr;
  743. return XML_TOK_INVALID;
  744. }
  745. *nextTokPtr = ptr + MINBPC(enc);
  746. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  747. default:
  748. *nextTokPtr = ptr;
  749. return XML_TOK_INVALID;
  750. }
  751. }
  752. return XML_TOK_PARTIAL;
  753. }
  754. static int PTRCALL
  755. PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  756. const char **nextTokPtr)
  757. {
  758. if (ptr == end)
  759. return XML_TOK_NONE;
  760. if (MINBPC(enc) > 1) {
  761. size_t n = end - ptr;
  762. if (n & (MINBPC(enc) - 1)) {
  763. n &= ~(MINBPC(enc) - 1);
  764. if (n == 0)
  765. return XML_TOK_PARTIAL;
  766. end = ptr + n;
  767. }
  768. }
  769. switch (BYTE_TYPE(enc, ptr)) {
  770. case BT_LT:
  771. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  772. case BT_AMP:
  773. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  774. case BT_CR:
  775. ptr += MINBPC(enc);
  776. if (ptr == end)
  777. return XML_TOK_TRAILING_CR;
  778. if (BYTE_TYPE(enc, ptr) == BT_LF)
  779. ptr += MINBPC(enc);
  780. *nextTokPtr = ptr;
  781. return XML_TOK_DATA_NEWLINE;
  782. case BT_LF:
  783. *nextTokPtr = ptr + MINBPC(enc);
  784. return XML_TOK_DATA_NEWLINE;
  785. case BT_RSQB:
  786. ptr += MINBPC(enc);
  787. if (ptr == end)
  788. return XML_TOK_TRAILING_RSQB;
  789. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  790. break;
  791. ptr += MINBPC(enc);
  792. if (ptr == end)
  793. return XML_TOK_TRAILING_RSQB;
  794. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  795. ptr -= MINBPC(enc);
  796. break;
  797. }
  798. *nextTokPtr = ptr;
  799. return XML_TOK_INVALID;
  800. INVALID_CASES(ptr, nextTokPtr)
  801. default:
  802. ptr += MINBPC(enc);
  803. break;
  804. }
  805. while (ptr != end) {
  806. switch (BYTE_TYPE(enc, ptr)) {
  807. #define LEAD_CASE(n) \
  808. case BT_LEAD ## n: \
  809. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  810. *nextTokPtr = ptr; \
  811. return XML_TOK_DATA_CHARS; \
  812. } \
  813. ptr += n; \
  814. break;
  815. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  816. #undef LEAD_CASE
  817. case BT_RSQB:
  818. if (ptr + MINBPC(enc) != end) {
  819. if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  820. ptr += MINBPC(enc);
  821. break;
  822. }
  823. if (ptr + 2*MINBPC(enc) != end) {
  824. if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  825. ptr += MINBPC(enc);
  826. break;
  827. }
  828. *nextTokPtr = ptr + 2*MINBPC(enc);
  829. return XML_TOK_INVALID;
  830. }
  831. }
  832. /* fall through */
  833. case BT_AMP:
  834. case BT_LT:
  835. case BT_NONXML:
  836. case BT_MALFORM:
  837. case BT_TRAIL:
  838. case BT_CR:
  839. case BT_LF:
  840. *nextTokPtr = ptr;
  841. return XML_TOK_DATA_CHARS;
  842. default:
  843. ptr += MINBPC(enc);
  844. break;
  845. }
  846. }
  847. *nextTokPtr = ptr;
  848. return XML_TOK_DATA_CHARS;
  849. }
  850. /* ptr points to character following "%" */
  851. static int PTRCALL
  852. PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  853. const char **nextTokPtr)
  854. {
  855. if (ptr == end)
  856. return XML_TOK_PARTIAL;
  857. switch (BYTE_TYPE(enc, ptr)) {
  858. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  859. case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  860. *nextTokPtr = ptr;
  861. return XML_TOK_PERCENT;
  862. default:
  863. *nextTokPtr = ptr;
  864. return XML_TOK_INVALID;
  865. }
  866. while (ptr != end) {
  867. switch (BYTE_TYPE(enc, ptr)) {
  868. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  869. case BT_SEMI:
  870. *nextTokPtr = ptr + MINBPC(enc);
  871. return XML_TOK_PARAM_ENTITY_REF;
  872. default:
  873. *nextTokPtr = ptr;
  874. return XML_TOK_INVALID;
  875. }
  876. }
  877. return XML_TOK_PARTIAL;
  878. }
  879. static int PTRCALL
  880. PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  881. const char **nextTokPtr)
  882. {
  883. if (ptr == end)
  884. return XML_TOK_PARTIAL;
  885. switch (BYTE_TYPE(enc, ptr)) {
  886. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  887. default:
  888. *nextTokPtr = ptr;
  889. return XML_TOK_INVALID;
  890. }
  891. while (ptr != end) {
  892. switch (BYTE_TYPE(enc, ptr)) {
  893. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  894. case BT_CR: case BT_LF: case BT_S:
  895. case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  896. *nextTokPtr = ptr;
  897. return XML_TOK_POUND_NAME;
  898. default:
  899. *nextTokPtr = ptr;
  900. return XML_TOK_INVALID;
  901. }
  902. }
  903. return -XML_TOK_POUND_NAME;
  904. }
  905. static int PTRCALL
  906. PREFIX(scanLit)(int open, const ENCODING *enc,
  907. const char *ptr, const char *end,
  908. const char **nextTokPtr)
  909. {
  910. while (ptr != end) {
  911. int t = BYTE_TYPE(enc, ptr);
  912. switch (t) {
  913. INVALID_CASES(ptr, nextTokPtr)
  914. case BT_QUOT:
  915. case BT_APOS:
  916. ptr += MINBPC(enc);
  917. if (t != open)
  918. break;
  919. if (ptr == end)
  920. return -XML_TOK_LITERAL;
  921. *nextTokPtr = ptr;
  922. switch (BYTE_TYPE(enc, ptr)) {
  923. case BT_S: case BT_CR: case BT_LF:
  924. case BT_GT: case BT_PERCNT: case BT_LSQB:
  925. return XML_TOK_LITERAL;
  926. default:
  927. return XML_TOK_INVALID;
  928. }
  929. default:
  930. ptr += MINBPC(enc);
  931. break;
  932. }
  933. }
  934. return XML_TOK_PARTIAL;
  935. }
  936. static int PTRCALL
  937. PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  938. const char **nextTokPtr)
  939. {
  940. int tok;
  941. if (ptr == end)
  942. return XML_TOK_NONE;
  943. if (MINBPC(enc) > 1) {
  944. size_t n = end - ptr;
  945. if (n & (MINBPC(enc) - 1)) {
  946. n &= ~(MINBPC(enc) - 1);
  947. if (n == 0)
  948. return XML_TOK_PARTIAL;
  949. end = ptr + n;
  950. }
  951. }
  952. switch (BYTE_TYPE(enc, ptr)) {
  953. case BT_QUOT:
  954. return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  955. case BT_APOS:
  956. return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  957. case BT_LT:
  958. {
  959. ptr += MINBPC(enc);
  960. if (ptr == end)
  961. return XML_TOK_PARTIAL;
  962. switch (BYTE_TYPE(enc, ptr)) {
  963. case BT_EXCL:
  964. return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  965. case BT_QUEST:
  966. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  967. case BT_NMSTRT:
  968. case BT_HEX:
  969. case BT_NONASCII:
  970. case BT_LEAD2:
  971. case BT_LEAD3:
  972. case BT_LEAD4:
  973. *nextTokPtr = ptr - MINBPC(enc);
  974. return XML_TOK_INSTANCE_START;
  975. }
  976. *nextTokPtr = ptr;
  977. return XML_TOK_INVALID;
  978. }
  979. case BT_CR:
  980. if (ptr + MINBPC(enc) == end) {
  981. *nextTokPtr = end;
  982. /* indicate that this might be part of a CR/LF pair */
  983. return -XML_TOK_PROLOG_S;
  984. }
  985. /* fall through */
  986. case BT_S: case BT_LF:
  987. for (;;) {
  988. ptr += MINBPC(enc);
  989. if (ptr == end)
  990. break;
  991. switch (BYTE_TYPE(enc, ptr)) {
  992. case BT_S: case BT_LF:
  993. break;
  994. case BT_CR:
  995. /* don't split CR/LF pair */
  996. if (ptr + MINBPC(enc) != end)
  997. break;
  998. /* fall through */
  999. default:
  1000. *nextTokPtr = ptr;
  1001. return XML_TOK_PROLOG_S;
  1002. }
  1003. }
  1004. *nextTokPtr = ptr;
  1005. return XML_TOK_PROLOG_S;
  1006. case BT_PERCNT:
  1007. return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1008. case BT_COMMA:
  1009. *nextTokPtr = ptr + MINBPC(enc);
  1010. return XML_TOK_COMMA;
  1011. case BT_LSQB:
  1012. *nextTokPtr = ptr + MINBPC(enc);
  1013. return XML_TOK_OPEN_BRACKET;
  1014. case BT_RSQB:
  1015. ptr += MINBPC(enc);
  1016. if (ptr == end)
  1017. return -XML_TOK_CLOSE_BRACKET;
  1018. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1019. if (ptr + MINBPC(enc) == end)
  1020. return XML_TOK_PARTIAL;
  1021. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1022. *nextTokPtr = ptr + 2*MINBPC(enc);
  1023. return XML_TOK_COND_SECT_CLOSE;
  1024. }
  1025. }
  1026. *nextTokPtr = ptr;
  1027. return XML_TOK_CLOSE_BRACKET;
  1028. case BT_LPAR:
  1029. *nextTokPtr = ptr + MINBPC(enc);
  1030. return XML_TOK_OPEN_PAREN;
  1031. case BT_RPAR:
  1032. ptr += MINBPC(enc);
  1033. if (ptr == end)
  1034. return -XML_TOK_CLOSE_PAREN;
  1035. switch (BYTE_TYPE(enc, ptr)) {
  1036. case BT_AST:
  1037. *nextTokPtr = ptr + MINBPC(enc);
  1038. return XML_TOK_CLOSE_PAREN_ASTERISK;
  1039. case BT_QUEST:
  1040. *nextTokPtr = ptr + MINBPC(enc);
  1041. return XML_TOK_CLOSE_PAREN_QUESTION;
  1042. case BT_PLUS:
  1043. *nextTokPtr = ptr + MINBPC(enc);
  1044. return XML_TOK_CLOSE_PAREN_PLUS;
  1045. case BT_CR: case BT_LF: case BT_S:
  1046. case BT_GT: case BT_COMMA: case BT_VERBAR:
  1047. case BT_RPAR:
  1048. *nextTokPtr = ptr;
  1049. return XML_TOK_CLOSE_PAREN;
  1050. }
  1051. *nextTokPtr = ptr;
  1052. return XML_TOK_INVALID;
  1053. case BT_VERBAR:
  1054. *nextTokPtr = ptr + MINBPC(enc);
  1055. return XML_TOK_OR;
  1056. case BT_GT:
  1057. *nextTokPtr = ptr + MINBPC(enc);
  1058. return XML_TOK_DECL_CLOSE;
  1059. case BT_NUM:
  1060. return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1061. #define LEAD_CASE(n) \
  1062. case BT_LEAD ## n: \
  1063. if (end - ptr < n) \
  1064. return XML_TOK_PARTIAL_CHAR; \
  1065. if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1066. ptr += n; \
  1067. tok = XML_TOK_NAME; \
  1068. break; \
  1069. } \
  1070. if (IS_NAME_CHAR(enc, ptr, n)) { \
  1071. ptr += n; \
  1072. tok = XML_TOK_NMTOKEN; \
  1073. break; \
  1074. } \
  1075. *nextTokPtr = ptr; \
  1076. return XML_TOK_INVALID;
  1077. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1078. #undef LEAD_CASE
  1079. case BT_NMSTRT:
  1080. case BT_HEX:
  1081. tok = XML_TOK_NAME;
  1082. ptr += MINBPC(enc);
  1083. break;
  1084. case BT_DIGIT:
  1085. case BT_NAME:
  1086. case BT_MINUS:
  1087. #ifdef XML_NS
  1088. case BT_COLON:
  1089. #endif
  1090. tok = XML_TOK_NMTOKEN;
  1091. ptr += MINBPC(enc);
  1092. break;
  1093. case BT_NONASCII:
  1094. if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1095. ptr += MINBPC(enc);
  1096. tok = XML_TOK_NAME;
  1097. break;
  1098. }
  1099. if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1100. ptr += MINBPC(enc);
  1101. tok = XML_TOK_NMTOKEN;
  1102. break;
  1103. }
  1104. /* fall through */
  1105. default:
  1106. *nextTokPtr = ptr;
  1107. return XML_TOK_INVALID;
  1108. }
  1109. while (ptr != end) {
  1110. switch (BYTE_TYPE(enc, ptr)) {
  1111. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1112. case BT_GT: case BT_RPAR: case BT_COMMA:
  1113. case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
  1114. case BT_S: case BT_CR: case BT_LF:
  1115. *nextTokPtr = ptr;
  1116. return tok;
  1117. #ifdef XML_NS
  1118. case BT_COLON:
  1119. ptr += MINBPC(enc);
  1120. switch (tok) {
  1121. case XML_TOK_NAME:
  1122. if (ptr == end)
  1123. return XML_TOK_PARTIAL;
  1124. tok = XML_TOK_PREFIXED_NAME;
  1125. switch (BYTE_TYPE(enc, ptr)) {
  1126. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1127. default:
  1128. tok = XML_TOK_NMTOKEN;
  1129. break;
  1130. }
  1131. break;
  1132. case XML_TOK_PREFIXED_NAME:
  1133. tok = XML_TOK_NMTOKEN;
  1134. break;
  1135. }
  1136. break;
  1137. #endif
  1138. case BT_PLUS:
  1139. if (tok == XML_TOK_NMTOKEN) {
  1140. *nextTokPtr = ptr;
  1141. return XML_TOK_INVALID;
  1142. }
  1143. *nextTokPtr = ptr + MINBPC(enc);
  1144. return XML_TOK_NAME_PLUS;
  1145. case BT_AST:
  1146. if (tok == XML_TOK_NMTOKEN) {
  1147. *nextTokPtr = ptr;
  1148. return XML_TOK_INVALID;
  1149. }
  1150. *nextTokPtr = ptr + MINBPC(enc);
  1151. return XML_TOK_NAME_ASTERISK;
  1152. case BT_QUEST:
  1153. if (tok == XML_TOK_NMTOKEN) {
  1154. *nextTokPtr = ptr;
  1155. return XML_TOK_INVALID;
  1156. }
  1157. *nextTokPtr = ptr + MINBPC(enc);
  1158. return XML_TOK_NAME_QUESTION;
  1159. default:
  1160. *nextTokPtr = ptr;
  1161. return XML_TOK_INVALID;
  1162. }
  1163. }
  1164. return -tok;
  1165. }
  1166. static int PTRCALL
  1167. PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
  1168. const char *end, const char **nextTokPtr)
  1169. {
  1170. const char *start;
  1171. if (ptr == end)
  1172. return XML_TOK_NONE;
  1173. start = ptr;
  1174. while (ptr != end) {
  1175. switch (BYTE_TYPE(enc, ptr)) {
  1176. #define LEAD_CASE(n) \
  1177. case BT_LEAD ## n: ptr += n; break;
  1178. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1179. #undef LEAD_CASE
  1180. case BT_AMP:
  1181. if (ptr == start)
  1182. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1183. *nextTokPtr = ptr;
  1184. return XML_TOK_DATA_CHARS;
  1185. case BT_LT:
  1186. /* this is for inside entity references */
  1187. *nextTokPtr = ptr;
  1188. return XML_TOK_INVALID;
  1189. case BT_LF:
  1190. if (ptr == start) {
  1191. *nextTokPtr = ptr + MINBPC(enc);
  1192. return XML_TOK_DATA_NEWLINE;
  1193. }
  1194. *nextTokPtr = ptr;
  1195. return XML_TOK_DATA_CHARS;
  1196. case BT_CR:
  1197. if (ptr == start) {
  1198. ptr += MINBPC(enc);
  1199. if (ptr == end)
  1200. return XML_TOK_TRAILING_CR;
  1201. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1202. ptr += MINBPC(enc);
  1203. *nextTokPtr = ptr;
  1204. return XML_TOK_DATA_NEWLINE;
  1205. }
  1206. *nextTokPtr = ptr;
  1207. return XML_TOK_DATA_CHARS;
  1208. case BT_S:
  1209. if (ptr == start) {
  1210. *nextTokPtr = ptr + MINBPC(enc);
  1211. return XML_TOK_ATTRIBUTE_VALUE_S;
  1212. }
  1213. *nextTokPtr = ptr;
  1214. return XML_TOK_DATA_CHARS;
  1215. default:
  1216. ptr += MINBPC(enc);
  1217. break;
  1218. }
  1219. }
  1220. *nextTokPtr = ptr;
  1221. return XML_TOK_DATA_CHARS;
  1222. }
  1223. static int PTRCALL
  1224. PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
  1225. const char *end, const char **nextTokPtr)
  1226. {
  1227. const char *start;
  1228. if (ptr == end)
  1229. return XML_TOK_NONE;
  1230. start = ptr;
  1231. while (ptr != end) {
  1232. switch (BYTE_TYPE(enc, ptr)) {
  1233. #define LEAD_CASE(n) \
  1234. case BT_LEAD ## n: ptr += n; break;
  1235. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1236. #undef LEAD_CASE
  1237. case BT_AMP:
  1238. if (ptr == start)
  1239. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1240. *nextTokPtr = ptr;
  1241. return XML_TOK_DATA_CHARS;
  1242. case BT_PERCNT:
  1243. if (ptr == start) {
  1244. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1245. end, nextTokPtr);
  1246. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1247. }
  1248. *nextTokPtr = ptr;
  1249. return XML_TOK_DATA_CHARS;
  1250. case BT_LF:
  1251. if (ptr == start) {
  1252. *nextTokPtr = ptr + MINBPC(enc);
  1253. return XML_TOK_DATA_NEWLINE;
  1254. }
  1255. *nextTokPtr = ptr;
  1256. return XML_TOK_DATA_CHARS;
  1257. case BT_CR:
  1258. if (ptr == start) {
  1259. ptr += MINBPC(enc);
  1260. if (ptr == end)
  1261. return XML_TOK_TRAILING_CR;
  1262. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1263. ptr += MINBPC(enc);
  1264. *nextTokPtr = ptr;
  1265. return XML_TOK_DATA_NEWLINE;
  1266. }
  1267. *nextTokPtr = ptr;
  1268. return XML_TOK_DATA_CHARS;
  1269. default:
  1270. ptr += MINBPC(enc);
  1271. break;
  1272. }
  1273. }
  1274. *nextTokPtr = ptr;
  1275. return XML_TOK_DATA_CHARS;
  1276. }
  #ifdef XML_DTD
  /* Skip forward to the "]]>" that closes the current section, keeping
     track of nested "<![" openers.

     Returns XML_TOK_IGNORE_SECT with *nextTokPtr just past the closing
     "]]>" once a close is seen at nesting depth zero.  Returns
     XML_TOK_PARTIAL when the input runs out first.  The cases supplied by
     INVALID_CASES (defined outside this function) may return other codes.
     When MINBPC(enc) > 1, a trailing fragment shorter than one unit is
     ignored. */
  static int PTRCALL
  PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
                           const char *end, const char **nextTokPtr)
  {
    int depth = 0;

    if (MINBPC(enc) > 1) {
      size_t avail = end - ptr;
      if (avail & (MINBPC(enc) - 1)) {
        avail &= ~(MINBPC(enc) - 1);
        end = ptr + avail;
      }
    }

    while (ptr != end) {
      switch (BYTE_TYPE(enc, ptr)) {
      INVALID_CASES(ptr, nextTokPtr)
      case BT_LT:
        /* Look for "<![" which opens a nested section. */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (!CHAR_MATCHES(enc, ptr, ASCII_EXCL))
          break;
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
          ++depth;
          ptr += MINBPC(enc);
        }
        break;
      case BT_RSQB:
        /* Look for "]]>" which closes one level. */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
          break;
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        if (!CHAR_MATCHES(enc, ptr, ASCII_GT))
          break;
        ptr += MINBPC(enc);
        if (depth == 0) {
          *nextTokPtr = ptr;
          return XML_TOK_IGNORE_SECT;
        }
        --depth;
        break;
      default:
        ptr += MINBPC(enc);
        break;
      }
    }
    return XML_TOK_PARTIAL;
  }
  #endif /* XML_DTD */
  1329. static int PTRCALL
  1330. PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1331. const char **badPtr)
  1332. {
  1333. ptr += MINBPC(enc);
  1334. end -= MINBPC(enc);
  1335. for (; ptr != end; ptr += MINBPC(enc)) {
  1336. switch (BYTE_TYPE(enc, ptr)) {
  1337. case BT_DIGIT:
  1338. case BT_HEX:
  1339. case BT_MINUS:
  1340. case BT_APOS:
  1341. case BT_LPAR:
  1342. case BT_RPAR:
  1343. case BT_PLUS:
  1344. case BT_COMMA:
  1345. case BT_SOL:
  1346. case BT_EQUALS:
  1347. case BT_QUEST:
  1348. case BT_CR:
  1349. case BT_LF:
  1350. case BT_SEMI:
  1351. case BT_EXCL:
  1352. case BT_AST:
  1353. case BT_PERCNT:
  1354. case BT_NUM:
  1355. #ifdef XML_NS
  1356. case BT_COLON:
  1357. #endif
  1358. break;
  1359. case BT_S:
  1360. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1361. *badPtr = ptr;
  1362. return 0;
  1363. }
  1364. break;
  1365. case BT_NAME:
  1366. case BT_NMSTRT:
  1367. if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1368. break;
  1369. default:
  1370. switch (BYTE_TO_ASCII(enc, ptr)) {
  1371. case 0x24: /* $ */
  1372. case 0x40: /* @ */
  1373. break;
  1374. default:
  1375. *badPtr = ptr;
  1376. return 0;
  1377. }
  1378. break;
  1379. }
  1380. }
  1381. return 1;
  1382. }
  1383. /* This must only be called for a well-formed start-tag or empty
  1384. element tag. Returns the number of attributes. Pointers to the
  1385. first attsMax attributes are stored in atts.
  1386. */
  1387. static int PTRCALL
  1388. PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1389. int attsMax, ATTRIBUTE *atts)
  1390. {
  1391. enum { other, inName, inValue } state = inName;
  1392. int nAtts = 0;
  1393. int open = 0; /* defined when state == inValue;
  1394. initialization just to shut up compilers */
  1395. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1396. switch (BYTE_TYPE(enc, ptr)) {
  1397. #define START_NAME \
  1398. if (state == other) { \
  1399. if (nAtts < attsMax) { \
  1400. atts[nAtts].name = ptr; \
  1401. atts[nAtts].normalized = 1; \
  1402. } \
  1403. state = inName; \
  1404. }
  1405. #define LEAD_CASE(n) \
  1406. case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1407. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1408. #undef LEAD_CASE
  1409. case BT_NONASCII:
  1410. case BT_NMSTRT:
  1411. case BT_HEX:
  1412. START_NAME
  1413. break;
  1414. #undef START_NAME
  1415. case BT_QUOT:
  1416. if (state != inValue) {
  1417. if (nAtts < attsMax)
  1418. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1419. state = inValue;
  1420. open = BT_QUOT;
  1421. }
  1422. else if (open == BT_QUOT) {
  1423. state = other;
  1424. if (nAtts < attsMax)
  1425. atts[nAtts].valueEnd = ptr;
  1426. nAtts++;
  1427. }
  1428. break;
  1429. case BT_APOS:
  1430. if (state != inValue) {
  1431. if (nAtts < attsMax)
  1432. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1433. state = inValue;
  1434. open = BT_APOS;
  1435. }
  1436. else if (open == BT_APOS) {
  1437. state = other;
  1438. if (nAtts < attsMax)
  1439. atts[nAtts].valueEnd = ptr;
  1440. nAtts++;
  1441. }
  1442. break;
  1443. case BT_AMP:
  1444. if (nAtts < attsMax)
  1445. atts[nAtts].normalized = 0;
  1446. break;
  1447. case BT_S:
  1448. if (state == inName)
  1449. state = other;
  1450. else if (state == inValue
  1451. && nAtts < attsMax
  1452. && atts[nAtts].normalized
  1453. && (ptr == atts[nAtts].valuePtr
  1454. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1455. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1456. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1457. atts[nAtts].normalized = 0;
  1458. break;
  1459. case BT_CR: case BT_LF:
  1460. /* This case ensures that the first attribute name is counted
  1461. Apart from that we could just change state on the quote. */
  1462. if (state == inName)
  1463. state = other;
  1464. else if (state == inValue && nAtts < attsMax)
  1465. atts[nAtts].normalized = 0;
  1466. break;
  1467. case BT_GT:
  1468. case BT_SOL:
  1469. if (state != inValue)
  1470. return nAtts;
  1471. break;
  1472. default:
  1473. break;
  1474. }
  1475. }
  1476. /* not reached */
  1477. }
  1478. static int PTRFASTCALL
  1479. PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1480. {
  1481. int result = 0;
  1482. UNUSED(enc);
  1483. /* skip &# */
  1484. ptr += 2*MINBPC(enc);
  1485. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1486. for (ptr += MINBPC(enc);
  1487. !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
  1488. ptr += MINBPC(enc)) {
  1489. int c = BYTE_TO_ASCII(enc, ptr);
  1490. switch (c) {
  1491. case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1492. case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1493. result <<= 4;
  1494. result |= (c - ASCII_0);
  1495. break;
  1496. case ASCII_A: case ASCII_B: case ASCII_C:
  1497. case ASCII_D: case ASCII_E: case ASCII_F:
  1498. result <<= 4;
  1499. result += 10 + (c - ASCII_A);
  1500. break;
  1501. case ASCII_a: case ASCII_b: case ASCII_c:
  1502. case ASCII_d: case ASCII_e: case ASCII_f:
  1503. result <<= 4;
  1504. result += 10 + (c - ASCII_a);
  1505. break;
  1506. }
  1507. if (result >= 0x110000)
  1508. return -1;
  1509. }
  1510. }
  1511. else {
  1512. for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1513. int c = BYTE_TO_ASCII(enc, ptr);
  1514. result *= 10;
  1515. result += (c - ASCII_0);
  1516. if (result >= 0x110000)
  1517. return -1;
  1518. }
  1519. }
  1520. return checkCharRefNumber(result);
  1521. }
  1522. static int PTRCALL
  1523. PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
  1524. const char *end)
  1525. {
  1526. UNUSED(enc);
  1527. switch ((end - ptr)/MINBPC(enc)) {
  1528. case 2:
  1529. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1530. switch (BYTE_TO_ASCII(enc, ptr)) {
  1531. case ASCII_l:
  1532. return ASCII_LT;
  1533. case ASCII_g:
  1534. return ASCII_GT;
  1535. }
  1536. }
  1537. break;
  1538. case 3:
  1539. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1540. ptr += MINBPC(enc);
  1541. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1542. ptr += MINBPC(enc);
  1543. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1544. return ASCII_AMP;
  1545. }
  1546. }
  1547. break;
  1548. case 4:
  1549. switch (BYTE_TO_ASCII(enc, ptr)) {
  1550. case ASCII_q:
  1551. ptr += MINBPC(enc);
  1552. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1553. ptr += MINBPC(enc);
  1554. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1555. ptr += MINBPC(enc);
  1556. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1557. return ASCII_QUOT;
  1558. }
  1559. }
  1560. break;
  1561. case ASCII_a:
  1562. ptr += MINBPC(enc);
  1563. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1564. ptr += MINBPC(enc);
  1565. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1566. ptr += MINBPC(enc);
  1567. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1568. return ASCII_APOS;
  1569. }
  1570. }
  1571. break;
  1572. }
  1573. }
  1574. return 0;
  1575. }
  1576. static int PTRCALL
  1577. PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1578. {
  1579. for (;;) {
  1580. switch (BYTE_TYPE(enc, ptr1)) {
  1581. #define LEAD_CASE(n) \
  1582. case BT_LEAD ## n: \
  1583. if (*ptr1++ != *ptr2++) \
  1584. return 0;
  1585. LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1586. #undef LEAD_CASE
  1587. /* fall through */
  1588. if (*ptr1++ != *ptr2++)
  1589. return 0;
  1590. break;
  1591. case BT_NONASCII:
  1592. case BT_NMSTRT:
  1593. #ifdef XML_NS
  1594. case BT_COLON:
  1595. #endif
  1596. case BT_HEX:
  1597. case BT_DIGIT:
  1598. case BT_NAME:
  1599. case BT_MINUS:
  1600. if (*ptr2++ != *ptr1++)
  1601. return 0;
  1602. if (MINBPC(enc) > 1) {
  1603. if (*ptr2++ != *ptr1++)
  1604. return 0;
  1605. if (MINBPC(enc) > 2) {
  1606. if (*ptr2++ != *ptr1++)
  1607. return 0;
  1608. if (MINBPC(enc) > 3) {
  1609. if (*ptr2++ != *ptr1++)
  1610. return 0;
  1611. }
  1612. }
  1613. }
  1614. break;
  1615. default:
  1616. if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1617. return 1;
  1618. switch (BYTE_TYPE(enc, ptr2)) {
  1619. case BT_LEAD2:
  1620. case BT_LEAD3:
  1621. case BT_LEAD4:
  1622. case BT_NONASCII:
  1623. case BT_NMSTRT:
  1624. #ifdef XML_NS
  1625. case BT_COLON:
  1626. #endif
  1627. case BT_HEX:
  1628. case BT_DIGIT:
  1629. case BT_NAME:
  1630. case BT_MINUS:
  1631. return 0;
  1632. default:
  1633. return 1;
  1634. }
  1635. }
  1636. }
  1637. /* not reached */
  1638. }
  1639. static int PTRCALL
  1640. PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1641. const char *end1, const char *ptr2)
  1642. {
  1643. UNUSED(enc);
  1644. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1645. if (ptr1 == end1)
  1646. return 0;
  1647. if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1648. return 0;
  1649. }
  1650. return ptr1 == end1;
  1651. }
  1652. static int PTRFASTCALL
  1653. PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1654. {
  1655. const char *start = ptr;
  1656. for (;;) {
  1657. switch (BYTE_TYPE(enc, ptr)) {
  1658. #define LEAD_CASE(n) \
  1659. case BT_LEAD ## n: ptr += n; break;
  1660. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1661. #undef LEAD_CASE
  1662. case BT_NONASCII:
  1663. case BT_NMSTRT:
  1664. #ifdef XML_NS
  1665. case BT_COLON:
  1666. #endif
  1667. case BT_HEX:
  1668. case BT_DIGIT:
  1669. case BT_NAME:
  1670. case BT_MINUS:
  1671. ptr += MINBPC(enc);
  1672. break;
  1673. default:
  1674. return (int)(ptr - start);
  1675. }
  1676. }
  1677. }
  1678. static const char * PTRFASTCALL
  1679. PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1680. {
  1681. for (;;) {
  1682. switch (BYTE_TYPE(enc, ptr)) {
  1683. case BT_LF:
  1684. case BT_CR:
  1685. case BT_S:
  1686. ptr += MINBPC(enc);
  1687. break;
  1688. default:
  1689. return ptr;
  1690. }
  1691. }
  1692. }
  1693. static void PTRCALL
  1694. PREFIX(updatePosition)(const ENCODING *enc,
  1695. const char *ptr,
  1696. const char *end,
  1697. POSITION *pos)
  1698. {
  1699. while (ptr < end) {
  1700. switch (BYTE_TYPE(enc, ptr)) {
  1701. #define LEAD_CASE(n) \
  1702. case BT_LEAD ## n: \
  1703. ptr += n; \
  1704. break;
  1705. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1706. #undef LEAD_CASE
  1707. case BT_LF:
  1708. pos->columnNumber = (XML_Size)-1;
  1709. pos->lineNumber++;
  1710. ptr += MINBPC(enc);
  1711. break;
  1712. case BT_CR:
  1713. pos->lineNumber++;
  1714. ptr += MINBPC(enc);
  1715. if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1716. ptr += MINBPC(enc);
  1717. pos->columnNumber = (XML_Size)-1;
  1718. break;
  1719. default:
  1720. ptr += MINBPC(enc);
  1721. break;
  1722. }
  1723. pos->columnNumber++;
  1724. }
  1725. }
  1726. #undef DO_LEAD_CASE
  1727. #undef MULTIBYTE_CASES
  1728. #undef INVALID_CASES
  1729. #undef CHECK_NAME_CASE
  1730. #undef CHECK_NAME_CASES
  1731. #undef CHECK_NMSTRT_CASE
  1732. #undef CHECK_NMSTRT_CASES
  1733. #endif /* XML_TOK_IMPL_C */