parser 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. // file : xml/parser -*- C++ -*-
  2. // copyright : Copyright (c) 2013-2014 Code Synthesis Tools CC
  3. // license : MIT; see accompanying LICENSE file
  4. #ifndef XML_PARSER
  5. #define XML_PARSER
  6. #include <xml/details/pre.hxx>
  7. #include <map>
  8. #include <vector>
  9. #include <string>
  10. #include <iosfwd>
  11. #include <cstddef> // std::size_t
  12. #include <xml/details/config.hxx> // LIBSTUDXML_EXTERNAL_EXPAT
  13. #ifndef LIBSTUDXML_EXTERNAL_EXPAT
  14. # include <xml/details/expat/expat.h>
  15. #else
  16. # include <expat.h>
  17. #endif
  18. // We only support UTF-8 Expat.
  19. //
  20. #ifdef XML_UNICODE
  21. # error UTF-16 expat (XML_UNICODE defined) is not supported
  22. #endif
  23. #include <xml/forward>
  24. #include <xml/qname>
  25. #include <xml/content>
  26. #include <xml/exception>
  27. #include <xml/details/export.hxx>
  28. namespace xml
  29. {
  30. class parser;
  31. struct LIBSTUDXML_EXPORT parsing: exception
  32. {
  33. virtual
  34. ~parsing () throw ();
  35. parsing (const std::string& name,
  36. unsigned long long line,
  37. unsigned long long column,
  38. const std::string& description);
  39. parsing (const parser&, const std::string& description);
  40. const std::string&
  41. name () const {return name_;}
  42. unsigned long long
  43. line () const {return line_;}
  44. unsigned long long
  45. column () const {return column_;}
  46. const std::string&
  47. description () const {return description_;}
  48. virtual const char*
  49. what () const throw ();
  50. private:
  51. void
  52. init ();
  53. private:
  54. std::string name_;
  55. unsigned long long line_;
  56. unsigned long long column_;
  57. std::string description_;
  58. std::string what_;
  59. };
  60. class LIBSTUDXML_EXPORT parser
  61. {
  62. public:
  63. typedef xml::qname qname_type;
  64. typedef xml::content content_type;
  65. typedef unsigned short feature_type;
  66. // If both receive_attributes_event and receive_attributes_map are
  67. // specified, then receive_attributes_event is assumed.
  68. //
  69. static const feature_type receive_elements = 0x0001;
  70. static const feature_type receive_characters = 0x0002;
  71. static const feature_type receive_attributes_map = 0x0004;
  72. static const feature_type receive_attributes_event = 0x0008;
  73. static const feature_type receive_namespace_decls = 0x0010;
  74. static const feature_type receive_default = receive_elements |
  75. receive_characters |
  76. receive_attributes_map;
  77. // Parse std::istream. Input name is used in diagnostics to identify
  78. // the document being parsed.
  79. //
  80. // If stream exceptions are enabled then std::ios_base::failure
  81. // exception is used to report io errors (badbit and failbit).
  82. // Otherwise, those are reported as the parsing exception.
  83. //
  84. parser (std::istream&,
  85. const std::string& input_name,
  86. feature_type = receive_default);
  87. // Parse memory buffer that contains the whole document. Input name
  88. // is used in diagnostics to identify the document being parsed.
  89. //
  90. parser (const void* data,
  91. std::size_t size,
  92. const std::string& input_name,
  93. feature_type = receive_default);
  94. const std::string&
  95. input_name () const {return iname_;}
  96. ~parser ();
  97. private:
  98. parser (const parser&);
  99. parser& operator= (const parser&);
  100. // Parsing events.
  101. //
  102. public:
  103. enum event_type
  104. {
  105. // If adding new events, also update the stream insertion operator.
  106. //
  107. start_element,
  108. end_element,
  109. start_attribute,
  110. end_attribute,
  111. characters,
  112. start_namespace_decl,
  113. end_namespace_decl,
  114. eof
  115. };
  116. event_type
  117. next ();
  118. // Get the next event and make sure that it's what's expected. If it
  119. // is not, then throw an appropriate parsing exception.
  120. //
  121. void
  122. next_expect (event_type);
  123. void
  124. next_expect (event_type, const std::string& name);
  125. void
  126. next_expect (event_type, const qname_type& qname);
  127. void
  128. next_expect (event_type, const std::string& ns, const std::string& name);
  129. event_type
  130. peek ();
  131. // Return the even that was last returned by the call to next() or
  132. // peek().
  133. //
  134. event_type
  135. event () {return event_;}
  136. // Event data.
  137. //
  138. public:
  139. const qname_type& qname () const {return *pqname_;}
  140. const std::string& namespace_ () const {return pqname_->namespace_ ();}
  141. const std::string& name () const {return pqname_->name ();}
  142. const std::string& prefix () const {return pqname_->prefix ();}
  143. std::string& value () {return *pvalue_;}
  144. const std::string& value () const {return *pvalue_;}
  145. template <typename T> T value () const;
  146. unsigned long long line () const {return line_;}
  147. unsigned long long column () const {return column_;}
  148. // Attribute map lookup. If attribute is not found, then the version
  149. // without the default value throws an appropriate parsing exception
  150. // while the version with the default value returns that value.
  151. //
  152. // Note also that there is no attribute(ns,name) version since it
  153. // would conflict with attribute(name,dv) (qualified attributes
  154. // are not very common).
  155. //
  156. // Attribute map is valid throughout at the "element level" until
  157. // end_element and not just during start_element. As a special case,
  158. // the map is still valid after peek() that returned end_element until
  159. // this end_element event is retrieved with next().
  160. //
  161. const std::string&
  162. attribute (const std::string& name) const;
  163. template <typename T>
  164. T
  165. attribute (const std::string& name) const;
  166. std::string
  167. attribute (const std::string& name,
  168. const std::string& default_value) const;
  169. template <typename T>
  170. T
  171. attribute (const std::string& name, const T& default_value) const;
  172. const std::string&
  173. attribute (const qname_type& qname) const;
  174. template <typename T>
  175. T
  176. attribute (const qname_type& qname) const;
  177. std::string
  178. attribute (const qname_type& qname,
  179. const std::string& default_value) const;
  180. template <typename T>
  181. T
  182. attribute (const qname_type& qname, const T& default_value) const;
  183. bool
  184. attribute_present (const std::string& name) const;
  185. bool
  186. attribute_present (const qname_type& qname) const;
  187. // Low-level attribute map access. Note that this API assumes
  188. // all attributes are handled.
  189. //
  190. struct attribute_value_type
  191. {
  192. std::string value;
  193. mutable bool handled;
  194. };
  195. typedef std::map<qname_type, attribute_value_type> attribute_map_type;
  196. const attribute_map_type&
  197. attribute_map () const;
  198. // Optional content processing.
  199. //
  200. public:
  201. // Note that you cannot get/set content while peeking.
  202. //
  203. void
  204. content (content_type);
  205. content_type
  206. content () const;
  207. // Versions that also set the content. Event type must be start_element.
  208. //
  209. void
  210. next_expect (event_type, const std::string& name, content_type);
  211. void
  212. next_expect (event_type, const qname_type& qname, content_type);
  213. void
  214. next_expect (event_type,
  215. const std::string& ns, const std::string& name,
  216. content_type);
  217. // Helpers for parsing elements with simple content. The first two
  218. // functions assume that start_element has already been parsed. The
  219. // rest parse the complete element, from start to end.
  220. //
  221. // Note also that as with attribute(), there is no (namespace,name)
  222. // overload since it would conflicts with (namespace,default_value).
  223. //
  224. public:
  225. std::string
  226. element ();
  227. template <typename T>
  228. T
  229. element ();
  230. std::string
  231. element (const std::string& name);
  232. std::string
  233. element (const qname_type& qname);
  234. template <typename T>
  235. T
  236. element (const std::string& name);
  237. template <typename T>
  238. T
  239. element (const qname_type& qname);
  240. std::string
  241. element (const std::string& name, const std::string& default_value);
  242. std::string
  243. element (const qname_type& qname, const std::string& default_value);
  244. template <typename T>
  245. T
  246. element (const std::string& name, const T& default_value);
  247. template <typename T>
  248. T
  249. element (const qname_type& qname, const T& default_value);
  250. // C++11 range-based for support. Generally, the iterator interface
  251. // doesn't make much sense for the parser so for now we have an
  252. // implementation that is just enough to the range-based for.
  253. //
  254. public:
  255. struct iterator
  256. {
  257. typedef event_type value_type;
  258. iterator (parser* p = 0, event_type e = eof): p_ (p), e_ (e) {}
  259. value_type operator* () const {return e_;}
  260. iterator& operator++ () {e_ = p_->next (); return *this;}
  261. // Comparison only makes sense when comparing to end (eof).
  262. //
  263. bool operator== (iterator y) const {return e_ == eof && y.e_ == eof;}
  264. bool operator!= (iterator y) const {return !(*this == y);}
  265. private:
  266. parser* p_;
  267. event_type e_;
  268. };
  269. iterator begin () {return iterator (this, next ());}
  270. iterator end () {return iterator (this, eof);}
  271. private:
  272. static void XMLCALL
  273. start_element_ (void*, const XML_Char*, const XML_Char**);
  274. static void XMLCALL
  275. end_element_ (void*, const XML_Char*);
  276. static void XMLCALL
  277. characters_ (void*, const XML_Char*, int);
  278. static void XMLCALL
  279. start_namespace_decl_ (void*, const XML_Char*, const XML_Char*);
  280. static void XMLCALL
  281. end_namespace_decl_ (void*, const XML_Char*);
  282. private:
  283. void
  284. init ();
  285. event_type
  286. next_ (bool peek);
  287. event_type
  288. next_body ();
  289. void
  290. handle_error ();
  291. private:
  292. // If size_ is 0, then data is std::istream. Otherwise, it is a buffer.
  293. //
  294. union
  295. {
  296. std::istream* is;
  297. const void* buf;
  298. } data_;
  299. std::size_t size_;
  300. const std::string iname_;
  301. feature_type feature_;
  302. XML_Parser p_;
  303. std::size_t depth_;
  304. bool accumulate_; // Whether we are accumulating character content.
  305. enum {state_next, state_peek} state_;
  306. event_type event_;
  307. event_type queue_;
  308. qname_type qname_;
  309. std::string value_;
  310. // These are used to avoid copying when we are handling attributes
  311. // and namespace decls.
  312. //
  313. const qname_type* pqname_;
  314. std::string* pvalue_;
  315. unsigned long long line_;
  316. unsigned long long column_;
  317. // Attributes as events.
  318. //
  319. struct attribute_type
  320. {
  321. qname_type qname;
  322. std::string value;
  323. };
  324. typedef std::vector<attribute_type> attributes;
  325. attributes attr_;
  326. attributes::size_type attr_i_; // Index of the current attribute.
  327. // Namespace declarations.
  328. //
  329. typedef std::vector<qname_type> namespace_decls;
  330. namespace_decls start_ns_;
  331. namespace_decls::size_type start_ns_i_; // Index of the current decl.
  332. namespace_decls end_ns_;
  333. namespace_decls::size_type end_ns_i_; // Index of the current decl.
  334. // Element state consisting of the content model and attribute map.
  335. //
  336. struct element_entry
  337. {
  338. element_entry (std::size_t d, content_type c = content_type::mixed)
  339. : depth (d), content (c), attr_unhandled_ (0) {}
  340. std::size_t depth;
  341. content_type content;
  342. attribute_map_type attr_map_;
  343. mutable attribute_map_type::size_type attr_unhandled_;
  344. };
  345. typedef std::vector<element_entry> element_state;
  346. std::vector<element_entry> element_state_;
  347. // Empty attribute map to return when an element has no attributes.
  348. //
  349. const attribute_map_type empty_attr_map_;
  350. // Return the element entry corresponding to the current depth, if
  351. // exists, and NULL otherwise.
  352. //
  353. const element_entry*
  354. get_element () const;
  355. const element_entry*
  356. get_element_ () const;
  357. void
  358. pop_element ();
  359. };
  360. LIBSTUDXML_EXPORT
  361. std::ostream&
  362. operator<< (std::ostream&, parser::event_type);
  363. }
  364. #include <xml/parser.ixx>
  365. #include <xml/parser.txx>
  366. #include <xml/details/post.hxx>
  367. #endif // XML_PARSER