intro.xhtml 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765
  1. <?xml version="1.0" encoding="iso-8859-1"?>
  2. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  3. <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
  4. <head>
  5. <title>XML Parsing and Serialization in C++ with libstudxml</title>
  6. <meta name="copyright" content="&copy; 2013-2014 Code Synthesis Tools CC"/>
  7. <meta name="keywords" content="xml,c++,parsing,serialization,api,streaming,persistence"/>
  8. <meta name="description" content="XML Parsing and Serialization in C++ with libstudxml"/>
  9. <meta name="revision" content="1.0"/>
  10. <meta name="version" content="1.0.0"/>
  11. <link rel="stylesheet" type="text/css" href="default.css" />
  12. <style type="text/css">
  13. pre {
  14. padding : 0 0 0 0em;
  15. margin : 0em 0em 0em 0;
  16. font-size : 102%
  17. }
  18. body {
  19. min-width: 48em;
  20. }
  21. h1 {
  22. font-weight: bold;
  23. font-size: 200%;
  24. line-height: 1.2em;
  25. }
  26. h2 {
  27. font-weight : bold;
  28. font-size : 150%;
  29. padding-top : 0.8em;
  30. }
  31. h3 {
  32. font-size : 140%;
  33. padding-top : 0.8em;
  34. }
  35. /* Force page break for both PDF and HTML (when printing). */
  36. hr.page-break {
  37. height: 0;
  38. width: 0;
  39. border: 0;
  40. visibility: hidden;
  41. page-break-after: always;
  42. }
  43. /* Adjust indentation for three levels. */
  44. #container {
  45. max-width: 48em;
  46. }
  47. #content {
  48. padding: 0 0.1em 0 4em;
  49. /*background-color: red;*/
  50. }
  51. #content h1 {
  52. margin-left: -2.06em;
  53. }
  54. #content h2 {
  55. margin-left: -1.33em;
  56. }
  57. /* Title page */
  58. #titlepage {
  59. padding: 2em 0 1em 0;
  60. border-bottom: 1px solid black;
  61. }
  62. #titlepage .title {
  63. font-weight: bold;
  64. font-size: 200%;
  65. text-align: center;
  66. padding: 1em 0 2em 0;
  67. }
  68. #titlepage #first-title {
  69. padding: 1em 0 0.4em 0;
  70. }
  71. #titlepage #second-title {
  72. padding: 0.4em 0 2em 0;
  73. }
  74. #titlepage p {
  75. padding-bottom: 1em;
  76. }
  77. #titlepage #revision {
  78. padding-bottom: 0em;
  79. }
  80. /* Lists */
  81. ul.list li, ol.list li {
  82. padding-top : 0.3em;
  83. padding-bottom : 0.3em;
  84. }
  85. div.img {
  86. text-align: center;
  87. padding: 2em 0 2em 0;
  88. }
  89. /* */
  90. dl dt {
  91. padding : 0.8em 0 0 0;
  92. }
  93. /* TOC */
  94. table.toc {
  95. border-style : none;
  96. border-collapse : separate;
  97. border-spacing : 0;
  98. margin : 0.2em 0 0.2em 0;
  99. padding : 0 0 0 0;
  100. }
  101. table.toc tr {
  102. padding : 0 0 0 0;
  103. margin : 0 0 0 0;
  104. }
  105. table.toc * td, table.toc * th {
  106. border-style : none;
  107. margin : 0 0 0 0;
  108. vertical-align : top;
  109. }
  110. table.toc * th {
  111. font-weight : normal;
  112. padding : 0em 0.1em 0em 0;
  113. text-align : left;
  114. white-space : nowrap;
  115. }
  116. table.toc * table.toc th {
  117. padding-left : 1em;
  118. }
  119. table.toc * td {
  120. padding : 0em 0 0em 0.7em;
  121. text-align : left;
  122. }
  123. </style>
  124. </head>
  125. <body>
  126. <div id="container">
  127. <div id="content">
  128. <div class="noprint">
  129. <div id="titlepage">
  130. <div class="title" id="first-title">XML Parsing and Serialization in C++</div>
  131. <div class="title" id="second-title">With <code>libstudxml</code></div>
  132. <p>Copyright &copy; 2013-2014 Code Synthesis Tools CC. Permission is
  133. granted to copy, distribute and/or modify this document under the
  134. terms of the MIT license.</p>
  135. <!-- REMEMBER TO CHANGE VERSIONS IN THE META TAGS ABOVE! -->
  136. <p id="revision">Revision 1.0, May 2014</p>
  137. <p>This revision of the document describes <code>libstudxml</code> 1.0.0.</p>
  138. </div>
  139. <hr class="page-break"/>
  140. <h1>Table of Contents</h1>
  141. <table class="toc">
  142. <tr>
  143. <th></th><td><a href="#0">About This Document</a></td>
  144. </tr>
  145. <tr>
  146. <th>1</th><td><a href="#1">Terminology</a></td>
  147. </tr>
  148. <tr>
  149. <th>2</th><td><a href="#2">Low-Level API</a></td>
  150. </tr>
  151. <tr>
  152. <th>3</th><td><a href="#3">High-Level API</a></td>
  153. </tr>
  154. <tr>
  155. <th>4</th><td><a href="#4">Object Persistence</a></td>
  156. </tr>
  157. <tr>
  158. <th>5</th><td><a href="#5">Inheritance</a></td>
  159. </tr>
  160. <tr>
  161. <th>6</th><td><a href="#6">Implementation Notes</a></td>
  162. </tr>
  163. </table>
  164. </div>
  165. <hr class="page-break"/>
  166. <h1><a name="0">About This Document</a></h1>
  167. <p>This document is based on the presentation given by Boris Kolpackov at
  168. the C++Now 2014 conference where <code>libstudxml</code> was
  169. first made publicly available. Its goal is to introduce a new,
  170. modern C++ API for XML by showing how to handle the most common
  171. use cases. Compared to the talk, this introduction omits some of
  172. the discussion relevant to XML in general and its handling
  173. in C++. It also provides more complete code examples that would not
  174. fit onto slides during the presentation. If, however, you would
  175. like to get a more complete picture of the "state of XML in C++", then
  176. you may prefer to first
  177. <a href="http://youtu.be/AuamDUrG5ZU?list=UU5e__RG9K3cHrPotPABnrwg">watch
  178. the video</a> of the talk.</p>
  179. <p>While this document uses some C++11 features in the examples, the
  180. library itself can be used in C++98 applications as well.</p>
  181. <h1><a name="1">Terminology</a></h1>
  182. <p>Before we begin, let's define a few terms to make sure we are on
  183. the same page.</p>
  184. <p>When we say "XML format" that is a bit loose. XML is actually
  185. a meta-format that we specialize for our needs. That is, we decide
  186. what element and attribute names we will use, which elements will
  187. be valid where, what they will mean, and so on. This specialization
  188. of XML to a specific format is called an <em>XML Vocabulary</em>.</p>
  189. <p>Often, but not always, when we parse XML, we store extracted data
  190. in the application's memory. Usually, we would create classes
  191. specific to our XML vocabulary. For example, if we have an element
  192. called <code>person</code> then we may create a C++ class also
  193. called <code>person</code>. We will call such classes an
  194. <em>Object Model</em>.</p>
  195. <p>The content of an element in XML can be empty, text, nested
  196. elements, or a mixture of the two:</p>
  197. <pre class="xml">
  198. &lt;empty name="a" id="1"/>
  199. &lt;simple name="b" id="2">text&lt;/simple>
  200. &lt;complex name="c" id="3">
  201. &lt;nested>...&lt;/nested>
  202. &lt;nested>...&lt;/nested>
  203. &lt;/complex>
  204. &lt;mixed name="d" id="4">
  205. te&lt;nested>...&lt;/nested>
  206. x
  207. &lt;nested>...&lt;/nested>t
  208. &lt;/mixed>
  209. </pre>
  210. <p>These are called the <em>empty</em>, <em>simple</em>,
  211. <em>complex</em>, and <em>mixed</em> content models,
  212. respectively.</p>
  213. <h1><a name="2">Low-Level API</a></h1>
  214. <p><code>libstudxml</code> provides the streaming XML pull parser and
  215. streaming XML serializer. The parser is a conforming, non-validating
  216. XML 1.0 implementation (see <a href="#6">Implementation Notes</a>
  217. for details). The application character encoding (that is, the
  218. encoding used in the application's memory) for both parser and
  219. serializer is UTF-8. The output encoding of the serializer is
  220. UTF-8 as well. The parser supports UTF-8, UTF-16, ISO-8859-1,
  221. and US-ASCII input encodings.</p>
  222. <pre class="c++">
  223. #include &lt;xml/parser>
  224. namespace xml
  225. {
  226. class parser;
  227. }
  228. </pre>
  229. <pre class="c++">
  230. #include &lt;xml/serializer>
  231. namespace xml
  232. {
  233. class serializer;
  234. }
  235. </pre>
  236. <p>C++ is often used to implement XML converters and filters, especially
  237. where speed is a concern. Such applications require the lowest-level
  238. API with minimum overhead. So we will start there (see the
  239. <code>roundtrip</code> example in the <code>libstudxml</code>
  240. distribution).</p>
  241. <pre class="c++">
  242. class parser
  243. {
  244. typedef unsigned short feature_type;
  245. static const feature_type receive_elements;
  246. static const feature_type receive_characters;
  247. static const feature_type receive_attributes;
  248. static const feature_type receive_namespace_decls;
  249. static const feature_type receive_default =
  250. receive_elements |
  251. receive_characters |
  252. receive_attributes;
  253. parser (std::istream&amp;,
  254. const std::string&amp; input_name,
  255. feature_type = receive_default);
  256. ...
  257. };
  258. </pre>
  259. <p>The parser constructor takes three arguments: the stream to parse,
  260. the input name that is used in diagnostics to identify the document
  261. being parsed, and the list of events we want the parser to report.</p>
  262. <p>As an example of an XML filter, let's write one that removes a
  263. specific attribute from the document, say <code>id</code>. The
  264. first step in our filter would then be to create the parser
  265. instance:</p>
  266. <pre class="c++">
  267. int main (int argc, char* argv[])
  268. {
  269. ...
  270. try
  271. {
  272. using namespace xml;
  273. ifstream ifs (argv[1]);
  274. parser p (ifs, argv[1]);
  275. ...
  276. }
  277. catch (const xml::parsing&amp; e)
  278. {
  279. cerr &lt;&lt; e.what () &lt;&lt; endl;
  280. return 1;
  281. }
  282. }
  283. </pre>
  284. <p>Here we also see how to handle parsing errors. So far so good.
  285. Let's see the next piece of the API.</p>
  286. <pre class="c++">
  287. class parser
  288. {
  289. enum event_type
  290. {
  291. start_element,
  292. end_element,
  293. start_attribute,
  294. end_attribute,
  295. characters,
  296. start_namespace_decl,
  297. end_namespace_decl,
  298. eof
  299. };
  300. event_type next ();
  301. };
  302. </pre>
  303. <p>We call the <code>next()</code> function when we are ready to handle
  304. the next piece of XML. And now we can implement our filter a bit
  305. further:</p>
  306. <pre class="c++">
  307. parser p (ifs, argv[1]);
  308. for (parser::event_type e (p.next ());
  309. e != parser::eof;
  310. e = p.next ())
  311. {
  312. switch (e)
  313. {
  314. case parser::start_element:
  315. ...
  316. case parser::end_element:
  317. ...
  318. case parser::start_attribute:
  319. ...
  320. case parser::end_attribute:
  321. ...
  322. case parser::characters:
  323. ...
  324. }
  325. }
  326. </pre>
  327. <p>In C++11 we can use the range-based <code>for</code> loop to tidy
  328. things up a bit:</p>
  329. <pre class="c++">
  330. parser p (ifs, argv[1]);
  331. for (parser::event_type e: p)
  332. {
  333. switch (e)
  334. {
  335. ...
  336. }
  337. }
  338. </pre>
  339. <p>The next piece of the API puzzle:</p>
  340. <pre class="c++">
  341. class parser
  342. {
  343. const std::string&amp; name () const;
  344. const std::string&amp; value () const;
  345. unsigned long long line () const;
  346. unsigned long long column () const;
  347. };
  348. </pre>
  349. <p>The <code>name()</code> accessor returns the name of the current element
  350. or attribute. The <code>value()</code> function returns the text of the
  351. characters event for an element or attribute. The <code>line()</code> and
  352. <code>column()</code> accessors return the current position in the document.
  353. Here is how we could print all the element positions for debugging:</p>
  354. <pre class="c++">
  355. switch (e)
  356. {
  357. case parser::start_element:
  358. cerr &lt;&lt; p.line () &lt;&lt; ':' &lt;&lt; p.column () &lt;&lt; ": start "
  359. &lt;&lt; p.name () &lt;&lt; endl;
  360. break;
  361. case parser::end_element:
  362. cerr &lt;&lt; p.line () &lt;&lt; ':' &lt;&lt; p.column () &lt;&lt; ": end "
  363. &lt;&lt; p.name () &lt;&lt; endl;
  364. break;
  365. }
  366. </pre>
  367. <p>We have now seen enough of the parsing side to complete our filter.
  368. What's missing is the serialization. So let's switch to that for a
  369. moment:</p>
  370. <pre class="c++">
  371. class serializer
  372. {
  373. serializer (std::ostream&amp;,
  374. const std::string&amp; output_name,
  375. unsigned short indentation = 2);
  376. ...
  377. };
  378. </pre>
  379. <p>The constructor is pretty similar to the <code>parser</code>'s. The
  380. <code>indentation</code> argument specifies the number of indentation
  381. spaces that should be used for pretty-printing. We can disable it by
  382. passing <code>0</code>.</p>
  383. <p>Now we can create the serializer instance for our filter:</p>
  384. <pre class="c++">
  385. int main (int argc, char* argv[])
  386. {
  387. ...
  388. try
  389. {
  390. using namespace xml;
  391. ifstream ifs (argv[1]);
  392. parser p (ifs, argv[1]);
  393. serializer s (cout, "output", 0);
  394. ...
  395. }
  396. catch (const xml::parsing&amp; e)
  397. {
  398. cerr &lt;&lt; e.what () &lt;&lt; endl;
  399. return 1;
  400. }
  401. catch (const xml::serialization&amp; e)
  402. {
  403. cerr &lt;&lt; e.what () &lt;&lt; endl;
  404. return 1;
  405. }
  406. }
  407. </pre>
  408. <p>Notice that we have also added an exception handler for the
  409. <code>serialization</code> exception. Instead of handling
  410. the <code>parsing</code> and <code>serialization</code>
  411. exceptions separately, we can catch just
  412. <code>xml::exception</code>, which is a common base for the
  413. other two:</p>
  414. <pre class="c++">
  415. int main (int argc, char* argv[])
  416. {
  417. try
  418. {
  419. ...
  420. }
  421. catch (const xml::exception&amp; e)
  422. {
  423. cerr &lt;&lt; e.what () &lt;&lt; endl;
  424. return 1;
  425. }
  426. }
  427. </pre>
  428. <p>The next chunk of the serializer API:</p>
  429. <pre class="c++">
  430. class serializer
  431. {
  432. void start_element (const std::string&amp; name);
  433. void end_element ();
  434. void start_attribute (const std::string&amp; name);
  435. void end_attribute ();
  436. void characters (const std::string&amp; value);
  437. };
  438. </pre>
  439. <p>Everything should be pretty self-explanatory here. And we have
  440. now seen enough to finish our filter:</p>
  441. <pre class="c++">
  442. parser p (ifs, argv[1]);
  443. serializer s (cout, "output", 0);
  444. bool skip (false);
  445. for (parser::event_type e: p)
  446. {
  447. switch (e)
  448. {
  449. case parser::start_element:
  450. {
  451. s.start_element (p.name ());
  452. break;
  453. }
  454. case parser::end_element:
  455. {
  456. s.end_element ();
  457. break;
  458. }
  459. case parser::start_attribute:
  460. {
  461. if (p.name () == "id")
  462. skip = true;
  463. else
  464. s.start_attribute (p.name ());
  465. break;
  466. }
  467. case parser::end_attribute:
  468. {
  469. if (skip)
  470. skip = false;
  471. else
  472. s.end_attribute ();
  473. break;
  474. }
  475. case parser::characters:
  476. {
  477. if (!skip)
  478. s.characters (p.value ());
  479. break;
  480. }
  481. }
  482. }
  483. </pre>
  484. <p>Do you see any problems with our filter? Well, one problem is
  485. that this implementation doesn't handle XML namespaces. Let's
  486. see how we can fix this. The first issue is with the element
  487. and attribute names. When namespaces are used, those may be
  488. qualified. <code>libstudxml</code> uses the <code>qname</code>
  489. class to represent such names:</p>
  490. <pre class="c++">
  491. #include &lt;xml/qname>
  492. namespace xml
  493. {
  494. class qname
  495. {
  496. public:
  497. qname ();
  498. qname (const std::string&amp; name);
  499. qname (const std::string&amp; namespace_,
  500. const std::string&amp; name);
  501. const std::string&amp; namespace_ () const;
  502. const std::string&amp; name () const;
  503. };
  504. }
  505. </pre>
  506. <p>The parser, in addition to the <code>name()</code> accessor, also
  507. has <code>qname()</code> which returns the potentially qualified
  508. name. Similarly, the <code>start_element()</code> and
  509. <code>start_attribute()</code> functions in the serializer are
  510. overloaded to accept <code>qname</code>:</p>
  511. <pre class="c++">
  512. class parser
  513. {
  514. const qname&amp; qname () const;
  515. };
  516. class serializer
  517. {
  518. void start_element (const qname&amp;);
  519. void start_attribute (const qname&amp;);
  520. };
  521. </pre>
  522. <p>The first thing we need to do to make our filter namespace-aware
  523. is to use qualified names instead of the local ones. This one is
  524. easy:</p>
  525. <pre class="c++">
  526. switch (e)
  527. {
  528. case parser::start_element:
  529. {
  530. s.start_element (p.qname ());
  531. break;
  532. }
  533. case parser::start_attribute:
  534. {
  535. if (p.qname () == "id") // Unqualified name.
  536. skip = true;
  537. else
  538. s.start_attribute (p.qname ());
  539. break;
  540. }
  541. }
  542. </pre>
  543. <p>There is, however, another thing that we have to do. Right now our
  544. code does not propagate the namespace-prefix mappings from the input
  545. document to the output. At the moment, where the input XML might have
  546. meaningful prefixes assigned to namespaces, the output will have
  547. automatically generated ones like <code>g1</code>, <code>g2</code>,
  548. and so on.</p>
  549. <p>To fix this, first we need to tell the parser to report to us
  550. namespace-prefix mappings, called namespace declarations in XML:</p>
  551. <pre class="c++">
  552. parser p (ifs,
  553. argv[1],
  554. parser::receive_default |
  555. parser::receive_namespace_decls);
  556. </pre>
  557. <p>We then also need to propagate this information to the serializer by
  558. handling the <code>start_namespace_decl</code> event:</p>
  559. <pre class="c++">
  560. for (...)
  561. {
  562. switch (e)
  563. {
  564. ...
  565. case parser::start_namespace_decl:
  566. s.namespace_decl (p.namespace_ (), p.prefix ());
  567. break;
  568. ...
  569. }
  570. }
  571. </pre>
  572. <p>Well, that wasn't too bad.</p>
  573. <h1><a name="3">High-Level API</a></h1>
  574. <p>So that was pretty low-level XML work where we didn't care about
  575. the semantics of the stored data, or, in fact, the XML vocabulary that
  576. we dealt with.</p>
  577. <p>However, this API will quickly become tedious once we try to handle
  578. a specific XML vocabulary and do something useful with the stored
  579. data. Why is that? There are several areas where we could use some
  580. help:</p>
  581. <ul>
  582. <li>Validation and error handling</li>
  583. <li>Attribute access</li>
  584. <li>Data extraction</li>
  585. <li>Content model processing</li>
  586. <li>Control flow</li>
  587. </ul>
  588. <p>Let's examine each area using our object position vocabulary as a
  589. test case (see the <code>processing</code> example in the
  590. <code>libstudxml</code> distribution).</p>
  591. <pre class="xml">
  592. &lt;object id="123">
  593. &lt;name>Lion's Head&lt;/name>
  594. &lt;type>mountain&lt;/type>
  595. &lt;position lat="-33.8569" lon="18.5083"/>
  596. &lt;position lat="-33.8568" lon="18.5083"/>
  597. &lt;position lat="-33.8568" lon="18.5082"/>
  598. &lt;/object>
  599. </pre>
  600. <p>If you cannot assume the XML you are parsing is valid, and you
  601. generally shouldn't, then you will quickly realize that the biggest
  602. pain in dealing with XML is making sure that what we got is actually
  603. valid.</p>
  604. <p>This stuff is pervasive. What if the root element is spelled
  605. wrong? Maybe the <code>id</code> attribute is missing? Or there
  606. is some stray text before the <code>name</code> element? Things
  607. can be broken in an infinite number of ways.</p>
  608. <p>To illustrate this point, here is the parsing code of just the
  609. root element with proper error handling:</p>
  610. <pre class="c++">
  611. parser p (ifs, argv[1]);
  612. if (p.next () != parser::start_element ||
  613. p.qname () != "object")
  614. {
  615. // error
  616. }
  617. ...
  618. if (p.next () != parser::end_element) // object
  619. {
  620. // error
  621. }
  622. </pre>
  623. <p>Not very pretty. To help with this, the parser API provides the
  624. <code>next_expect()</code> function:</p>
  625. <pre class="c++">
  626. class parser
  627. {
  628. void next_expect (event_type);
  629. void next_expect (event_type, const std::string&amp; name);
  630. };
  631. </pre>
  632. <p>This function gets the next event and makes sure it is what's
  633. expected. If not, it throws an appropriate parsing exception.
  634. This simplifies our root element parsing quite a bit:</p>
  635. <pre class="c++">
  636. parser p (ifs, argv[1]);
  637. p.next_expect (parser::start_element, "object");
  638. ...
  639. p.next_expect (parser::end_element); // object
  640. </pre>
  641. <p>Let's now take the next step and try to handle the <code>id</code>
  642. attribute. According to what we have seen so far, it will look
  643. something along these lines:</p>
  644. <pre class="c++">
  645. p.next_expect (parser::start_element, "object");
  646. p.next_expect (parser::start_attribute, "id");
  647. p.next_expect (parser::characters);
  648. cout &lt;&lt; "id: " &lt;&lt; p.value () &lt;&lt; endl;
  649. p.next_expect (parser::end_attribute);
  650. ...
  651. p.next_expect (parser::end_element); // object
  652. </pre>
  653. <p>Not too bad but there is a bit of a problem. What if our <code>object</code>
  654. element had several attributes? The order of attributes in XML
  655. is arbitrary so we should be prepared to get them in any order.
  656. This fact complicates our attribute parsing code quite a bit:</p>
  657. <pre class="c++">
  658. while (p.next () == parser::start_attribute)
  659. {
  660. if (p.qname () == "id")
  661. {
  662. p.next_expect (parser::characters);
  663. cout &lt;&lt; "id: " &lt;&lt; p.value () &lt;&lt; endl;
  664. }
  665. else if (...)
  666. {
  667. }
  668. else
  669. {
  670. // error: unknown attribute
  671. }
  672. p.next_expect (parser::end_attribute);
  673. }
  674. </pre>
  675. <p>There is also a bug in this version. Can you see it? We no longer
  676. make sure that the <code>id</code> attribute was actually
  677. specified.</p>
  678. <p>If you think about it, at this level, it is actually not that
  679. convenient to receive attributes as events. In fact, a map of
  680. attributes would be much more usable.</p>
  681. <p>Remember we talked about the parser features that specify which
  682. events we want to see:</p>
  683. <pre class="c++">
  684. class parser
  685. {
  686. static const feature_type receive_elements;
  687. static const feature_type receive_characters;
  688. static const feature_type receive_attributes;
  689. ...
  690. };
  691. </pre>
  692. <p>Well, in reality, there is no <code>receive_attributes</code>. Rather,
  693. there are these two options:</p>
  694. <pre class="c++">
  695. class parser
  696. {
  697. static const feature_type receive_attributes_map;
  698. static const feature_type receive_attributes_event;
  699. ...
  700. };
  701. </pre>
  702. <p>That is, we can ask the parser to send us attributes as events or
  703. as a map. And the default is to send them as a map.</p>
  704. <p>In case of a map, we have the following attribute access API to work
  705. with:</p>
  706. <pre class="c++">
  707. class parser
  708. {
  709. const std::string&amp; attribute (const std::string&amp; name) const;
  710. std::string attribute (const std::string&amp; name,
  711. const std::string&amp; default_value) const;
  712. bool attribute_present (const std::string&amp; name) const;
  713. };
  714. </pre>
  715. <p>If the attribute is not found, then the version without the default
  716. value throws an appropriate parsing exception while the version with
  717. the default value returns that value. There are also the
  718. <code>qname</code> versions of these functions.</p>
  719. <p>Let's see how this simplifies our code:</p>
  720. <pre class="c++">
  721. p.next_expect (parser::start_element, "object");
  722. cout &lt;&lt; "id: " &lt;&lt; p.attribute ("id") &lt;&lt; endl;
  723. ...
  724. p.next_expect (parser::end_element); // object
  725. </pre>
  726. <p>Much better.</p>
  727. <p>If the <code>id</code> attribute is not present, then we get an
  728. exception. But what happens if we have a stray attribute in our
  729. document? The attribute map is magical in this sense. After
  730. the <code>end_element</code> event for the <code>object</code>
  731. element the parser will examine the attribute map. If there is
  732. an attribute that hasn't been retrieved with one of the attribute
  733. access functions, then the parser will throw the unexpected
  734. attribute exception.</p>
  735. <p>Error handling out of the way, the next thing that will annoy us is data
  736. extraction. In XML everything is text. While our <code>id</code> value
  737. is an integer, XML stores it as text and the low-level API returns it to
  738. us as text. To help with this the parser provides the following data
  739. extraction functions:</p>
  740. <pre class="c++">
  741. class parser
  742. {
  743. template &lt;typename T>
  744. T value () const;
  745. template &lt;typename T>
  746. T attribute (const std::string&amp; name) const;
  747. template &lt;typename T>
  748. T attribute (const std::string&amp; name,
  749. const T&amp; default_value) const;
  750. };
  751. </pre>
  752. <p>Now we can get the <code>id</code> as an integer without much fuss:</p>
  753. <pre class="c++">
  754. p.next_expect (parser::start_element, "object");
  755. unsigned int id = p.attribute&lt;unsigned int> ("id");
  756. ...
  757. p.next_expect (parser::end_element); // object
  758. </pre>
  759. <p>Ok, let's try to parse our vocabulary a bit further:</p>
  760. <pre class="c++">
  761. p.next_expect (parser::start_element, "object");
  762. unsigned int id = p.attribute&lt;unsigned int> ("id");
  763. p.next_expect (parser::start_element, "name");
  764. ...
  765. p.next_expect (parser::end_element); // name
  766. p.next_expect (parser::end_element); // object
  767. </pre>
  768. <p>Here is the part of the document that we are parsing:</p>
  769. <pre class="xml">
  770. &lt;object id="123">
  771. &lt;name>Lion's Head&lt;/name>
  772. </pre>
  773. <p>What do you think, is everything alright with our code? When we
  774. try to parse our document, we will get an exception here:</p>
  775. <pre class="c++">
  776. p.next_expect (parser::start_element, "name");
  777. </pre>
  778. <p>Any idea why? Let's try to print the event that we get:</p>
  779. <pre class="c++">
  780. // p.next_expect (parser::start_element, "name");
  781. cerr &lt;&lt; p.next () &lt;&lt; endl;
  782. </pre>
  783. <p>We expect <code>start_element</code> but get <code>characters</code>!
  784. Wait a minute, but there are characters after <code>object</code> and
  785. before <code>name</code>. There is a newline and two spaces that are
  786. replaced with hashes for illustration here:</p>
  787. <pre class="xml">
  788. &lt;object id="123">#
  789. ##&lt;name>Lion's Head&lt;/name>
  790. </pre>
  791. <p>If you go to a forum or a mailing list for any XML parser, this will
  792. be the most common question. Why do I get text when I should clearly
  793. get an element!?</p>
  794. <p>The reason why we get this whitespace text is because the parser has no
  795. idea whether it is significant or not. The significance of whitespaces is
  796. determined by the XML content model that we talked about earlier. Here is
  797. the table:</p>
  798. <pre class="c++">
  799. #include &lt;xml/content>
  800. namespace xml
  801. {
  802. enum class content
  803. { // element characters whitespaces
  804. empty, // no no ignored
  805. simple, // no yes preserved
  806. complex, // yes no ignored
  807. mixed // yes yes preserved
  808. };
  809. }
  810. </pre>
  811. <p>In empty content neither nested elements nor characters are allowed with
  812. whitespaces ignored. Simple content allows no nested elements with
  813. whitespaces preserved. Complex content allows nested elements only with
  814. whitespaces which are ignored. Finally, the mixed content allows anything
  815. in any order with everything preserved.</p>
  816. <p>If we specify the content model for an element, then the parser
  817. will do automatic whitespace processing for us:</p>
  818. <pre class="c++">
  819. class parser
  820. {
  821. void content (content);
  822. };
  823. </pre>
  824. <p>That is, in empty and complex content, whitespaces will be silently
  825. ignored. By knowing the content model, the parser also has a chance to do
  826. more error handling for us. It will automatically throw appropriate
  827. exceptions if there are nested elements in empty or simple content or
  828. non-whitespace characters in complex content.</p>
  829. <p>Ok, let's now see how we can take advantage of this feature in
  830. our code:</p>
  831. <pre class="c++">
  832. p.next_expect (parser::start_element, "object");
  833. p.content (content::complex);
  834. unsigned int id = p.attribute&lt;unsigned int> ("id");
  835. p.next_expect (parser::start_element, "name"); // Ok.
  836. ...
  837. p.next_expect (parser::end_element); // name
  838. p.next_expect (parser::end_element); // object
  839. </pre>
  840. <p>Now whitespaces are ignored and everything works as we expected.
  841. Here is how we can parse the content of the <code>name</code>
  842. element:</p>
  843. <pre class="c++">
  844. p.next_expect (parser::start_element, "name");
  845. p.content (content::simple);
  846. p.next_expect (parser::characters);
  847. string name = p.value ();
  848. p.next_expect (parser::end_element); // name
  849. </pre>
  850. <p>As you can see, parsing a simple content element is quite a bit more
  851. involved compared to getting a value of an attribute. Element markup also
  852. has a higher overhead in the resulting XML. That's why in our case it would
  853. have been wiser to make <code>name</code> and <code>type</code>
  854. attributes.</p>
  855. <p>But if we are stuck with a lot of simple content elements, then
  856. the parser provides the following helper functions:</p>
  857. <pre class="c++">
  858. class parser
  859. {
  860. std::string element ();
  861. template &lt;typename T>
  862. T element ();
  863. std::string element (const std::string&amp; name);
  864. template &lt;typename T>
  865. T element (const std::string&amp; name);
  866. std::string element (const std::string&amp; name,
  867. const std::string&amp; default_value);
  868. template &lt;typename T>
  869. T element (const std::string&amp; name,
  870. const T&amp; default_value);
  871. };
  872. </pre>
  873. <p>The first two assume that you have already handled the
  874. <code>start_element</code> event. They should be used if the element also
  875. has attributes. The other four parse the complete element. Overloaded
  876. <code>qname</code> versions are also provided.</p>
  877. <p>Here is how we can simplify our parsing code thanks to these
  878. functions:</p>
  879. <pre class="c++">
  880. p.next_expect (parser::start_element, "object");
  881. p.content (content::complex);
  882. unsigned int id = p.attribute&lt;unsigned int> ("id");
  883. string name = p.element ("name");
  884. p.next_expect (parser::end_element); // object
  885. </pre>
  886. <p>For the <code>type</code> element we would like to use this <code>enum
  887. class</code>:</p>
  888. <pre class="c++">
  889. enum class object_type
  890. {
  891. building,
  892. mountain,
  893. ...
  894. };
  895. </pre>
  896. <p>The parsing code is similar to the <code>name</code> element. Now
  897. we use the data extracting version of the <code>element()</code>
  898. function:</p>
  899. <pre class="c++">
  900. object_type type = p.element&lt;object_type> ("type");
  901. </pre>
  902. <p>Except that this won't compile. The parser doesn't know how to
  903. convert the text representation to our <code>enum</code>. By
  904. default the parser will try to use the <code>iostream</code>
  905. extraction operator but we haven't provided any.</p>
  906. <p>We can provide conversion code specifically for XML by specializing
  907. the <code>value_traits</code> class template:</p>
  908. <pre class="c++">
  909. namespace xml
  910. {
  911. template &lt;>
  912. struct value_traits&lt;object_type>
  913. {
  914. static object_type
  915. parse (std::string, const parser&amp;)
  916. {
  917. ...
  918. }
  919. static std::string
  920. serialize (object_type, const serializer&amp;)
  921. {
  922. ...
  923. }
  924. };
  925. }
  926. </pre>
  927. <p>The last bit that we need to handle is the <code>position</code>
  928. elements. The interesting part here is how to stop without going
  929. too far since there can be several of them. To help with this task
  930. the parser allows us to peek into the next event:</p>
  931. <pre class="c++">
  932. p.next_expect (parser::start_element, "object");
  933. p.content (content::complex);
  934. ...
  935. do
  936. {
  937. p.next_expect (parser::start_element, "position");
  938. p.content (content::empty);
  939. float lat = p.attribute&lt;float> ("lat");
  940. float lon = p.attribute&lt;float> ("lon");
  941. p.next_expect (parser::end_element);
  942. } while (p.peek () == parser::start_element);
  943. p.next_expect (parser::end_element); // object
  944. </pre>
  945. <p>Do you see anything else that we can improve? Actually, there is
  946. one thing. Look at the <code>next_expect()</code> calls in the
  947. above code. They are both immediately followed by the setting
  948. of the content model. We can tidy this up a bit by passing the
  949. content model as a third argument to <code>next_expect()</code>.
  950. This even reads like prose: "Next we expect the start of an
  951. element called <code>position</code> that shall have empty
  952. content."</p>
  953. <p>Here is the complete, production-quality parsing code for our XML
  954. vocabulary. 13 lines. With validation and everything:</p>
  955. <pre class="c++">
  956. parser p (ifs, argv[1]);
  957. p.next_expect (parser::start_element, "object", content::complex);
  958. unsigned int id = p.attribute&lt;unsigned int> ("id");
  959. string name = p.element ("name");
  960. object_type type = p.element&lt;object_type> ("type");
  961. do
  962. {
  963. p.next_expect (parser::start_element, "position", content::empty);
  964. float lat = p.attribute&lt;float> ("lat");
  965. float lon = p.attribute&lt;float> ("lon");
  966. p.next_expect (parser::end_element); // position
  967. } while (p.peek () == parser::start_element);
  968. p.next_expect (parser::end_element); // object
  969. </pre>
  970. <p>So that was the high-level parsing API. Let's now catch up with the
  971. corresponding additions to the serializer.</p>
  972. <p>Similar to parsing, calling <code>start_attribute()</code>,
  973. <code>characters()</code>, and then <code>end_attribute()</code>
  974. might not be convenient. Instead we can add an attribute with
  975. a single call:</p>
  976. <pre class="c++">
  977. class serializer
  978. {
  979. void attribute (const std::string&amp; name,
  980. const std::string&amp; value);
  981. void element (const std::string&amp; value);
  982. void element (const std::string&amp; name,
  983. const std::string&amp; value);
  984. };
  985. </pre>
  986. <p>The same works for elements with simple content. The first version finishes
  987. the element that we have started, while the second writes the complete
  988. element. There are also the <code>qname</code> versions of these
  989. functions that are not shown.</p>
  990. <p>Instead of strings we can also serialize value types. This uses the
  991. same <code>value_traits</code> specialization mechanism that we have
  992. used for parsing:</p>
  993. <pre class="c++">
  994. class serializer
  995. {
  996. template &lt;typename T>
  997. void attribute (const std::string&amp; name,
  998. const T&amp; value);
  999. template &lt;typename T>
  1000. void element (const T&amp; value);
  1001. template &lt;typename T>
  1002. void element (const std::string&amp; name,
  1003. const T&amp; value);
  1004. template &lt;typename T>
  1005. void characters (const T&amp; value);
  1006. };
  1007. </pre>
  1008. <p>Let's now see how we can serialize a complete sample document for
  1009. our object position vocabulary using this high-level API:</p>
  1010. <pre class="c++">
  1011. serializer s (cout, "output");
  1012. s.start_element ("object");
  1013. s.attribute ("id", 123);
  1014. s.element ("name", "Lion's Head");
  1015. s.element ("type", object_type::mountain);
  1016. for (...)
  1017. {
  1018. s.start_element ("position");
  1019. float lat (...), lon (...);
  1020. s.attribute ("lat", lat);
  1021. s.attribute ("lon", lon);
  1022. s.end_element (); // position
  1023. }
  1024. s.end_element (); // object
  1025. </pre>
  1026. <p>Pretty straightforward stuff.</p>
  1027. <h1><a name="4">Object Persistence</a></h1>
  1028. <p>So far we have used our API to first implement a filter that doesn't
  1029. really care about the data and then an application that processes the
  1030. data without creating any kind of object model. Let's now try to handle
  1031. the other end of the spectrum: objects that know how to persist
  1032. themselves into XML (see the <code>persistence</code> example in
  1033. the <code>libstudxml</code> distribution).</p>
  1034. <p>But before we continue, let's fix our XML to be slightly more idiomatic.
  1035. That is, we make <code>name</code> and <code>type</code> attributes
  1036. rather than elements:</p>
  1037. <pre class="xml">
  1038. &lt;object name="Lion's Head" type="mountain" id="123">
  1039. &lt;position lat="-33.8569" lon="18.5083"/>
  1040. &lt;position lat="-33.8568" lon="18.5083"/>
  1041. &lt;position lat="-33.8568" lon="18.5082"/>
  1042. &lt;/object>
  1043. </pre>
  1044. <p>Generally, the API works best with idiomatic XML and will nudge you
  1045. gently in that direction with minor inconveniences.</p>
  1046. <p>For this vocabulary, the object model might look like this:</p>
  1047. <pre class="c++">
  1048. enum class object_type {...};
  1049. class position
  1050. {
  1051. ...
  1052. float lat_;
  1053. float lon_;
  1054. };
  1055. class object
  1056. {
  1057. ...
  1058. std::string name_;
  1059. object_type type_;
  1060. unsigned int id_;
  1061. std::vector&lt;position> positions_;
  1062. };
  1063. </pre>
  1064. <p>Here I omit sensible constructors, accessors and modifiers that our
  1065. classes would probably have.</p>
  1066. <p>Let me also mention that what I am going to show next is what I
  1067. believe is the sensible structure for XML persistence using this
  1068. API. But that doesn't mean it is the only way. For example, we
  1069. are going to do parsing in a constructor:</p>
  1070. <pre class="c++">
  1071. class position
  1072. {
  1073. position (xml::parser&amp;);
  1074. void
  1075. serialize (xml::serializer&amp;) const;
  1076. ...
  1077. };
  1078. class object
  1079. {
  1080. object (xml::parser&amp;);
  1081. void
  1082. serialize (xml::serializer&amp;) const;
  1083. ...
  1084. };
  1085. </pre>
  1086. <p>But you may prefer to first create an instance, say with the default
  1087. constructor, and then have a separate function do the parsing.
  1088. There is nothing wrong with this approach.</p>
  1089. <p>Let's start with the <code>position</code> constructor. Here, we are
  1090. immediately confronted with this choice: do we parse the start and end
  1091. element events in <code>position</code> or expect our caller to handle them?</p>
  1092. <p>I suggest that we let our caller do this. We may have different elements
  1093. in our vocabulary that use the same <code>position</code> type. If we
  1094. assume the element name in the constructor, then we won't be able to use
  1095. the same class for all these elements. We will see the second advantage
  1096. of this arrangement in a moment, when we deal with inheritance. But, if
  1097. you have a simple model with one-to-one mapping between types and
  1098. elements and no inheritance, then there is nothing wrong with going the
  1099. other route.</p>
  1100. <pre class="c++">
  1101. position::
  1102. position (parser&amp; p)
  1103. : lat_ (p.attribute&lt;float> ("lat")),
  1104. lon_ (p.attribute&lt;float> ("lon"))
  1105. {
  1106. p.content (content::empty);
  1107. }
  1108. </pre>
  1109. <p>Ok, nice and clean so far. Let's look at the <code>object</code>
  1110. constructor:</p>
  1111. <pre class="c++">
  1112. object::
  1113. object (parser&amp; p)
  1114. : name_ (p.attribute ("name")),
  1115. type_ (p.attribute&lt;object_type> ("type")),
  1116. id_ (p.attribute&lt;unsigned int> ("id"))
  1117. {
  1118. p.content (content::complex);
  1119. do
  1120. {
  1121. p.next_expect (parser::start_element, "position");
  1122. positions_.push_back (position (p));
  1123. p.next_expect (parser::end_element);
  1124. } while (p.peek () == parser::start_element);
  1125. }
  1126. </pre>
  1127. <p>The only mildly interesting line here is where we call the position
  1128. constructor to parse the content of the nested elements.</p>
  1129. <p>Before we look into serialization, let me also mention one other
  1130. thing. In our vocabulary all the attributes are required but it is
  1131. quite common to have optional attributes. The API functions with
  1132. default values make it really convenient to handle such attributes
  1133. in the initializer lists.</p>
  1134. <p>Let's say the <code>type</code> attribute is optional. Then we
  1135. could do this:</p>
  1136. <pre class="c++">
  1137. object::
  1138. object (parser&amp; p)
  1139. : ...
  1140. type_ (p.attribute ("type", object_type::other))
  1141. ...
  1142. </pre>
  1143. <p>We use the same arrangement for serialization, that is, the
  1144. containing object starts and ends the element allowing us to
  1145. reuse the same type for different elements:</p>
  1146. <pre class="c++">
  1147. void position::serialize (serializer&amp; s) const
  1148. {
  1149. s.attribute ("lat", lat_);
  1150. s.attribute ("lon", lon_);
  1151. }
  1152. void object::serialize (serializer&amp; s) const
  1153. {
  1154. s.attribute ("name", name_);
  1155. s.attribute ("type", type_);
  1156. s.attribute ("id", id_);
  1157. for (const auto&amp; p: positions_)
  1158. {
  1159. s.start_element ("position");
  1160. p.serialize (s);
  1161. s.end_element ();
  1162. }
  1163. }
  1164. </pre>
  1165. <p>Ok, also nice and tidy.</p>
  1166. <p>There is one thing, however, that is not so nice: the start of
  1167. the parser or serializer. Here is the code:</p>
  1168. <pre class="c++">
  1169. parser p (ifs, argv[1]);
  1170. p.next_expect (parser::start_element, "object");
  1171. object o (p);
  1172. p.next_expect (parser::end_element);
  1173. serializer s (cout, "output");
  1174. s.start_element ("object");
  1175. o.serialize (s);
  1176. s.end_element ();
  1177. </pre>
  1178. <p>Remember, we made the caller responsible for handling the start and
  1179. end of the element. This works beautifully inside the object model but
  1180. not so much in the client code. What we would like to see instead
  1181. is this:</p>
  1182. <pre class="c++">
  1183. parser p (ifs, argv[1]);
  1184. object o (p);
  1185. serializer s (cout, "output");
  1186. o.serialize (s);
  1187. </pre>
  1188. <p>The main reason for choosing this structure was the ability to reuse the
  1189. same type for different elements. The other reason was inheritance which
  1190. we haven't gotten to yet. If we think about it, it is very unlikely for a
  1191. class corresponding to the root of our vocabulary to also be used inside
  1192. as a local element. I can't remember ever seeing a vocabulary like
  1193. this.</p>
  1194. <p>So what we can do here is make an exception: the root type of our
  1195. object model handles the top-level element. Here is the parser:</p>
  1196. <pre class="c++">
  1197. object::
  1198. object (parser&amp; p)
  1199. {
  1200. p.next_expect (
  1201. parser::start_element, "object", content::complex);
  1202. name_ = p.attribute ("name");
  1203. type_ = p.attribute&lt;object_type> ("type");
  1204. id_ = p.attribute&lt;unsigned int> ("id");
  1205. ...
  1206. p.next_expect (parser::end_element);
  1207. }
  1208. </pre>
  1209. <p>And here is the serializer:</p>
  1210. <pre class="c++">
  1211. void object::
  1212. serialize (serializer&amp; s) const
  1213. {
  1214. s.start_element ("object");
  1215. ...
  1216. s.end_element ();
  1217. }
  1218. </pre>
  1219. <p>The only minor drawback of going this route is that we can no longer
  1220. parse attributes in the initializer list for the root object.</p>
  1221. <h1><a name="5">Inheritance</a></h1>
  1222. <p>So far we have had smooth sailing with the streaming approach but things get
  1223. a bit bumpy once we start dealing with inheritance. This is normally
  1224. where the in-memory approach has its day.</p>
  1225. <p>Say we have <code>elevated-object</code> which adds the
  1226. <code>units</code> attribute and the <code>elevation</code> elements.
  1227. Here is the XML:</p>
  1228. <pre class="xml">
  1229. &lt;elevated-object name="Lion's Head" type="mountain"
  1230. units="m" id="123">
  1231. &lt;position lat="-33.8569" lon="18.5083"/>
  1232. &lt;position lat="-33.8568" lon="18.5083"/>
  1233. &lt;position lat="-33.8568" lon="18.5082"/>
  1234. &lt;elevation val="668.9"/>
  1235. &lt;elevation val="669"/>
  1236. &lt;elevation val="669.1"/>
  1237. &lt;/elevated-object>
  1238. </pre>
  1239. <p>And here is the object model:</p>
  1240. <pre class="c++">
  1241. enum class units {...};
  1242. class elevation {...};
  1243. class elevated_object: public object
  1244. {
  1245. ...
  1246. units units_;
  1247. std::vector&lt;elevation> elevations_;
  1248. };
  1249. </pre>
  1250. <p>Streaming assumes linearity. We start an element, add some attributes,
  1251. add some nested elements, and end the element. In contrast, with an
  1252. in-memory approach we can add some attributes, then add some nested
  1253. elements, then go back and add more attributes. This kind of back and
  1254. forth is exactly what inheritance often requires. So this is a bit of
  1255. a problem for us.</p>
  1256. <p>Consider the <code>elevated_object</code> constructor:</p>
  1257. <pre class="c++">
  1258. elevated_object::
  1259. elevated_object (parser&amp; p)
  1260. : object (p),
  1261. units_ (p.attribute&lt;units> ("units"))
  1262. {
  1263. do
  1264. {
  1265. p.next_expect (parser::start_element, "elevation");
  1266. elevations_.push_back (elevation (p));
  1267. p.next_expect (parser::end_element);
  1268. } while (p.peek () == parser::start_element &amp;&amp;
  1269. p.name () == "elevation");
  1270. }
  1271. </pre>
  1272. <p>Note that here I assume we went back to our original architecture
  1273. where the caller handles the start and end of the element (this is
  1274. the other advantage of this architecture: it allows us to reuse
  1275. base parsing and serialization code in derived classes).</p>
  1276. <p>So we would like to reuse the parsing code from <code>object</code>
  1277. so we call the base constructor first.</p>
  1278. <p>Then we parse the derived attribute and elements. Do you see
  1279. the problem? The <code>object</code> constructor will parse its
  1280. attributes and then move on to nested elements. When this constructor
  1281. returns, we need to go back to parsing attributes! This is not
  1282. something that a streaming approach would normally allow.</p>
  1283. <p>To resolve this, the lifetime of the attribute map was extended until
  1284. after the <code>end_element</code> event. That is, we can access
  1285. attributes any time we are at the element's level. As a result,
  1286. the above code just works.</p>
  1287. <p>We have the same problem in serialization. Let's say we write
  1288. the straightforward code like this:</p>
  1289. <pre class="c++">
  1290. void elevated_object::
  1291. serialize (serializer&amp; s) const
  1292. {
  1293. object::serialize (s);
  1294. s.attribute ("units", units_);
  1295. for (const auto&amp; e: elevations_)
  1296. {
  1297. s.start_element ("elevation");
  1298. e.serialize (s);
  1299. s.end_element ();
  1300. }
  1301. }
  1302. </pre>
  1303. <p>This is not going to work since we will try to add the <code>units</code>
  1304. attribute after the nested <code>position</code> elements have already
  1305. been written.</p>
  1306. <p>To handle inheritance in serialization we have to split the
  1307. <code>serialize()</code> function into two. One serializes
  1308. the attributes while the other &mdash; content:</p>
  1309. <pre class="c++">
  1310. void object::
  1311. serialize_attributes (serializer&amp; s) const
  1312. {
  1313. s.attribute ("name", name_);
  1314. s.attribute ("type", type_);
  1315. s.attribute ("id", id_);
  1316. }
  1317. void object::
  1318. serialize_content (serializer&amp; s) const
  1319. {
  1320. for (const auto&amp; p: positions_)
  1321. {
  1322. s.start_element ("position");
  1323. p.serialize (s);
  1324. s.end_element ();
  1325. }
  1326. }
  1327. </pre>
  1328. <p>The <code>serialize()</code> function then simply calls these two
  1329. in the correct order.</p>
  1330. <pre class="c++">
  1331. void object::
  1332. serialize (serializer&amp; s) const
  1333. {
  1334. serialize_attributes (s);
  1335. serialize_content (s);
  1336. }
  1337. </pre>
  1338. <p>I bet you can guess what the <code>elevated_object</code>'s
  1339. implementation looks like:</p>
  1340. <pre class="c++">
  1341. void elevated_object::
  1342. serialize_attributes (serializer&amp; s) const
  1343. {
  1344. object::serialize_attributes (s);
  1345. s.attribute ("units", units_);
  1346. }
  1347. void elevated_object::
  1348. serialize_content (serializer&amp; s) const
  1349. {
  1350. object::serialize_content (s);
  1351. for (const auto&amp; e: elevations_)
  1352. {
  1353. s.start_element ("elevation");
  1354. e.serialize (s);
  1355. s.end_element ();
  1356. }
  1357. }
  1358. </pre>
  1359. <p>The <code>serialize()</code> function for <code>elevated_object</code>
  1360. is exactly the same:</p>
  1361. <pre class="c++">
  1362. void elevated_object::
  1363. serialize (serializer&amp; s) const
  1364. {
  1365. serialize_attributes (s);
  1366. serialize_content (s);
  1367. }
  1368. </pre>
  1369. <h1><a name="6">Implementation Notes</a></h1>
  1370. <p><code>libstudxml</code> is an open source (MIT license), portable
  1371. (autotools and VC++ projects provided), and external dependency-free
  1372. implementation.</p>
  1373. <p>It provides a conforming, non-validating XML 1.0 parser by using
  1374. the mature and tested Expat XML parser. <code>libstudxml</code>
  1375. includes the Expat source code (also distributed under the MIT
  1376. license) as an implementation detail. However, you can link to
  1377. an external Expat library if you prefer.</p>
  1378. <p>If you are familiar with Expat, you are probably wondering how
  1379. the push interface provided by Expat was adapted to the pull
  1380. API shown earlier. Expat allows us to suspend and resume parsing
  1381. after every event and that's exactly what this implementation
  1382. does. The performance cost of this constant suspension and
  1383. resumption is about 35% of Expat's performance, which is not
  1384. negligible but not the end of the world either.</p>
  1385. <p>All in, with all the name splitting and string constructions,
  1386. parsing throughput on a 2010 Intel Core i7 laptop is about
  1387. 37 MByte/sec, which should be sufficient for most applications.</p>
  1388. <p>While it is much easier to implement a conforming serializer
  1389. from scratch, <code>libstudxml</code> reuses an existing and
  1390. tested implementation in this case as well. It includes source
  1391. code of a small C library for XML serialization called Genx
  1392. (also MIT licensed) that was initially created by Tim Bray
  1393. and significantly improved and extended over the past years
  1394. as part of the XSD/e project.</p>
  1395. </div>
  1396. </div>
  1397. </body>
  1398. </html>