xlsx_consumer.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // Copyright (c) 2014-2021 Thomas Fussell
  2. // Copyright (c) 2010-2015 openpyxl
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to deal
  6. // in the Software without restriction, including without limitation the rights
  7. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. // copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. // THE SOFTWARE
  21. //
  22. // @license: http://www.opensource.org/licenses/mit-license.php
  23. // @author: see AUTHORS file
  24. #pragma once
  25. #include <cstdint>
  26. #include <functional>
  27. #include <iostream>
  28. #include <memory>
  29. #include <string>
  30. #include <unordered_map>
  31. #include <vector>
  32. #include <detail/external/include_libstudxml.hpp>
  33. #include <detail/serialization/zstream.hpp>
  34. #include <xlnt/utils/numeric.hpp>
  35. #include <xml/parser>
  36. namespace xlnt {
  37. class cell;
  38. class color;
  39. class rich_text;
  40. class manifest;
  41. template<typename T>
  42. class optional;
  43. class path;
  44. class range_reference;
  45. class relationship;
  46. class streaming_workbook_reader;
  47. class variant;
  48. class workbook;
  49. class worksheet;
  50. namespace detail {
  51. class izstream;
  52. struct cell_impl;
  53. struct defined_name;
  54. struct worksheet_impl;
  55. /// <summary>
  56. /// Handles reading an XLSX file into a workbook.
  57. /// </summary>
  58. class xlsx_consumer
  59. {
  60. public:
  61. xlsx_consumer(workbook &destination);
  62. ~xlsx_consumer();
  63. void read(std::istream &source);
  64. void read(std::istream &source, const std::string &password); // NOTE(review): presumably for password-protected packages; confirm in the .cpp.
  65. private:
  66. friend class xlnt::streaming_workbook_reader;
  67. void open(std::istream &source);
  68. bool has_cell();
  69. /// <summary>
  70. /// Reads the next cell in the current worksheet and optionally returns it if
  71. /// the last cell in the sheet has not yet been read. An exception will be thrown
  72. /// if this is not open as a streaming consumer.
  73. /// </summary>
  74. cell read_cell();
  75. /// <summary>
  76. /// Read all the files needed from the XLSX archive and initialize all of
  77. /// the data in the workbook to match.
  78. /// </summary>
  79. void populate_workbook(bool streaming);
  80. /// <summary>
  81. /// NOTE(review): undocumented. Presumably reads the package's content types part; confirm in the .cpp.
  82. /// </summary>
  83. void read_content_types();
  84. // Metadata Property Readers
  85. /// <summary>
  86. /// Parse the core properties about the current package.
  87. /// </summary>
  88. void read_core_properties();
  89. /// <summary>
  90. /// Parse the extended properties about the current package.
  91. /// </summary>
  92. void read_extended_properties();
  93. /// <summary>
  94. /// Parse the custom properties about the current package.
  95. /// </summary>
  96. void read_custom_properties();
  97. // SpreadsheetML-Specific Package Part Readers
  98. /// <summary>
  99. /// Parse the main XML document about the workbook and then all child relationships
  100. /// of the workbook (e.g. worksheets).
  101. /// </summary>
  102. void read_office_document(const std::string &content_type);
  103. // Workbook Relationship Target Parts
  104. /// <summary>
  105. /// xl/calcChain.xml
  106. /// </summary>
  107. void read_calculation_chain();
  108. /// <summary>
  109. /// NOTE(review): undocumented. Presumably reads the workbook's connections part; confirm in the .cpp.
  110. /// </summary>
  111. void read_connections();
  112. /// <summary>
  113. /// NOTE(review): undocumented. Presumably reads a custom property part related to the workbook; confirm in the .cpp.
  114. /// </summary>
  115. void read_custom_property();
  116. /// <summary>
  117. /// NOTE(review): undocumented. Presumably reads the custom XML mappings part; confirm in the .cpp.
  118. /// </summary>
  119. void read_custom_xml_mappings();
  120. /// <summary>
  121. /// NOTE(review): undocumented. Presumably reads external workbook reference parts; confirm in the .cpp.
  122. /// </summary>
  123. void read_external_workbook_references();
  124. /// <summary>
  125. /// NOTE(review): undocumented. Presumably reads a pivot table part; confirm in the .cpp.
  126. /// </summary>
  127. void read_pivot_table();
  128. /// <summary>
  129. /// xl/sharedStrings.xml
  130. /// </summary>
  131. void read_shared_string_table();
  132. /// <summary>
  133. /// NOTE(review): undocumented. Presumably reads the shared workbook revision headers part; confirm in the .cpp.
  134. /// </summary>
  135. void read_shared_workbook_revision_headers();
  136. /// <summary>
  137. /// NOTE(review): undocumented. Presumably reads the shared workbook part; confirm in the .cpp.
  138. /// </summary>
  139. void read_shared_workbook();
  140. /// <summary>
  141. /// NOTE(review): undocumented. Presumably reads the shared workbook user data part; confirm in the .cpp.
  142. /// </summary>
  143. void read_shared_workbook_user_data();
  144. /// <summary>
  145. /// xl/styles.xml
  146. /// </summary>
  147. void read_stylesheet();
  148. /// <summary>
  149. /// xl/theme/theme1.xml
  150. /// </summary>
  151. void read_theme();
  152. /// <summary>
  153. /// NOTE(review): undocumented. Presumably reads the volatile dependencies part; confirm in the .cpp.
  154. /// </summary>
  155. void read_volatile_dependencies();
  156. /// <summary>
  157. /// xl/sheets/*.xml
  158. /// </summary>
  159. void read_chartsheet(const std::string &rel_id);
  160. /// <summary>
  161. /// xl/sheets/*.xml
  162. /// </summary>
  163. void read_dialogsheet(const std::string &rel_id);
  164. /// <summary>
  165. /// xl/sheets/*.xml
  166. /// </summary>
  167. void read_worksheet(const std::string &rel_id);
  168. /// <summary>
  169. /// xl/sheets/*.xml
  170. /// </summary>
  171. std::string read_worksheet_begin(const std::string &rel_id);
  172. /// <summary>
  173. /// xl/sheets/*.xml
  174. /// </summary>
  175. void read_worksheet_sheetdata();
  176. /// <summary>
  177. /// xl/sheets/*.xml
  178. /// </summary>
  179. worksheet read_worksheet_end(const std::string &rel_id);
  180. // Sheet Relationship Target Parts
  181. /// <summary>
  182. /// NOTE(review): undocumented. Presumably reads the comments part belonging to ws; confirm in the .cpp.
  183. /// </summary>
  184. void read_comments(worksheet ws);
  185. /// <summary>
  186. /// NOTE(review): undocumented. Presumably reads the VML drawings part belonging to ws; confirm in the .cpp.
  187. /// </summary>
  188. void read_vml_drawings(worksheet ws);
  189. /// <summary>
  190. /// NOTE(review): undocumented. Presumably reads the drawings stored at part for ws; confirm in the .cpp.
  191. /// </summary>
  192. void read_drawings(worksheet ws, const path &part);
  193. // Unknown Parts
  194. /// <summary>
  195. /// NOTE(review): undocumented. Presumably handles package parts no other reader consumed; confirm in the .cpp.
  196. /// </summary>
  197. void read_unknown_parts();
  198. /// <summary>
  199. /// NOTE(review): undocumented. Presumably handles relationships no other reader consumed; confirm in the .cpp.
  200. /// </summary>
  201. void read_unknown_relationships();
  202. /// <summary>
  203. /// NOTE(review): undocumented. Presumably reads the image stored at part; confirm in the .cpp.
  204. /// </summary>
  205. void read_image(const path &part);
  206. /// <summary>
  207. /// NOTE(review): undocumented. Presumably reads the raw binary data stored at part; confirm in the .cpp.
  208. /// </summary>
  209. void read_binary(const path &part);
  210. // Common Section Readers
  211. /// <summary>
  212. /// Read part from the archive and return a vector of relationships
  213. /// based on the content of that part.
  214. /// </summary>
  215. std::vector<relationship> read_relationships(const path &part);
  216. /// <summary>
  217. /// Read a CT_Color from the document currently being parsed.
  218. /// </summary>
  219. color read_color();
  220. /// <summary>
  221. /// Read a rich text CT_RElt from the document currently being parsed.
  222. /// </summary>
  223. rich_text read_rich_text(const xml::qname &parent);
  224. /// <summary>
  225. /// Returns true if the given document type represents an XLSX file.
  226. /// </summary>
  227. bool document_type_is_xlsx(const std::string &document_content_type);
  228. // SAX Parsing Helpers
  229. /// <summary>
  230. /// In mixed content XML elements, whitespace before and after is not ignored.
  231. /// Additionally, if PCDATA spans the boundary of the XML read buffer, it will
  232. /// be parsed as two separate strings instead of one longer string. This method
  233. /// will read character data until non-character data is peek()ed from the parser
  234. /// and returns the combined strings. This should be used when parsing mixed
  235. /// content to ignore whitespace and whenever character data is expected between
  236. /// tags.
  237. /// </summary>
  238. std::string read_text();
  239. variant read_variant(); // NOTE(review): undocumented; presumably reads a variant value from the current document; confirm in the .cpp.
  240. /// <summary>
  241. /// Read the part from the archive and parse it as XML. After this is called,
  242. /// xlsx_consumer::parser() will return a reference to the parser that reads
  243. /// this part.
  244. /// </summary>
  245. void read_part(const std::vector<relationship> &rel_chain);
  246. /// <summary>
  247. /// libstudxml will throw an exception if all attributes on an element are not
  248. /// read with xml::parser::attribute(const std::string &). This should therefore
  249. /// be called if every remaining attribute should be ignored on an element.
  250. /// </summary>
  251. void skip_attributes();
  252. /// <summary>
  253. /// Skip attribute name if it exists on the currently parsed element in the XML
  254. /// parser.
  255. /// </summary>
  256. void skip_attribute(const std::string &name);
  257. /// <summary>
  258. /// Skip attribute name if it exists on the currently parsed element in the XML
  259. /// parser.
  260. /// </summary>
  261. void skip_attribute(const xml::qname &name);
  262. /// <summary>
  263. /// Call skip_attribute on every name in names.
  264. /// </summary>
  265. void skip_attributes(const std::vector<xml::qname> &names);
  266. /// <summary>
  267. /// Call skip_attribute on every name in names.
  268. /// </summary>
  269. void skip_attributes(const std::vector<std::string> &names);
  270. /// <summary>
  271. /// Read all content in name until the closing tag is reached.
  272. /// The closing tag will not be handled after this is called.
  273. /// </summary>
  274. void skip_remaining_content(const xml::qname &name);
  275. /// <summary>
  276. /// Handles the next event in the XML parser and throws an exception
  277. /// if it is not the start of an element. Additionally sets the content
  278. /// type of the element to content.
  279. /// </summary>
  280. xml::qname expect_start_element(xml::content content);
  281. /// <summary>
  282. /// Handles the next event in the XML parser and throws an exception
  283. /// if the next element is not named name. Sets the content type of
  284. /// the element to content.
  285. /// </summary>
  286. void expect_start_element(const xml::qname &name, xml::content content);
  287. /// <summary>
  288. /// Throws an exception if the next event in the XML parser is not
  289. /// the end of element called name.
  290. /// </summary>
  291. void expect_end_element(const xml::qname &name);
  292. /// <summary>
  293. /// Returns true if the top of the parsing stack is called name and
  294. /// the end of that element hasn't been reached in the XML document.
  295. /// </summary>
  296. bool in_element(const xml::qname &name);
  297. /// <summary>
  298. /// Throws an exception or skips remaining elements depending on
  299. /// the value of THROW_ON_INVALID_XML.
  300. /// </summary>
  301. void unexpected_element(const xml::qname &name);
  302. // Properties
  303. /// <summary>
  304. /// Convenience method to dereference the pointer to the current parser to avoid
  305. /// having to use "parser_->" constantly.
  306. /// </summary>
  307. xml::parser &parser();
  308. /// <summary>
  309. /// Convenience method to access the target workbook's manifest.
  310. /// </summary>
  311. class manifest &manifest();
  312. /// <summary>
  313. /// The ZIP file containing the files that make up the OOXML package.
  314. /// </summary>
  315. std::unique_ptr<izstream> archive_;
  316. /// <summary>
  317. /// Map of sheet titles to IDs. NOTE(review): previously described as "relationship IDs", but the mapped type is std::size_t while rel ids are strings elsewhere in this class; presumably sheet IDs, confirm.
  318. /// </summary>
  319. std::unordered_map<std::string, std::size_t> sheet_title_id_map_;
  320. /// <summary>
  321. /// Map of sheet titles to indices. Used to ensure sheets are maintained
  322. /// in the correct order.
  323. /// </summary>
  324. std::unordered_map<std::string, std::size_t> sheet_title_index_map_;
  325. /// <summary>
  326. /// A reference to the workbook which is being read.
  327. /// </summary>
  328. workbook &target_;
  329. /// <summary>
  330. /// This pointer is generally set by instantiating an xml::parser in a function
  331. /// scope and then calling a read_*() method which uses xlsx_consumer::parser()
  332. /// to access the object. Defaults to nullptr so it never holds an indeterminate value.
  333. /// </summary>
  334. xml::parser *parser_ = nullptr;
  335. std::vector<xml::qname> stack_; // NOTE(review): presumably the "parsing stack" referred to by in_element(); confirm.
  336. bool preserve_space_ = false;
  337. bool streaming_ = false;
  338. std::unique_ptr<detail::cell_impl> streaming_cell_;
  339. std::unordered_map<int, std::string> shared_formulae_;
  340. std::unordered_map<std::string, std::string> array_formulae_;
  341. detail::worksheet_impl *current_worksheet_ = nullptr; // Non-owning raw pointer; defaults to nullptr like the other in-class initialized members.
  342. number_serialiser converter_;
  343. std::vector<defined_name> defined_names_;
  344. };
  345. } // namespace detail
  346. } // namespace xlnt