// xml_helper.hpp
#pragma once

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <xlnt/packaging/manifest.hpp>
#include <xlnt/workbook/workbook.hpp>
#include <detail/external/include_libstudxml.hpp>
#include <detail/serialization/vector_streambuf.hpp>
#include <detail/serialization/zstream.hpp>
  9. class xml_helper
  10. {
  11. public:
  12. static bool compare_files(const std::string &left,
  13. const std::string &right, const std::string &content_type)
  14. {
  15. // content types are stored in unordered maps, too complicated to compare
  16. if (content_type == "[Content_Types].xml")
  17. {
  18. return true;
  19. }
  20. // calcChain is optional
  21. if (content_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.calcChain+xml")
  22. {
  23. return true;
  24. }
  25. // compared already
  26. if (content_type == "application/vnd.openxmlformats-package.relationships+xml")
  27. {
  28. return true;
  29. }
  30. auto is_xml = (content_type.substr(0, 12) == "application/"
  31. && content_type.substr(content_type.size() - 4) == "+xml")
  32. || content_type == "application/xml"
  33. || content_type == "[Content_Types].xml"
  34. || content_type == "application/vnd.openxmlformats-officedocument.vmlDrawing";
  35. if (is_xml)
  36. {
  37. return compare_xml_exact(left, right);
  38. }
  39. auto is_thumbnail = content_type == "image/jpeg";
  40. if (is_thumbnail)
  41. {
  42. return true;
  43. }
  44. return left == right;
  45. }
  46. static bool compare_xml_exact(const std::string &left,
  47. const std::string &right, bool suppress_debug_info = false)
  48. {
  49. xml::parser left_parser(left.data(), left.size(), "left");
  50. xml::parser right_parser(right.data(), right.size(), "right");
  51. bool difference = false;
  52. auto right_iter = right_parser.begin();
  53. auto is_whitespace = [](const std::string &v) {
  54. return v.find_first_not_of("\n\r\t ") == std::string::npos;
  55. };
  56. // Iterate through each node in the left document
  57. for (auto left_event : left_parser)
  58. {
  59. // Ignore entirely whitespace text
  60. if (left_event == xml::parser::event_type::characters
  61. && is_whitespace(left_parser.value())) continue;
  62. // There's a difference if the end of the right document is reached
  63. if (right_iter == right_parser.end())
  64. {
  65. difference = true;
  66. break;
  67. }
  68. auto right_event = *right_iter;
  69. // Iterate through right document until the first non-whitespace node is reached
  70. while (right_iter != right_parser.end()
  71. && right_event == xml::parser::event_type::characters
  72. && is_whitespace(right_parser.value()))
  73. {
  74. ++right_iter;
  75. right_event = *right_iter;
  76. }
  77. // There's a difference if the left node type differs from the right node type
  78. if (left_event != right_event)
  79. {
  80. difference = true;
  81. break;
  82. }
  83. if (left_event == xml::parser::event_type::start_element)
  84. {
  85. // Store a map of all attributes from left and right elements in locals
  86. auto left_attr_map = left_parser.attribute_map();
  87. auto right_attr_map = right_parser.attribute_map();
  88. // Iterate through all attributes in the left element
  89. for (auto attr : left_attr_map)
  90. {
  91. // There's a difference if the rigght element doesn't have the attribute from the left element
  92. if (right_attr_map.find(attr.first) == right_attr_map.end())
  93. {
  94. difference = true;
  95. break;
  96. }
  97. // There's a difference if the value of the right attribute doesn't match the value of the left
  98. if (attr.second.value != right_attr_map.at(attr.first).value)
  99. {
  100. // Unless this exception holds
  101. if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:vml", "shape")
  102. && attr.first == std::string("style"))
  103. {
  104. // for now this doesn't matter, so do nothing
  105. // TODO: think of a better way to do this or prevent the difference in the first place
  106. }
  107. else
  108. {
  109. difference = true;
  110. break;
  111. }
  112. }
  113. }
  114. // Iterate through all attributes in the right element
  115. for (auto attr : right_attr_map)
  116. {
  117. // There's a difference if the left element doesn't have the attribute from the right element
  118. if (left_attr_map.find(attr.first) == left_attr_map.end())
  119. {
  120. difference = true;
  121. break;
  122. }
  123. // There's a difference if the value of the left attribute doesn't match the value of the right
  124. if (attr.second.value != left_attr_map.at(attr.first).value)
  125. {
  126. // Unless this exception holds
  127. if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:vml", "shape")
  128. && attr.first == std::string("style"))
  129. {
  130. // for now this doesn't matter, so do nothing
  131. // TODO: think of a better way to do this or prevent the difference in the first place
  132. }
  133. else
  134. {
  135. difference = true;
  136. break;
  137. }
  138. }
  139. }
  140. // break out of outer for loop too if a difference was found in attribute for loops
  141. if (difference)
  142. {
  143. break;
  144. }
  145. // Finally, there's a difference if the names of the left and right elements don't match
  146. if (left_parser.qname() != right_parser.qname())
  147. {
  148. difference = true;
  149. break;
  150. }
  151. }
  152. else if (left_event == xml::parser::event_type::characters)
  153. {
  154. // There's a difference if the left text doesn't match the right text
  155. if (left_parser.value() != right_parser.value())
  156. {
  157. // Unless this exception holds
  158. if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:office:excel", "Anchor"))
  159. {
  160. // for now this doesn't matter, so do nothing
  161. // TODO: think of a better way to do this or prevent the difference in the first place
  162. }
  163. else
  164. {
  165. difference = true;
  166. break;
  167. }
  168. }
  169. }
  170. // Move to the next node in the right document, left node is incremented by for loop
  171. ++right_iter;
  172. }
  173. if (difference && !suppress_debug_info)
  174. {
  175. std::cout << "documents don't match" << std::endl;
  176. std::cout << "left:" << std::endl;
  177. for (auto c : left)
  178. {
  179. std::cout << c << std::flush;
  180. }
  181. std::cout << std::endl;
  182. std::cout << "right:" << std::endl;
  183. for (auto c : right)
  184. {
  185. std::cout << c << std::flush;
  186. }
  187. std::cout << std::endl;
  188. }
  189. return !difference;
  190. }
  191. static bool compare_relationships(const xlnt::manifest &left,
  192. const xlnt::manifest &right)
  193. {
  194. std::unordered_set<std::string> parts;
  195. for (const auto &part : left.parts())
  196. {
  197. parts.insert(part.string());
  198. auto left_rels = left.relationships(part);
  199. auto right_rels = right.relationships(part);
  200. if (left_rels.size() != right_rels.size())
  201. {
  202. return false;
  203. }
  204. std::unordered_map<std::string, xlnt::relationship> left_rels_map;
  205. for (const auto &rel : left_rels)
  206. {
  207. left_rels_map[rel.id()] = rel;
  208. }
  209. for (const auto &right_rel : right_rels)
  210. {
  211. if (left_rels_map.count(right_rel.id()) != 1)
  212. {
  213. return false;
  214. }
  215. const auto &left_rel = left_rels_map.at(right_rel.id());
  216. if (left_rel != right_rel)
  217. {
  218. return false;
  219. }
  220. }
  221. }
  222. for (const auto &part : right.parts())
  223. {
  224. if (parts.count(part.string()) != 1)
  225. {
  226. return false;
  227. }
  228. }
  229. return true;
  230. }
  231. static bool xlsx_archives_match(const std::vector<std::uint8_t> &left,
  232. const std::vector<std::uint8_t> &right)
  233. {
  234. xlnt::detail::vector_istreambuf left_buffer(left);
  235. std::istream left_stream(&left_buffer);
  236. xlnt::detail::izstream left_archive(left_stream);
  237. const auto left_info = left_archive.files();
  238. xlnt::detail::vector_istreambuf right_buffer(right);
  239. std::istream right_stream(&right_buffer);
  240. xlnt::detail::izstream right_archive(right_stream);
  241. const auto right_info = right_archive.files();
  242. auto difference_is_missing_calc_chain = false;
  243. if (std::abs(int(left_info.size()) - int(right_info.size())) == 1)
  244. {
  245. auto is_calc_chain = [](const xlnt::path &p) {
  246. return p.filename() == "calcChain.xml";
  247. };
  248. auto left_has_calc_chain = std::find_if(left_info.begin(), left_info.end(), is_calc_chain)
  249. != left_info.end();
  250. auto right_has_calc_chain = std::find_if(right_info.begin(), right_info.end(), is_calc_chain)
  251. != right_info.end();
  252. if (left_has_calc_chain != right_has_calc_chain)
  253. {
  254. difference_is_missing_calc_chain = true;
  255. }
  256. }
  257. if (left_info.size() != right_info.size() && !difference_is_missing_calc_chain)
  258. {
  259. std::cout << "left has a different number of files than right" << std::endl;
  260. std::cout << "left has: ";
  261. for (auto &info : left_info)
  262. {
  263. std::cout << info.string() << ", ";
  264. }
  265. std::cout << std::endl;
  266. std::cout << "right has: ";
  267. for (auto &info : right_info)
  268. {
  269. std::cout << info.string() << ", ";
  270. }
  271. std::cout << std::endl;
  272. }
  273. bool match = true;
  274. xlnt::workbook left_workbook;
  275. left_workbook.load(left);
  276. xlnt::workbook right_workbook;
  277. right_workbook.load(right);
  278. auto &left_manifest = left_workbook.manifest();
  279. auto &right_manifest = right_workbook.manifest();
  280. if (!compare_relationships(left_manifest, right_manifest))
  281. {
  282. std::cout << "relationship mismatch\n"
  283. << "Left:\n";
  284. for (const auto &part : left_manifest.parts())
  285. {
  286. std::cout << "-part: " << part.string() << '\n';
  287. auto rels = left_manifest.relationships(part);
  288. for (auto &rel : rels)
  289. {
  290. std::cout << rel.id() << ':'
  291. << static_cast<int>(rel.type())
  292. << ':' << static_cast<int>(rel.target_mode())
  293. << ':' << rel.source().path().string()
  294. << ':' << rel.target().path().string() << '\n';
  295. }
  296. }
  297. std::cout << "\nRight:\n";
  298. for (const auto &part : right_manifest.parts())
  299. {
  300. std::cout << "-part: " << part.string() << '\n';
  301. auto rels = right_manifest.relationships(part);
  302. for (auto &rel : rels)
  303. {
  304. std::cout << rel.id()
  305. << ':' << static_cast<int>(rel.type())
  306. << ':' << static_cast<int>(rel.target_mode())
  307. << ':' << rel.source().path().string()
  308. << ':' << rel.target().path().string() << '\n';
  309. }
  310. }
  311. return false;
  312. }
  313. for (auto left_member : left_info)
  314. {
  315. if (!right_archive.has_file(left_member))
  316. {
  317. if (difference_is_missing_calc_chain)
  318. {
  319. continue;
  320. }
  321. match = false;
  322. std::cout << "right is missing file: " << left_member.string() << std::endl;
  323. break;
  324. }
  325. auto left_content_type = left_member.string() == "[Content_Types].xml"
  326. ? "[Content_Types].xml"
  327. : left_manifest.content_type(left_member);
  328. auto right_content_type = left_member.string() == "[Content_Types].xml"
  329. ? "[Content_Types].xml"
  330. : right_manifest.content_type(left_member);
  331. if (left_content_type != right_content_type)
  332. {
  333. std::cout << "content types differ: "
  334. << left_member.string()
  335. << " "
  336. << left_content_type
  337. << " "
  338. << right_content_type
  339. << std::endl;
  340. match = false;
  341. break;
  342. }
  343. if (!compare_files(left_archive.read(left_member),
  344. right_archive.read(left_member), left_content_type))
  345. {
  346. std::cout << left_member.string() << std::endl;
  347. match = false;
  348. break;
  349. }
  350. }
  351. return match;
  352. }
  353. };