123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417 |
#pragma once

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <xlnt/packaging/manifest.hpp>
#include <xlnt/workbook/workbook.hpp>
#include <detail/external/include_libstudxml.hpp>
#include <detail/serialization/vector_streambuf.hpp>
#include <detail/serialization/zstream.hpp>
- class xml_helper
- {
- public:
- static bool compare_files(const std::string &left,
- const std::string &right, const std::string &content_type)
- {
- // content types are stored in unordered maps, too complicated to compare
- if (content_type == "[Content_Types].xml")
- {
- return true;
- }
- // calcChain is optional
- if (content_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.calcChain+xml")
- {
- return true;
- }
- // compared already
- if (content_type == "application/vnd.openxmlformats-package.relationships+xml")
- {
- return true;
- }
- auto is_xml = (content_type.substr(0, 12) == "application/"
- && content_type.substr(content_type.size() - 4) == "+xml")
- || content_type == "application/xml"
- || content_type == "[Content_Types].xml"
- || content_type == "application/vnd.openxmlformats-officedocument.vmlDrawing";
- if (is_xml)
- {
- return compare_xml_exact(left, right);
- }
- auto is_thumbnail = content_type == "image/jpeg";
- if (is_thumbnail)
- {
- return true;
- }
- return left == right;
- }
- static bool compare_xml_exact(const std::string &left,
- const std::string &right, bool suppress_debug_info = false)
- {
- xml::parser left_parser(left.data(), left.size(), "left");
- xml::parser right_parser(right.data(), right.size(), "right");
- bool difference = false;
- auto right_iter = right_parser.begin();
- auto is_whitespace = [](const std::string &v) {
- return v.find_first_not_of("\n\r\t ") == std::string::npos;
- };
- // Iterate through each node in the left document
- for (auto left_event : left_parser)
- {
- // Ignore entirely whitespace text
- if (left_event == xml::parser::event_type::characters
- && is_whitespace(left_parser.value())) continue;
- // There's a difference if the end of the right document is reached
- if (right_iter == right_parser.end())
- {
- difference = true;
- break;
- }
- auto right_event = *right_iter;
- // Iterate through right document until the first non-whitespace node is reached
- while (right_iter != right_parser.end()
- && right_event == xml::parser::event_type::characters
- && is_whitespace(right_parser.value()))
- {
- ++right_iter;
- right_event = *right_iter;
- }
- // There's a difference if the left node type differs from the right node type
- if (left_event != right_event)
- {
- difference = true;
- break;
- }
- if (left_event == xml::parser::event_type::start_element)
- {
- // Store a map of all attributes from left and right elements in locals
- auto left_attr_map = left_parser.attribute_map();
- auto right_attr_map = right_parser.attribute_map();
- // Iterate through all attributes in the left element
- for (auto attr : left_attr_map)
- {
- // There's a difference if the rigght element doesn't have the attribute from the left element
- if (right_attr_map.find(attr.first) == right_attr_map.end())
- {
- difference = true;
- break;
- }
- // There's a difference if the value of the right attribute doesn't match the value of the left
- if (attr.second.value != right_attr_map.at(attr.first).value)
- {
- // Unless this exception holds
- if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:vml", "shape")
- && attr.first == std::string("style"))
- {
- // for now this doesn't matter, so do nothing
- // TODO: think of a better way to do this or prevent the difference in the first place
- }
- else
- {
- difference = true;
- break;
- }
- }
- }
- // Iterate through all attributes in the right element
- for (auto attr : right_attr_map)
- {
- // There's a difference if the left element doesn't have the attribute from the right element
- if (left_attr_map.find(attr.first) == left_attr_map.end())
- {
- difference = true;
- break;
- }
- // There's a difference if the value of the left attribute doesn't match the value of the right
- if (attr.second.value != left_attr_map.at(attr.first).value)
- {
- // Unless this exception holds
- if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:vml", "shape")
- && attr.first == std::string("style"))
- {
- // for now this doesn't matter, so do nothing
- // TODO: think of a better way to do this or prevent the difference in the first place
- }
- else
- {
- difference = true;
- break;
- }
- }
- }
- // break out of outer for loop too if a difference was found in attribute for loops
- if (difference)
- {
- break;
- }
- // Finally, there's a difference if the names of the left and right elements don't match
- if (left_parser.qname() != right_parser.qname())
- {
- difference = true;
- break;
- }
- }
- else if (left_event == xml::parser::event_type::characters)
- {
- // There's a difference if the left text doesn't match the right text
- if (left_parser.value() != right_parser.value())
- {
- // Unless this exception holds
- if (left_parser.qname() == xml::qname("urn:schemas-microsoft-com:office:excel", "Anchor"))
- {
- // for now this doesn't matter, so do nothing
- // TODO: think of a better way to do this or prevent the difference in the first place
- }
- else
- {
- difference = true;
- break;
- }
- }
- }
- // Move to the next node in the right document, left node is incremented by for loop
- ++right_iter;
- }
- if (difference && !suppress_debug_info)
- {
- std::cout << "documents don't match" << std::endl;
- std::cout << "left:" << std::endl;
- for (auto c : left)
- {
- std::cout << c << std::flush;
- }
- std::cout << std::endl;
- std::cout << "right:" << std::endl;
- for (auto c : right)
- {
- std::cout << c << std::flush;
- }
- std::cout << std::endl;
- }
- return !difference;
- }
- static bool compare_relationships(const xlnt::manifest &left,
- const xlnt::manifest &right)
- {
- std::unordered_set<std::string> parts;
- for (const auto &part : left.parts())
- {
- parts.insert(part.string());
- auto left_rels = left.relationships(part);
- auto right_rels = right.relationships(part);
- if (left_rels.size() != right_rels.size())
- {
- return false;
- }
- std::unordered_map<std::string, xlnt::relationship> left_rels_map;
- for (const auto &rel : left_rels)
- {
- left_rels_map[rel.id()] = rel;
- }
- for (const auto &right_rel : right_rels)
- {
- if (left_rels_map.count(right_rel.id()) != 1)
- {
- return false;
- }
- const auto &left_rel = left_rels_map.at(right_rel.id());
- if (left_rel != right_rel)
- {
- return false;
- }
- }
- }
- for (const auto &part : right.parts())
- {
- if (parts.count(part.string()) != 1)
- {
- return false;
- }
- }
- return true;
- }
- static bool xlsx_archives_match(const std::vector<std::uint8_t> &left,
- const std::vector<std::uint8_t> &right)
- {
- xlnt::detail::vector_istreambuf left_buffer(left);
- std::istream left_stream(&left_buffer);
- xlnt::detail::izstream left_archive(left_stream);
- const auto left_info = left_archive.files();
- xlnt::detail::vector_istreambuf right_buffer(right);
- std::istream right_stream(&right_buffer);
- xlnt::detail::izstream right_archive(right_stream);
- const auto right_info = right_archive.files();
- auto difference_is_missing_calc_chain = false;
- if (std::abs(int(left_info.size()) - int(right_info.size())) == 1)
- {
- auto is_calc_chain = [](const xlnt::path &p) {
- return p.filename() == "calcChain.xml";
- };
- auto left_has_calc_chain = std::find_if(left_info.begin(), left_info.end(), is_calc_chain)
- != left_info.end();
- auto right_has_calc_chain = std::find_if(right_info.begin(), right_info.end(), is_calc_chain)
- != right_info.end();
- if (left_has_calc_chain != right_has_calc_chain)
- {
- difference_is_missing_calc_chain = true;
- }
- }
- if (left_info.size() != right_info.size() && !difference_is_missing_calc_chain)
- {
- std::cout << "left has a different number of files than right" << std::endl;
- std::cout << "left has: ";
- for (auto &info : left_info)
- {
- std::cout << info.string() << ", ";
- }
- std::cout << std::endl;
- std::cout << "right has: ";
- for (auto &info : right_info)
- {
- std::cout << info.string() << ", ";
- }
- std::cout << std::endl;
- }
- bool match = true;
- xlnt::workbook left_workbook;
- left_workbook.load(left);
- xlnt::workbook right_workbook;
- right_workbook.load(right);
- auto &left_manifest = left_workbook.manifest();
- auto &right_manifest = right_workbook.manifest();
- if (!compare_relationships(left_manifest, right_manifest))
- {
- std::cout << "relationship mismatch\n"
- << "Left:\n";
- for (const auto &part : left_manifest.parts())
- {
- std::cout << "-part: " << part.string() << '\n';
- auto rels = left_manifest.relationships(part);
- for (auto &rel : rels)
- {
- std::cout << rel.id() << ':'
- << static_cast<int>(rel.type())
- << ':' << static_cast<int>(rel.target_mode())
- << ':' << rel.source().path().string()
- << ':' << rel.target().path().string() << '\n';
- }
- }
- std::cout << "\nRight:\n";
- for (const auto &part : right_manifest.parts())
- {
- std::cout << "-part: " << part.string() << '\n';
- auto rels = right_manifest.relationships(part);
- for (auto &rel : rels)
- {
- std::cout << rel.id()
- << ':' << static_cast<int>(rel.type())
- << ':' << static_cast<int>(rel.target_mode())
- << ':' << rel.source().path().string()
- << ':' << rel.target().path().string() << '\n';
- }
- }
- return false;
- }
- for (auto left_member : left_info)
- {
- if (!right_archive.has_file(left_member))
- {
- if (difference_is_missing_calc_chain)
- {
- continue;
- }
- match = false;
- std::cout << "right is missing file: " << left_member.string() << std::endl;
- break;
- }
- auto left_content_type = left_member.string() == "[Content_Types].xml"
- ? "[Content_Types].xml"
- : left_manifest.content_type(left_member);
- auto right_content_type = left_member.string() == "[Content_Types].xml"
- ? "[Content_Types].xml"
- : right_manifest.content_type(left_member);
- if (left_content_type != right_content_type)
- {
- std::cout << "content types differ: "
- << left_member.string()
- << " "
- << left_content_type
- << " "
- << right_content_type
- << std::endl;
- match = false;
- break;
- }
- if (!compare_files(left_archive.read(left_member),
- right_archive.read(left_member), left_content_type))
- {
- std::cout << left_member.string() << std::endl;
- match = false;
- break;
- }
- }
- return match;
- }
- };
|