xlntpyarrow.lib.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. // Copyright (c) 2017-2018 Thomas Fussell
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4. // of this software and associated documentation files (the "Software"), to deal
  5. // in the Software without restriction, including without limitation the rights
  6. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. // copies of the Software, and to permit persons to whom the Software is
  8. // furnished to do so, subject to the following conditions:
  9. //
  10. // The above copyright notice and this permission notice shall be included in
  11. // all copies or substantial portions of the Software.
  12. //
  13. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. // THE SOFTWARE
  20. //
  21. // @license: http://www.opensource.org/licenses/mit-license.php
  22. // @author: see AUTHORS file
  #include <cstdint>
  #include <cstring>
  #include <exception>
  #include <arrow/api.h>
  #include <arrow/python/pyarrow.h>
  #include <pybind11/pybind11.h>
  #include <pybind11/stl.h>
  #include <xlnt/xlnt.hpp>
  #include <xlnt/workbook/streaming_workbook_reader.hpp>
  #include <python_streambuf.hpp>
  31. void import_pyarrow()
  32. {
  33. static auto imported = false;
  34. if (!imported)
  35. {
  36. if (arrow::py::import_pyarrow() != 0)
  37. {
  38. throw xlnt::exception("Import of pyarrow failed.");
  39. }
  40. imported = true;
  41. }
  42. }
  43. arrow::ArrayBuilder *make_array_builder(arrow::Type::type type)
  44. {
  45. auto pool = arrow::default_memory_pool();
  46. auto builder = static_cast<arrow::ArrayBuilder *>(nullptr);
  47. switch(type)
  48. {
  49. case arrow::Type::NA:
  50. break;
  51. case arrow::Type::UINT8:
  52. builder = new arrow::TypeTraits<arrow::UInt8Type>::BuilderType(pool);
  53. break;
  54. case arrow::Type::INT8:
  55. builder = new arrow::TypeTraits<arrow::Int8Type>::BuilderType(pool);
  56. break;
  57. case arrow::Type::UINT16:
  58. builder = new arrow::TypeTraits<arrow::UInt16Type>::BuilderType(pool);
  59. break;
  60. case arrow::Type::INT16:
  61. builder = new arrow::TypeTraits<arrow::Int16Type>::BuilderType(pool);
  62. break;
  63. case arrow::Type::UINT32:
  64. builder = new arrow::TypeTraits<arrow::UInt32Type>::BuilderType(pool);
  65. break;
  66. case arrow::Type::INT32:
  67. builder = new arrow::TypeTraits<arrow::Int32Type>::BuilderType(pool);
  68. break;
  69. case arrow::Type::UINT64:
  70. builder = new arrow::TypeTraits<arrow::UInt64Type>::BuilderType(pool);
  71. break;
  72. case arrow::Type::INT64:
  73. builder = new arrow::TypeTraits<arrow::Int64Type>::BuilderType(pool);
  74. break;
  75. case arrow::Type::DATE64:
  76. builder = new arrow::TypeTraits<arrow::Date64Type>::BuilderType(pool);
  77. break;
  78. case arrow::Type::DATE32:
  79. builder = new arrow::TypeTraits<arrow::Date32Type>::BuilderType(pool);
  80. break;
  81. /*
  82. case arrow::Type::TIMESTAMP:
  83. builder = new arrow::TypeTraits<arrow::TimestampType>::BuilderType(pool);
  84. break;
  85. case arrow::Type::TIME32:
  86. builder = new arrow::TypeTraits<arrow::Time32Type>::BuilderType(pool);
  87. break;
  88. case arrow::Type::TIME64:
  89. builder = new arrow::TypeTraits<arrow::Time64Type>::BuilderType(pool);
  90. break;
  91. */
  92. case arrow::Type::HALF_FLOAT:
  93. builder = new arrow::TypeTraits<arrow::HalfFloatType>::BuilderType(pool);
  94. break;
  95. case arrow::Type::FLOAT:
  96. builder = new arrow::TypeTraits<arrow::FloatType>::BuilderType(pool);
  97. break;
  98. case arrow::Type::DOUBLE:
  99. builder = new arrow::TypeTraits<arrow::DoubleType>::BuilderType(pool);
  100. break;
  101. /*
  102. case arrow::Type::DECIMAL:
  103. builder = new arrow::TypeTraits<arrow::DecimalType>::BuilderType(pool, type);
  104. break;
  105. */
  106. case arrow::Type::BOOL:
  107. builder = new arrow::TypeTraits<arrow::BooleanType>::BuilderType(pool);
  108. break;
  109. case arrow::Type::STRING:
  110. builder = new arrow::TypeTraits<arrow::StringType>::BuilderType(pool);
  111. break;
  112. case arrow::Type::BINARY:
  113. builder = new arrow::TypeTraits<arrow::BinaryType>::BuilderType(pool);
  114. break;
  115. /*
  116. case arrow::Type::FIXED_SIZE_BINARY:
  117. builder = new arrow::TypeTraits<arrow::FixedSizeBinaryType>::BuilderType(pool);
  118. break;
  119. case arrow::Type::LIST:
  120. builder = new arrow::TypeTraits<arrow::ListType>::BuilderType(pool);
  121. break;
  122. case arrow::Type::STRUCT:
  123. builder = new arrow::TypeTraits<arrow::StructType>::BuilderType(pool);
  124. break;
  125. case arrow::Type::UNION:
  126. builder = new arrow::TypeTraits<arrow::UnionType>::BuilderType(pool);
  127. break;
  128. case arrow::Type::DICTIONARY:
  129. builder = new arrow::TypeTraits<arrow::DictionaryType>::BuilderType(pool);
  130. break;
  131. */
  132. default:
  133. throw xlnt::exception("not implemented");
  134. }
  135. return builder;
  136. }
  137. void open_file(xlnt::streaming_workbook_reader &reader, pybind11::object file)
  138. {
  139. reader.open(std::unique_ptr<std::streambuf>(new xlnt::python_streambuf(file)));
  140. }
  141. template<typename T>
  142. T cell_value(xlnt::cell cell)
  143. {
  144. return static_cast<T>(cell.value<double>());
  145. }
  // Converts a 32-bit IEEE-754 float to the bit pattern of a 16-bit IEEE-754
  // half-precision float (1 sign bit, 5 exponent bits, 10 mantissa bits).
  //
  // Originally based on
  // https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
  // but that shortcut formula was applied to the *numeric* value of f (via
  // static_cast) rather than to its bit pattern, and it also mishandles zero,
  // subnormals, overflow and NaN. This version works on the raw bits:
  //   - the mantissa is rounded to nearest, ties to even
  //   - values too large for a half become +/-infinity
  //   - values too small for a half subnormal become +/-0
  //   - NaN stays NaN, infinity stays infinity
  std::uint16_t float_to_half(float f)
  {
      static_assert(sizeof(float) == sizeof(std::uint32_t), "float must be 32 bits wide");

      // memcpy is the well-defined way to read the object representation
      std::uint32_t bits = 0;
      std::memcpy(&bits, &f, sizeof(bits));

      const std::uint32_t sign = (bits >> 16) & 0x8000u;
      const std::uint32_t float_exponent = (bits >> 23) & 0xffu;
      const std::uint32_t mantissa = bits & 0x007fffffu;

      if (float_exponent == 0xffu)
      {
          // infinity keeps an empty mantissa, NaN gets a non-zero one
          const std::uint32_t nan_bit = (mantissa != 0) ? 0x0200u : 0u;
          return static_cast<std::uint16_t>(sign | 0x7c00u | nan_bit);
      }

      // re-bias the exponent from float (127) to half (15)
      const int half_exponent = static_cast<int>(float_exponent) - 127 + 15;

      if (half_exponent >= 0x1f)
      {
          // too large to represent: saturate to infinity
          return static_cast<std::uint16_t>(sign | 0x7c00u);
      }

      if (half_exponent <= 0)
      {
          if (half_exponent < -10)
          {
              // below half of the smallest subnormal: rounds to signed zero
              return static_cast<std::uint16_t>(sign);
          }

          // subnormal result: make the implicit leading one explicit, then
          // shift the 24-bit significand down into the 10-bit mantissa field
          const std::uint32_t significand = mantissa | 0x00800000u;
          const int shift = 14 - half_exponent; // in [14, 24]
          std::uint32_t half_mantissa = significand >> shift;

          const std::uint32_t remainder = significand & ((1u << shift) - 1u);
          const std::uint32_t halfway = 1u << (shift - 1);

          if (remainder > halfway || (remainder == halfway && (half_mantissa & 1u) != 0))
          {
              // a carry out of the mantissa yields the smallest normal number
              ++half_mantissa;
          }

          return static_cast<std::uint16_t>(sign | half_mantissa);
      }

      std::uint32_t half = (static_cast<std::uint32_t>(half_exponent) << 10) | (mantissa >> 13);
      const std::uint32_t remainder = mantissa & 0x1fffu;

      if (remainder > 0x1000u || (remainder == 0x1000u && (half & 1u) != 0))
      {
          // a carry here bumps the exponent, ending at infinity if needed
          ++half;
      }

      return static_cast<std::uint16_t>(sign | half);
  }
  155. void append_cell_value(arrow::ArrayBuilder *builder, arrow::Type::type type, xlnt::cell cell)
  156. {
  157. const status = arrow::Status::OK();
  158. switch (type)
  159. {
  160. case arrow::Type::NA:
  161. break;
  162. case arrow::Type::BOOL:
  163. status = static_cast<arrow::BooleanBuilder *>(builder)
  164. ->Append(cell.value<bool>());
  165. break;
  166. case arrow::Type::UINT8:
  167. status = static_cast<arrow::UInt8Builder *>(builder)
  168. ->Append(cell_value<std::uint8_t>(cell));
  169. break;
  170. case arrow::Type::INT8:
  171. status = static_cast<arrow::Int8Builder *>(builder)
  172. ->Append(cell_value<std::uint8_t>(cell));
  173. break;
  174. case arrow::Type::UINT16:
  175. status = static_cast<arrow::UInt16Builder *>(builder)
  176. ->Append(cell_value<std::uint16_t>(cell));
  177. break;
  178. case arrow::Type::INT16:
  179. status = static_cast<arrow::Int16Builder *>(builder)
  180. ->Append(cell_value<std::int16_t>(cell));
  181. break;
  182. case arrow::Type::UINT32:
  183. status = static_cast<arrow::UInt32Builder *>(builder)
  184. ->Append(cell_value<std::uint32_t>(cell));
  185. break;
  186. case arrow::Type::INT32:
  187. status = static_cast<arrow::Int32Builder *>(builder)
  188. ->Append(cell_value<std::int32_t>(cell));
  189. break;
  190. case arrow::Type::UINT64:
  191. status = static_cast<arrow::UInt64Builder *>(builder)
  192. ->Append(cell_value<std::uint64_t>(cell));
  193. break;
  194. case arrow::Type::INT64:
  195. status = static_cast<arrow::Int64Builder *>(builder)
  196. ->Append(cell_value<std::int64_t>(cell));
  197. break;
  198. case arrow::Type::HALF_FLOAT:
  199. status = static_cast<arrow::HalfFloatBuilder *>(builder)
  200. ->Append(float_to_half(cell_value<float>(cell)));
  201. break;
  202. case arrow::Type::FLOAT:
  203. status = static_cast<arrow::FloatBuilder *>(builder)
  204. ->Append(cell_value<float>(cell));
  205. break;
  206. case arrow::Type::DOUBLE:
  207. status = static_cast<arrow::DoubleBuilder *>(builder)
  208. ->Append(cell_value<double>(cell));
  209. break;
  210. case arrow::Type::STRING:
  211. status = static_cast<arrow::StringBuilder *>(builder)
  212. ->Append(cell.value<std::string>());
  213. break;
  214. case arrow::Type::BINARY:
  215. status = static_cast<arrow::BinaryBuilder *>(builder)
  216. ->Append(cell.value<std::string>());
  217. break;
  218. case arrow::Type::FIXED_SIZE_BINARY:
  219. status = static_cast<arrow::FixedSizeBinaryBuilder *>(builder)
  220. ->Append(cell.value<std::string>());
  221. break;
  222. case arrow::Type::DATE32:
  223. status = static_cast<arrow::Date32Builder *>(builder)
  224. ->Append(cell_value<arrow::Date32Type::c_type>(cell));
  225. break;
  226. case arrow::Type::DATE64:
  227. status = static_cast<arrow::Date64Builder *>(builder)
  228. ->Append(cell_value<arrow::Date64Type::c_type>(cell));
  229. break;
  230. case arrow::Type::TIMESTAMP:
  231. status = static_cast<arrow::TimestampBuilder *>(builder)
  232. ->Append(cell_value<arrow::TimestampType::c_type>(cell));
  233. break;
  234. case arrow::Type::TIME32:
  235. status = static_cast<arrow::Time32Builder *>(builder)
  236. ->Append(cell_value<arrow::Time32Type::c_type>(cell));
  237. break;
  238. case arrow::Type::TIME64:
  239. status = static_cast<arrow::Time64Builder *>(builder)
  240. ->Append(cell_value<arrow::Time64Type::c_type>(cell));
  241. break;
  242. /*
  243. case arrow::Type::INTERVAL:
  244. status = static_cast<arrow::IntervalBuilder *>(builder)
  245. ->Append(cell_value<std::int64_t>(cell));
  246. break;
  247. case arrow::Type::DECIMAL:
  248. status = static_cast<arrow::DecimalBuilder *>(builder)
  249. ->Append(cell.value<std::string>());
  250. break;
  251. case arrow::Type::LIST:
  252. status = static_cast<arrow::ListBuilder *>(builder)
  253. ->Append(cell.value<std::string>());
  254. break;
  255. case arrow::Type::STRUCT:
  256. status = static_cast<arrow::StructBuilder *>(builder)
  257. ->Append(cell.value<std::string>());
  258. break;
  259. case arrow::Type::UNION:
  260. status = static_cast<arrow::UnionBuilder *>(builder)
  261. ->Append(cell.value<std::string>());
  262. break;
  263. case arrow::Type::DICTIONARY:
  264. status = static_cast<arrow::DictionaryBuilder *>(builder)
  265. ->Append(cell.value<std::string>());
  266. break;
  267. */
  268. default:
  269. throw xlnt::exception("not implemented");
  270. }
  271. if (status != arrow::Status::OK())
  272. {
  273. throw xlnt::exception("Append failed");
  274. }
  275. }
  276. pybind11::handle read_batch(xlnt::streaming_workbook_reader &reader,
  277. pybind11::object pyschema, int max_rows)
  278. {
  279. import_pyarrow();
  280. std::shared_ptr<arrow::Schema> schema;
  281. arrow::py::unwrap_schema(pyschema.ptr(), &schema);
  282. std::vector<arrow::Type::type> column_types;
  283. for (auto i = 0; i < schema->num_fields(); ++i)
  284. {
  285. column_types.push_back(schema->field(i)->type()->id());
  286. }
  287. auto builders = std::vector<std::unique_ptr<arrow::ArrayBuilder>>();
  288. for (auto type : column_types)
  289. {
  290. builders.emplace_back(make_array_builder(type));
  291. }
  292. auto row = std::int64_t(0);
  293. while (row < max_rows)
  294. {
  295. if (!reader.has_cell()) break;
  296. for (auto column = 0; column < schema->num_fields(); ++column)
  297. {
  298. if (!reader.has_cell()) break;
  299. auto cell = reader.read_cell();
  300. auto zero_indexed_column = cell.column().index - 1;
  301. auto column_type = column_types.at(zero_indexed_column);
  302. auto builder = builders.at(zero_indexed_column).get();
  303. append_cell_value(builder, column_type, cell);
  304. }
  305. ++row;
  306. }
  307. auto columns = std::vector<std::shared_ptr<arrow::Array>>();
  308. for (auto &builder : builders)
  309. {
  310. std::shared_ptr<arrow::Array> column;
  311. builder->Finish(&column);
  312. columns.emplace_back(column);
  313. }
  314. auto batch_pointer = std::make_shared<arrow::RecordBatch>(schema, row, columns);
  315. auto batch_object = arrow::py::wrap_record_batch(batch_pointer);
  316. auto batch_handle = pybind11::handle(batch_object); // don't need to incr. reference count, right?
  317. return batch_handle;
  318. }
  319. PYBIND11_MODULE(lib, m)
  320. {
  321. m.doc() = "streaming read/write interface for C++ XLSX library xlnt";
  322. pybind11::class_<xlnt::streaming_workbook_reader>(m, "StreamingWorkbookReader")
  323. .def(pybind11::init<>())
  324. .def("has_cell", &xlnt::streaming_workbook_reader::has_cell)
  325. .def("read_cell", &xlnt::streaming_workbook_reader::read_cell)
  326. .def("has_worksheet", &xlnt::streaming_workbook_reader::has_worksheet)
  327. .def("begin_worksheet", &xlnt::streaming_workbook_reader::begin_worksheet)
  328. .def("end_worksheet", &xlnt::streaming_workbook_reader::end_worksheet)
  329. .def("sheet_titles", &xlnt::streaming_workbook_reader::sheet_titles)
  330. .def("open", &open_file)
  331. .def("read_batch", &read_batch);
  332. pybind11::class_<xlnt::worksheet>(m, "Worksheet");
  333. pybind11::class_<xlnt::cell> cell(m, "Cell");
  334. cell.def("value_string", [](xlnt::cell &cell)
  335. {
  336. return cell.value<std::string>();
  337. })
  338. .def("value_bool", [](xlnt::cell &cell)
  339. {
  340. return cell.value<bool>();
  341. })
  342. .def("value_unsigned_int", [](xlnt::cell &cell)
  343. {
  344. return cell.value<unsigned int>();
  345. })
  346. .def("value_double", [](xlnt::cell &cell)
  347. {
  348. return cell.value<double>();
  349. })
  350. .def("data_type", [](xlnt::cell &cell)
  351. {
  352. return cell.data_type();
  353. })
  354. .def("row", &xlnt::cell::row)
  355. .def("column", [](xlnt::cell &cell)
  356. {
  357. return cell.column().index;
  358. })
  359. .def("format_is_date", [](xlnt::cell &cell)
  360. {
  361. return cell.has_format() && cell.number_format().is_date_format();
  362. });
  363. pybind11::enum_<xlnt::cell::type>(cell, "Type")
  364. .value("Empty", xlnt::cell::type::empty)
  365. .value("Boolean", xlnt::cell::type::boolean)
  366. .value("Date", xlnt::cell::type::date)
  367. .value("Error", xlnt::cell::type::error)
  368. .value("InlineString", xlnt::cell::type::inline_string)
  369. .value("Number", xlnt::cell::type::number)
  370. .value("SharedString", xlnt::cell::type::shared_string)
  371. .value("FormulaString", xlnt::cell::type::formula_string);
  372. }