__init__.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. import pyarrow as pa
  2. import xlntpyarrow.lib as xpa
  3. COLUMN_TYPE_FIELD = {
  4. xpa.Cell.Type.Number: pa.float64,
  5. xpa.Cell.Type.SharedString: pa.string,
  6. xpa.Cell.Type.InlineString: pa.string,
  7. xpa.Cell.Type.FormulaString: pa.string,
  8. xpa.Cell.Type.Error: pa.string,
  9. xpa.Cell.Type.Boolean: pa.bool_,
  10. xpa.Cell.Type.Date: pa.date32,
  11. xpa.Cell.Type.Empty: pa.string,
  12. }
  13. def cell_to_pyarrow_array(cell, type):
  14. if cell.data_type() == xpa.Cell.Type.Number:
  15. return pa.array([cell.value_double()], type)
  16. elif cell.data_type() == xpa.Cell.Type.SharedString:
  17. return pa.array([cell.value_string()], type)
  18. elif cell.data_type() == xpa.Cell.Type.InlineString:
  19. return pa.array([cell.value_string()], type)
  20. elif cell.data_type() == xpa.Cell.Type.FormulaString:
  21. return pa.array([cell.value_string()], type)
  22. elif cell.data_type() == xpa.Cell.Type.Error:
  23. return pa.array([cell.value_string()], type)
  24. elif cell.data_type() == xpa.Cell.Type.Boolean:
  25. return pa.array([cell.value_bool()], type)
  26. elif cell.data_type() == xpa.Cell.Type.Date:
  27. return pa.array([cell.value_unsigned_int()], type)
  28. elif cell.data_type() == xpa.Cell.Type.Empty:
  29. return pa.array([cell.value_string()], type)
  30. def xlsx2arrow(io, sheetname):
  31. reader = xpa.StreamingWorkbookReader()
  32. reader.open(io)
  33. sheet_titles = reader.sheet_titles()
  34. sheet_title = sheet_titles[0]
  35. if sheetname is not None:
  36. if isinstance(sheetname, int):
  37. sheet_title = sheet_titles[sheetname]
  38. elif isinstance(sheetname, str):
  39. sheet_title = sheetname
  40. reader.begin_worksheet(sheet_title)
  41. column_names = []
  42. fields = []
  43. batches = []
  44. schema = None
  45. first_batch = []
  46. max_column = 0
  47. while reader.has_cell():
  48. if schema is None:
  49. cell = reader.read_cell()
  50. type = cell.data_type()
  51. if cell.row() == 1:
  52. column_names.append(cell.value_string())
  53. max_column = max(max_column, cell.column())
  54. continue
  55. elif cell.row() == 2:
  56. column_name = column_names[cell.column() - 1]
  57. if type == xpa.Cell.Type.Number and cell.format_is_date():
  58. fields.append(pa.field(column_name, pa.date32))
  59. else:
  60. fields.append(pa.field(column_name, COLUMN_TYPE_FIELD[type]()))
  61. first_batch.append(cell_to_pyarrow_array(cell, fields[-1].type))
  62. if cell.column() == max_column:
  63. schema = pa.schema(fields)
  64. print(schema)
  65. batches.append(pa.RecordBatch.from_arrays(first_batch, column_names))
  66. continue
  67. batches.append(reader.read_batch(schema, 10000))
  68. reader.end_worksheet()
  69. return pa.Table.from_batches(batches)
  70. if __name__ == '__main__':
  71. file = open('tmp.xlsx', 'rb')
  72. table = xlsx2arrow(file, 'Sheet1')
  73. print(table.to_pandas())