from typing import List
from pandas.compat._optional import import_optional_dependency
import pandas as pd
from pandas._typing import FilePathOrBuffer, Scalar
from pandas.io.excel._base import _BaseExcelReader
class _ODFReader(_BaseExcelReader):
"""Read tables out of OpenDocument formatted files
Parameters
----------
filepath_or_buffer: string, path to be parsed or
an open readable stream.
"""
def __init__(self, filepath_or_buffer: FilePathOrBuffer):
import_optional_dependency("odf")
super().__init__(filepath_or_buffer)
@property
def _workbook_class(self):
from odf.opendocument import OpenDocument
return OpenDocument
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
from odf.opendocument import load
return load(filepath_or_buffer)
@property
def empty_value(self) -> str:
"""Property for compat with other readers."""
return ""
@property
def sheet_names(self) -> List[str]:
"""Return a list of sheet names present in the document"""
from odf.table import Table
tables = self.book.getElementsByType(Table)
return [t.getAttribute("name") for t in tables]
def get_sheet_by_index(self, index: int):
from odf.table import Table
tables = self.book.getElementsByType(Table)
return tables[index]
def get_sheet_by_name(self, name: str):
from odf.table import Table
tables = self.book.getElementsByType(Table)
for table in tables:
if table.getAttribute("name") == name:
return table
raise ValueError("sheet {name} not found".format(name))
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
"""Parse an ODF Table into a list of lists
"""
from odf.table import CoveredTableCell, TableCell, TableRow
covered_cell_name = CoveredTableCell().qname
table_cell_name = TableCell().qname
cell_names = {covered_cell_name, table_cell_name}
sheet_rows = sheet.getElementsByType(TableRow)
empty_rows = 0
max_row_len = 0
table = [] # type: List[List[Scalar]]
for i, sheet_row in enumerate(sheet_rows):
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row = [] # type: List[Scalar]
for j, sheet_cell in enumerate(sheet_cells):
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell, convert_float)
else:
value = self.empty_value
column_repeat = self._get_column_repeat(sheet_cell)
# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
if max_row_len < len(table_row):
max_row_len = len(table_row)
row_repeat = self._get_row_repeat(sheet_row)
if self._is_empty_row(sheet_row):
empty_rows += row_repeat
else:
# add blank rows to our table
table.extend([[self.empty_value]] * empty_rows)
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
# Make our table square
for row in table:
if len(row) < max_row_len:
row.extend([self.empty_value] * (max_row_len - len(row)))
return table
def _get_row_repeat(self, row) -> int:
"""Return number of times this row was repeated
Repeating an empty row appeared to be a common way
of representing sparse rows in the table.
"""
from odf.namespaces import TABLENS
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
def _get_column_repeat(self, cell) -> int:
from odf.namespaces import TABLENS
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
def _is_empty_row(self, row) -> bool:
"""Helper function to find empty rows
"""
for column in row.childNodes:
if len(column.childNodes) > 0:
return False
return True
def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
from odf.namespaces import OFFICENS
cell_type = cell.attributes.get((OFFICENS, "value-type"))
if cell_type == "boolean":
if str(cell) == "TRUE":
return True
return False
if cell_type is None:
return self.empty_value
elif cell_type == "float":
# GH5394
cell_value = float(cell.attributes.get((OFFICENS, "value")))
if cell_value == 0.0 and str(cell) != cell_value: # NA handling
return str(cell)
if convert_float:
val = int(cell_value)
if val == cell_value:
return val
return cell_value
elif cell_type == "percentage":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "string":
return str(cell)
elif cell_type == "currency":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "date":
cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.to_datetime(cell_value)
elif cell_type == "time":
return pd.to_datetime(str(cell)).time()
else:
raise ValueError("Unrecognized type {}".format(cell_type))