Updated script that can be controled by Nodejs web app
This commit is contained in:
253
lib/python3.13/site-packages/pandas/io/excel/_odfreader.py
Normal file
253
lib/python3.13/site-packages/pandas/io/excel/_odfreader.py
Normal file
@ -0,0 +1,253 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
from pandas._libs.tslibs.nattype import NaTType
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
class ODFReader(BaseExcelReader["OpenDocument"]):
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Read tables out of OpenDocument formatted files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path to be parsed or
|
||||
an open readable stream.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("odf")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[OpenDocument]:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
return OpenDocument
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> OpenDocument:
|
||||
from odf.opendocument import load
|
||||
|
||||
return load(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def empty_value(self) -> str:
|
||||
"""Property for compat with other readers."""
|
||||
return ""
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
"""Return a list of sheet names present in the document"""
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return [t.getAttribute("name") for t in tables]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return tables[index]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
|
||||
for table in tables:
|
||||
if table.getAttribute("name") == name:
|
||||
return table
|
||||
|
||||
self.close()
|
||||
raise ValueError(f"sheet {name} not found")
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar | NaTType]]:
|
||||
"""
|
||||
Parse an ODF Table into a list of lists
|
||||
"""
|
||||
from odf.table import (
|
||||
CoveredTableCell,
|
||||
TableCell,
|
||||
TableRow,
|
||||
)
|
||||
|
||||
covered_cell_name = CoveredTableCell().qname
|
||||
table_cell_name = TableCell().qname
|
||||
cell_names = {covered_cell_name, table_cell_name}
|
||||
|
||||
sheet_rows = sheet.getElementsByType(TableRow)
|
||||
empty_rows = 0
|
||||
max_row_len = 0
|
||||
|
||||
table: list[list[Scalar | NaTType]] = []
|
||||
|
||||
for sheet_row in sheet_rows:
|
||||
sheet_cells = [
|
||||
x
|
||||
for x in sheet_row.childNodes
|
||||
if hasattr(x, "qname") and x.qname in cell_names
|
||||
]
|
||||
empty_cells = 0
|
||||
table_row: list[Scalar | NaTType] = []
|
||||
|
||||
for sheet_cell in sheet_cells:
|
||||
if sheet_cell.qname == table_cell_name:
|
||||
value = self._get_cell_value(sheet_cell)
|
||||
else:
|
||||
value = self.empty_value
|
||||
|
||||
column_repeat = self._get_column_repeat(sheet_cell)
|
||||
|
||||
# Queue up empty values, writing only if content succeeds them
|
||||
if value == self.empty_value:
|
||||
empty_cells += column_repeat
|
||||
else:
|
||||
table_row.extend([self.empty_value] * empty_cells)
|
||||
empty_cells = 0
|
||||
table_row.extend([value] * column_repeat)
|
||||
|
||||
if max_row_len < len(table_row):
|
||||
max_row_len = len(table_row)
|
||||
|
||||
row_repeat = self._get_row_repeat(sheet_row)
|
||||
if len(table_row) == 0:
|
||||
empty_rows += row_repeat
|
||||
else:
|
||||
# add blank rows to our table
|
||||
table.extend([[self.empty_value]] * empty_rows)
|
||||
empty_rows = 0
|
||||
table.extend(table_row for _ in range(row_repeat))
|
||||
if file_rows_needed is not None and len(table) >= file_rows_needed:
|
||||
break
|
||||
|
||||
# Make our table square
|
||||
for row in table:
|
||||
if len(row) < max_row_len:
|
||||
row.extend([self.empty_value] * (max_row_len - len(row)))
|
||||
|
||||
return table
|
||||
|
||||
def _get_row_repeat(self, row) -> int:
|
||||
"""
|
||||
Return number of times this row was repeated
|
||||
Repeating an empty row appeared to be a common way
|
||||
of representing sparse rows in the table.
|
||||
"""
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
|
||||
|
||||
def _get_column_repeat(self, cell) -> int:
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
|
||||
|
||||
def _get_cell_value(self, cell) -> Scalar | NaTType:
|
||||
from odf.namespaces import OFFICENS
|
||||
|
||||
if str(cell) == "#N/A":
|
||||
return np.nan
|
||||
|
||||
cell_type = cell.attributes.get((OFFICENS, "value-type"))
|
||||
if cell_type == "boolean":
|
||||
if str(cell) == "TRUE":
|
||||
return True
|
||||
return False
|
||||
if cell_type is None:
|
||||
return self.empty_value
|
||||
elif cell_type == "float":
|
||||
# GH5394
|
||||
cell_value = float(cell.attributes.get((OFFICENS, "value")))
|
||||
val = int(cell_value)
|
||||
if val == cell_value:
|
||||
return val
|
||||
return cell_value
|
||||
elif cell_type == "percentage":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "string":
|
||||
return self._get_cell_string_value(cell)
|
||||
elif cell_type == "currency":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "date":
|
||||
cell_value = cell.attributes.get((OFFICENS, "date-value"))
|
||||
return pd.Timestamp(cell_value)
|
||||
elif cell_type == "time":
|
||||
stamp = pd.Timestamp(str(cell))
|
||||
# cast needed here because Scalar doesn't include datetime.time
|
||||
return cast(Scalar, stamp.time())
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(f"Unrecognized type {cell_type}")
|
||||
|
||||
def _get_cell_string_value(self, cell) -> str:
|
||||
"""
|
||||
Find and decode OpenDocument text:s tags that represent
|
||||
a run length encoded sequence of space characters.
|
||||
"""
|
||||
from odf.element import Element
|
||||
from odf.namespaces import TEXTNS
|
||||
from odf.office import Annotation
|
||||
from odf.text import S
|
||||
|
||||
office_annotation = Annotation().qname
|
||||
text_s = S().qname
|
||||
|
||||
value = []
|
||||
|
||||
for fragment in cell.childNodes:
|
||||
if isinstance(fragment, Element):
|
||||
if fragment.qname == text_s:
|
||||
spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
|
||||
value.append(" " * spaces)
|
||||
elif fragment.qname == office_annotation:
|
||||
continue
|
||||
else:
|
||||
# recursive impl needed in case of nested fragments
|
||||
# with multiple spaces
|
||||
# https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
|
||||
value.append(self._get_cell_string_value(fragment))
|
||||
else:
|
||||
value.append(str(fragment).strip("\n"))
|
||||
return "".join(value)
|
Reference in New Issue
Block a user