Updated script that can be controled by Nodejs web app

2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions
--- a/lib/python3.13/site-packages/pandas/io/formats/csvs.py
+++ b/lib/python3.13/site-packages/pandas/io/formats/csvs.py
@@ -0,0 +1,330 @@
+"""
+Module for formatting output data into CSV files.
+"""
+
+from __future__ import annotations
+
+from collections.abc import (
+    Hashable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
+import csv as csvlib
+import os
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    cast,
+)
+
+import numpy as np
+
+from pandas._libs import writers as libwriters
+from pandas._typing import SequenceNotStr
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.generic import (
+    ABCDatetimeIndex,
+    ABCIndex,
+    ABCMultiIndex,
+    ABCPeriodIndex,
+)
+from pandas.core.dtypes.missing import notna
+
+from pandas.core.indexes.api import Index
+
+from pandas.io.common import get_handle
+
+if TYPE_CHECKING:
+    from pandas._typing import (
+        CompressionOptions,
+        FilePath,
+        FloatFormatType,
+        IndexLabel,
+        StorageOptions,
+        WriteBuffer,
+        npt,
+    )
+
+    from pandas.io.formats.format import DataFrameFormatter
+
+
+_DEFAULT_CHUNKSIZE_CELLS = 100_000
+
+
+class CSVFormatter:
+    cols: npt.NDArray[np.object_]
+
+    def __init__(
+        self,
+        formatter: DataFrameFormatter,
+        path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
+        sep: str = ",",
+        cols: Sequence[Hashable] | None = None,
+        index_label: IndexLabel | None = None,
+        mode: str = "w",
+        encoding: str | None = None,
+        errors: str = "strict",
+        compression: CompressionOptions = "infer",
+        quoting: int | None = None,
+        lineterminator: str | None = "\n",
+        chunksize: int | None = None,
+        quotechar: str | None = '"',
+        date_format: str | None = None,
+        doublequote: bool = True,
+        escapechar: str | None = None,
+        storage_options: StorageOptions | None = None,
+    ) -> None:
+        self.fmt = formatter
+
+        self.obj = self.fmt.frame
+
+        self.filepath_or_buffer = path_or_buf
+        self.encoding = encoding
+        self.compression: CompressionOptions = compression
+        self.mode = mode
+        self.storage_options = storage_options
+
+        self.sep = sep
+        self.index_label = self._initialize_index_label(index_label)
+        self.errors = errors
+        self.quoting = quoting or csvlib.QUOTE_MINIMAL
+        self.quotechar = self._initialize_quotechar(quotechar)
+        self.doublequote = doublequote
+        self.escapechar = escapechar
+        self.lineterminator = lineterminator or os.linesep
+        self.date_format = date_format
+        self.cols = self._initialize_columns(cols)
+        self.chunksize = self._initialize_chunksize(chunksize)
+
+    @property
+    def na_rep(self) -> str:
+        return self.fmt.na_rep
+
+    @property
+    def float_format(self) -> FloatFormatType | None:
+        return self.fmt.float_format
+
+    @property
+    def decimal(self) -> str:
+        return self.fmt.decimal
+
+    @property
+    def header(self) -> bool | SequenceNotStr[str]:
+        return self.fmt.header
+
+    @property
+    def index(self) -> bool:
+        return self.fmt.index
+
+    def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
+        if index_label is not False:
+            if index_label is None:
+                return self._get_index_label_from_obj()
+            elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
+                # given a string for a DF with Index
+                return [index_label]
+        return index_label
+
+    def _get_index_label_from_obj(self) -> Sequence[Hashable]:
+        if isinstance(self.obj.index, ABCMultiIndex):
+            return self._get_index_label_multiindex()
+        else:
+            return self._get_index_label_flat()
+
+    def _get_index_label_multiindex(self) -> Sequence[Hashable]:
+        return [name or "" for name in self.obj.index.names]
+
+    def _get_index_label_flat(self) -> Sequence[Hashable]:
+        index_label = self.obj.index.name
+        return [""] if index_label is None else [index_label]
+
+    def _initialize_quotechar(self, quotechar: str | None) -> str | None:
+        if self.quoting != csvlib.QUOTE_NONE:
+            # prevents crash in _csv
+            return quotechar
+        return None
+
+    @property
+    def has_mi_columns(self) -> bool:
+        return bool(isinstance(self.obj.columns, ABCMultiIndex))
+
+    def _initialize_columns(
+        self, cols: Iterable[Hashable] | None
+    ) -> npt.NDArray[np.object_]:
+        # validate mi options
+        if self.has_mi_columns:
+            if cols is not None:
+                msg = "cannot specify cols with a MultiIndex on the columns"
+                raise TypeError(msg)
+
+        if cols is not None:
+            if isinstance(cols, ABCIndex):
+                cols = cols._get_values_for_csv(**self._number_format)
+            else:
+                cols = list(cols)
+            self.obj = self.obj.loc[:, cols]
+
+        # update columns to include possible multiplicity of dupes
+        # and make sure cols is just a list of labels
+        new_cols = self.obj.columns
+        return new_cols._get_values_for_csv(**self._number_format)
+
+    def _initialize_chunksize(self, chunksize: int | None) -> int:
+        if chunksize is None:
+            return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
+        return int(chunksize)
+
+    @property
+    def _number_format(self) -> dict[str, Any]:
+        """Dictionary used for storing number formatting settings."""
+        return {
+            "na_rep": self.na_rep,
+            "float_format": self.float_format,
+            "date_format": self.date_format,
+            "quoting": self.quoting,
+            "decimal": self.decimal,
+        }
+
+    @cache_readonly
+    def data_index(self) -> Index:
+        data_index = self.obj.index
+        if (
+            isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
+            and self.date_format is not None
+        ):
+            data_index = Index(
+                [x.strftime(self.date_format) if notna(x) else "" for x in data_index]
+            )
+        elif isinstance(data_index, ABCMultiIndex):
+            data_index = data_index.remove_unused_levels()
+        return data_index
+
+    @property
+    def nlevels(self) -> int:
+        if self.index:
+            return getattr(self.data_index, "nlevels", 1)
+        else:
+            return 0
+
+    @property
+    def _has_aliases(self) -> bool:
+        return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
+
+    @property
+    def _need_to_save_header(self) -> bool:
+        return bool(self._has_aliases or self.header)
+
+    @property
+    def write_cols(self) -> SequenceNotStr[Hashable]:
+        if self._has_aliases:
+            assert not isinstance(self.header, bool)
+            if len(self.header) != len(self.cols):
+                raise ValueError(
+                    f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
+                )
+            return self.header
+        else:
+            # self.cols is an ndarray derived from Index._get_values_for_csv,
+            #  so its entries are strings, i.e. hashable
+            return cast(SequenceNotStr[Hashable], self.cols)
+
+    @property
+    def encoded_labels(self) -> list[Hashable]:
+        encoded_labels: list[Hashable] = []
+
+        if self.index and self.index_label:
+            assert isinstance(self.index_label, Sequence)
+            encoded_labels = list(self.index_label)
+
+        if not self.has_mi_columns or self._has_aliases:
+            encoded_labels += list(self.write_cols)
+
+        return encoded_labels
+
+    def save(self) -> None:
+        """
+        Create the writer & save.
+        """
+        # apply compression and byte/text conversion
+        with get_handle(
+            self.filepath_or_buffer,
+            self.mode,
+            encoding=self.encoding,
+            errors=self.errors,
+            compression=self.compression,
+            storage_options=self.storage_options,
+        ) as handles:
+            # Note: self.encoding is irrelevant here
+            self.writer = csvlib.writer(
+                handles.handle,
+                lineterminator=self.lineterminator,
+                delimiter=self.sep,
+                quoting=self.quoting,
+                doublequote=self.doublequote,
+                escapechar=self.escapechar,
+                quotechar=self.quotechar,
+            )
+
+            self._save()
+
+    def _save(self) -> None:
+        if self._need_to_save_header:
+            self._save_header()
+        self._save_body()
+
+    def _save_header(self) -> None:
+        if not self.has_mi_columns or self._has_aliases:
+            self.writer.writerow(self.encoded_labels)
+        else:
+            for row in self._generate_multiindex_header_rows():
+                self.writer.writerow(row)
+
+    def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
+        columns = self.obj.columns
+        for i in range(columns.nlevels):
+            # we need at least 1 index column to write our col names
+            col_line = []
+            if self.index:
+                # name is the first column
+                col_line.append(columns.names[i])
+
+                if isinstance(self.index_label, list) and len(self.index_label) > 1:
+                    col_line.extend([""] * (len(self.index_label) - 1))
+
+            col_line.extend(columns._get_level_values(i))
+            yield col_line
+
+        # Write out the index line if it's not empty.
+        # Otherwise, we will print out an extraneous
+        # blank line between the mi and the data rows.
+        if self.encoded_labels and set(self.encoded_labels) != {""}:
+            yield self.encoded_labels + [""] * len(columns)
+
+    def _save_body(self) -> None:
+        nrows = len(self.data_index)
+        chunks = (nrows // self.chunksize) + 1
+        for i in range(chunks):
+            start_i = i * self.chunksize
+            end_i = min(start_i + self.chunksize, nrows)
+            if start_i >= end_i:
+                break
+            self._save_chunk(start_i, end_i)
+
+    def _save_chunk(self, start_i: int, end_i: int) -> None:
+        # create the data for a chunk
+        slicer = slice(start_i, end_i)
+        df = self.obj.iloc[slicer]
+
+        res = df._get_values_for_csv(**self._number_format)
+        data = list(res._iter_column_arrays())
+
+        ix = self.data_index[slicer]._get_values_for_csv(**self._number_format)
+        libwriters.write_csv_rows(
+            data,
+            ix,
+            self.nlevels,
+            self.cols,
+            self.writer,
+        )