Updated script that can be controlled by a Node.js web app

2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions
--- a/lib/python3.13/site-packages/pandas/core/interchange/utils.py
+++ b/lib/python3.13/site-packages/pandas/core/interchange/utils.py
@@ -0,0 +1,178 @@
+"""
+Utility functions and objects for implementing the interchange API.
+"""
+
+from __future__ import annotations
+
+import typing
+
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    CategoricalDtype,
+    DatetimeTZDtype,
+)
+
+import pandas as pd
+
+if typing.TYPE_CHECKING:
+    from pandas._typing import DtypeObj
+
+
+# Maps str(pyarrow.DataType) -> Arrow C data interface format string.
+# pyarrow currently has no API for this lookup.
+PYARROW_CTYPES = {
+    "null": "n",
+    "bool": "b",
+    "uint8": "C",
+    "uint16": "S",
+    "uint32": "I",
+    "uint64": "L",
+    "int8": "c",
+    "int16": "s",  # lowercase = signed; uppercase "S" is uint16
+    "int32": "i",
+    "int64": "l",
+    "halffloat": "e",  # float16
+    "float": "f",  # float32
+    "double": "g",  # float64
+    "string": "u",
+    "large_string": "U",
+    "binary": "z",
+    "time32[s]": "tts",
+    "time32[ms]": "ttm",
+    "time64[us]": "ttu",
+    "time64[ns]": "ttn",
+    "date32[day]": "tdD",
+    "date64[ms]": "tdm",
+    "timestamp[s]": "tss:",
+    "timestamp[ms]": "tsm:",
+    "timestamp[us]": "tsu:",
+    "timestamp[ns]": "tsn:",
+    "duration[s]": "tDs",
+    "duration[ms]": "tDm",
+    "duration[us]": "tDu",
+    "duration[ns]": "tDn",
+}
+
+
+class ArrowCTypes:
+    """
+    Apache Arrow C type format strings, kept as plain class attributes.
+
+    The Arrow C data interface:
+    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
+    """
+
+    NULL = "n"
+    BOOL = "b"
+    INT8 = "c"
+    UINT8 = "C"
+    INT16 = "s"
+    UINT16 = "S"
+    INT32 = "i"
+    UINT32 = "I"
+    INT64 = "l"
+    UINT64 = "L"
+    FLOAT16 = "e"
+    FLOAT32 = "f"
+    FLOAT64 = "g"
+    STRING = "u"  # utf-8
+    LARGE_STRING = "U"  # utf-8
+    DATE32 = "tdD"
+    DATE64 = "tdm"
+    # Resolution codes for the `{resolution}` placeholder in the templates below:
+    #   - seconds -> 's'
+    #   - milliseconds -> 'm'
+    #   - microseconds -> 'u'
+    #   - nanoseconds -> 'n'
+    TIMESTAMP = "ts{resolution}:{tz}"
+    TIME = "tt{resolution}"
+
+
+class Endianness:
+    """Byte-order markers for a data-type (plain string class attributes)."""
+
+    LITTLE = "<"  # little-endian
+    BIG = ">"  # big-endian
+    NATIVE = "="  # presumably the host machine's byte order -- confirm
+    NA = "|"  # presumably "not applicable" (byte order irrelevant) -- confirm
+
+
+def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
+    """
+    Translate a pandas `dtype` into its Apache Arrow C format string.
+
+    Parameters
+    ----------
+    dtype : np.dtype or pandas extension dtype
+        The dtype to translate.
+
+    Returns
+    -------
+    str
+        Arrow C data interface format string for `dtype`.
+
+    Raises
+    ------
+    NotImplementedError
+        If no format string is known for `dtype`.
+    """
+    if isinstance(dtype, CategoricalDtype):
+        return ArrowCTypes.INT64
+    if dtype == np.dtype("O"):
+        return ArrowCTypes.STRING
+    if isinstance(dtype, ArrowDtype):
+        import pyarrow as pa
+
+        arrow_type = dtype.pyarrow_dtype
+        if pa.types.is_decimal(arrow_type):
+            return f"d:{arrow_type.precision},{arrow_type.scale}"
+        if pa.types.is_timestamp(arrow_type) and arrow_type.tz is not None:
+            return f"ts{arrow_type.unit[0]}:{arrow_type.tz}"
+        table_fmt = PYARROW_CTYPES.get(str(arrow_type))
+        if table_fmt is not None:
+            return table_fmt
+
+    # Otherwise look up an ArrowCTypes attribute by the upper-cased dtype name.
+    named_fmt = getattr(ArrowCTypes, dtype.name.upper(), None)
+    if named_fmt is not None:
+        return named_fmt
+
+    if lib.is_np_dtype(dtype, "M"):
+        unit = np.datetime_data(dtype)[0]  # e.g. "ns"; only its first letter is used
+        return ArrowCTypes.TIMESTAMP.format(resolution=unit[0], tz="")
+    if isinstance(dtype, DatetimeTZDtype):
+        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)
+    if isinstance(dtype, pd.BooleanDtype):
+        return ArrowCTypes.BOOL
+    message = f"Conversion of {dtype} to Arrow C format string is not implemented."
+    raise NotImplementedError(message)
+
+
+def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
+    """
+    Collapse a multi-chunk pyarrow-backed Series into a single chunk.
+
+    - Returns `None` when `series` does not have an `ArrowDtype`, or when its
+      backing pyarrow array already has exactly one chunk (nothing to do).
+    - Otherwise, with `allow_copy=True`, returns a new Series holding the
+      combined chunks, keeping the dtype, name and index of `series`.
+    - Otherwise raises `RuntimeError`, because `allow_copy` is `False`.
+    """
+    if not isinstance(series.dtype, pd.ArrowDtype):
+        return None
+    pa_data = series.array._pa_array  # type: ignore[attr-defined]
+    if len(pa_data.chunks) == 1:
+        return None
+    if allow_copy:
+        merged = pa_data.combine_chunks()
+        attrs = {"dtype": series.dtype, "name": series.name, "index": series.index}
+        return pd.Series(merged, **attrs)
+    raise RuntimeError(
+        "Found multi-chunk pyarrow array, but `allow_copy` is False. "
+        "Please rechunk the array before calling this function, or set "
+        "`allow_copy=True`."
+    )