Updated script so it can be controlled by a Node.js web app

mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions


@@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
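
A minimal usage sketch of the names re-exported above, assuming a pandas build where this module is importable under its upstream path (pandas.core.arrays.arrow):

import pyarrow as pa
from pandas.core.arrays.arrow import ArrowExtensionArray

# Wrap a pyarrow array that contains a null; the extension array preserves it
arr = ArrowExtensionArray(pa.array([1, None, 3]))
assert len(arr) == 3
assert arr.isna().tolist() == [False, True, False]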


@@ -0,0 +1,66 @@
from __future__ import annotations
import warnings
import numpy as np
import pyarrow
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
def fallback_performancewarning(version: str | None = None) -> None:
"""
Raise a PerformanceWarning for falling back to ExtensionArray's
non-pyarrow method
"""
msg = "Falling back on a non-pyarrow code path which may decrease performance."
if version is not None:
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
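
Two short illustrations of the helpers in this file, assuming both functions above are in scope. The first captures the PerformanceWarning emitted by fallback_performancewarning (the version string "13" is a hypothetical placeholder); the second shows that pyarrow_array_to_numpy_and_mask returns the raw data plus a validity mask, where False marks missing entries and the bytes at null positions are undefined:

import warnings
import numpy as np
import pyarrow as pa
from pandas.errors import PerformanceWarning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    fallback_performancewarning("13")  # hypothetical version string
assert issubclass(caught[0].category, PerformanceWarning)
assert "pyarrow >=13" in str(caught[0].message)

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
assert mask.tolist() == [True, False, True]  # False marks the missing slot
assert data[0] == 1 and data[2] == 3         # data[1] holds undefined bytes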


@@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""
from __future__ import annotations
from abc import (
ABCMeta,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
cast,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.core.dtypes.common import is_list_like
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
from pandas.core.dtypes.dtypes import ArrowDtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas import (
DataFrame,
Series,
)
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)
@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass
def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
@property
def _pa_array(self):
return self._data.array._pa_array
class ListAccessor(ArrowAccessor):
"""
Accessor object for list data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow list data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg="Can only use the '.list' accessor with "
"'list[pyarrow]' dtype, not {dtype}.",
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return (
pa.types.is_list(pyarrow_dtype)
or pa.types.is_fixed_size_list(pyarrow_dtype)
or pa.types.is_large_list(pyarrow_dtype)
)
def len(self) -> Series:
"""
Return the length of each list in the Series.
Returns
-------
pandas.Series
The length of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.len()
0 3
1 1
dtype: int32[pyarrow]
"""
from pandas import Series
value_lengths = pc.list_value_length(self._pa_array)
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
def __getitem__(self, key: int | slice) -> Series:
"""
Index or slice lists in the Series.
Parameters
----------
key : int | slice
Index or slice of indices to access from each list.
Returns
-------
pandas.Series
The element or slice at the requested position of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list[0]
0 1
1 3
dtype: int64[pyarrow]
"""
from pandas import Series
if isinstance(key, int):
# TODO: Support negative keys; pyarrow does not allow
# the element index to be an array.
# if key < 0:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(element, dtype=ArrowDtype(element.type))
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)
# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
if start is None:
# TODO: When adding negative step support,
# this should be set to the last element of the array
# when step is negative.
start = 0
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
def __iter__(self) -> Iterator:
raise TypeError(f"'{type(self).__name__}' object is not iterable")
def flatten(self) -> Series:
"""
Flatten list values.
Returns
-------
pandas.Series
The data from all lists in the series flattened.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.flatten()
0 1
1 2
2 3
3 3
dtype: int64[pyarrow]
"""
from pandas import Series
flattened = pc.list_flatten(self._pa_array)
return Series(flattened, dtype=ArrowDtype(flattened.type))
class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow struct data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)
@property
def dtypes(self) -> Series:
"""
Return the dtype object of each child field of the struct.
Returns
-------
pandas.Series
The data type of each child field.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes
version int64[pyarrow]
project string[pyarrow]
dtype: object
"""
from pandas import (
Index,
Series,
)
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
def field(
self,
name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
) -> Series:
"""
Extract a child field of a struct as a Series.
Parameters
----------
name_or_index : str | bytes | int | expression | list
Name or index of the child field to extract.
For list-like inputs, this will index into a nested
struct.
Returns
-------
pandas.Series
The data corresponding to the selected child field.
See Also
--------
Series.struct.explode : Return all child fields as a DataFrame.
Notes
-----
The name of the resulting Series will be set using the following
rules:
- For string, bytes, or integer `name_or_index` (or a list of these, for
a nested selection), the Series name is set to the selected
field's name.
- For a :class:`pyarrow.compute.Expression`, this is set to
the string form of the expression.
- For list-like `name_or_index`, the name will be set to the
name of the final field selected.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
Extract by field name.
>>> s.struct.field("project")
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
Extract by field index.
>>> s.struct.field(0)
0 1
1 2
2 1
Name: version, dtype: int64[pyarrow]
Or an expression
>>> import pyarrow.compute as pc
>>> s.struct.field(pc.field("project"))
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
For nested struct types, you can pass a list of values to index
multiple levels:
>>> version_type = pa.struct([
... ("major", pa.int64()),
... ("minor", pa.int64()),
... ])
>>> s = pd.Series(
... [
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", version_type), ("project", pa.string())]
... ))
... )
>>> s.struct.field(["version", "minor"])
0 5
1 1
2 26
Name: minor, dtype: int64[pyarrow]
>>> s.struct.field([0, 0])
0 1
1 2
2 1
Name: major, dtype: int64[pyarrow]
"""
from pandas import Series
def get_name(
level_name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
data: pa.ChunkedArray,
):
if isinstance(level_name_or_index, int):
name = data.type.field(level_name_or_index).name
elif isinstance(level_name_or_index, (str, bytes)):
name = level_name_or_index
elif isinstance(level_name_or_index, pc.Expression):
name = str(level_name_or_index)
elif is_list_like(level_name_or_index):
# For nested input like [2, 1, 2],
# iteratively get the struct and field name. The last
# one is used for the name of the resulting Series.
level_name_or_index = list(reversed(level_name_or_index))
selected = data
while level_name_or_index:
# we need the cast, otherwise mypy complains about
# getting ints, bytes, or str here, which isn't possible.
level_name_or_index = cast(list, level_name_or_index)
name_or_index = level_name_or_index.pop()
name = get_name(name_or_index, selected)
selected = selected.type.field(selected.type.get_field_index(name))
name = selected.name
else:
raise ValueError(
"name_or_index must be an int, str, bytes, "
"pyarrow.compute.Expression, or list of those"
)
return name
pa_arr = self._data.array._pa_array
name = get_name(name_or_index, pa_arr)
field_arr = pc.struct_field(pa_arr, name_or_index)
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._data.index,
name=name,
)
def explode(self) -> DataFrame:
"""
Extract all child fields of a struct as a DataFrame.
Returns
-------
pandas.DataFrame
The data corresponding to all child fields.
See Also
--------
Series.struct.field : Return a single child field as a Series.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.explode()
version project
0 1 pandas
1 2 pandas
2 1 numpy
"""
from pandas import concat
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)
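
An end-to-end sketch of the two accessors defined above, assuming a pandas build that wires them up as Series.list and Series.struct (and pyarrow >= 11 for list slicing); the expected results are noted in comments:

import pandas as pd
import pyarrow as pa

s = pd.Series([[1, 2, 3], [3]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))
s.list.len()       # lengths 3 and 1
s.list[0]          # first element of each list: 1, 3
s.list[0:2]        # first two elements of each list (pyarrow >= 11)
s.list.flatten()   # 1, 2, 3, 3

t = pd.Series(
    [{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"}],
    dtype=pd.ArrowDtype(pa.struct([("version", pa.int64()), ("project", pa.string())])),
)
t.struct.dtypes            # one ArrowDtype per child field
t.struct.field("project")  # Series named "project"
t.struct.explode()         # DataFrame with columns "version" and "project"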

File diff suppressed because it is too large


@@ -0,0 +1,174 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pyarrow
from pandas.compat import pa_version_under14p1
from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
)
from pandas.core.arrays.interval import VALID_CLOSED
if TYPE_CHECKING:
from pandas._typing import IntervalClosedType
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), self.freq))
def to_pandas_dtype(self) -> PeriodDtype:
return PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed: IntervalClosedType) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed: IntervalClosedType = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self) -> IntervalClosedType:
return self._closed
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self) -> IntervalDtype:
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- installing pyarrow-hotfix (`pip install pyarrow-hotfix`) and disabling it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and registering this type explicitly.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if not pa_version_under14p1:
return
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
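
A small sketch exercising the extension types above, assuming the classes are in scope. Serialization stores only JSON metadata, so a round trip through __arrow_ext_serialize__ and __arrow_ext_deserialize__ reconstructs an equal type:

import pyarrow as pa

ty = ArrowPeriodType("M")
payload = ty.__arrow_ext_serialize__()  # b'{"freq": "M"}'
restored = ArrowPeriodType.__arrow_ext_deserialize__(pa.int64(), payload)
assert restored == ty and restored.freq == "M"

# The interval type carries its subtype and closed side the same way,
# backed by a struct<left, right> storage type
ity = ArrowIntervalType(pa.int64(), "both")
assert ity.closed == "both"
assert ity.storage_type == pa.struct([("left", pa.int64()), ("right", pa.int64())])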