Updated script that can be controlled by the Node.js web app
Binary file not shown.
Binary file not shown.
Binary file not shown.
339  lib/python3.13/site-packages/pandas/core/util/hashing.py  Normal file
@@ -0,0 +1,339 @@
"""
data hash pandas / numpy objects
"""
from __future__ import annotations

import itertools
from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.hashing import hash_object_array

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Iterator,
    )

    from pandas._typing import (
        ArrayLike,
        npt,
    )

    from pandas import (
        DataFrame,
        Index,
        MultiIndex,
        Series,
    )


# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Parameters
    ----------
    arrays : Iterator[np.ndarray]
    num_items : int

    Returns
    -------
    np.ndarray[uint64]

    Should be the same as CPython's tupleobject.c
    """
    try:
        first = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    arrays = itertools.chain([first], arrays)

    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    last_i = 0
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
        last_i = i
    assert last_i + 1 == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out


def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Examples
    --------
    >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
    0    14639053686158035780
    1     3869563279212530728
    2      393322362522515241
    dtype: uint64
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser


def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    if not isinstance(vals, ABCMultiIndex):
        mi = MultiIndex.from_tuples(vals)
    else:
        mi = vals

    # create a list-of-Categoricals
    cat_vals = [
        Categorical._simple_new(
            mi.codes[level],
            CategoricalDtype(categories=mi.levels[level], ordered=False),
        )
        for level in range(mi.nlevels)
    ]

    # hash the list-of-ndarrays
    hashes = (
        cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
        for cat in cat_vals
    )
    h = combine_hash_arrays(hashes, len(cat_vals))

    return h


def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.

    Examples
    --------
    >>> pd.util.hash_array(np.array([1, 2, 3]))
    array([ 6238072747940578789, 15839785061582574730,  2185194620014831856],
          dtype=uint64)
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    if isinstance(vals, ABCExtensionArray):
        return vals._hash_pandas_object(
            encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    if not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)


def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.
    """
    dtype = vals.dtype

    # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts
    if np.issubdtype(dtype, np.complex128):
        hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize)
        hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize)
        return hash_real + 23 * hash_imag

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            codes, categories = factorize(vals, sort=False)
            dtype = CategoricalDtype(categories=Index(categories), ordered=False)
            cat = Categorical._simple_new(codes, dtype)
            return cat._hash_pandas_object(
                encoding=encoding, hash_key=hash_key, categorize=False
            )

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
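
Note: hash_array turns any 1d array into uint64 hashes, combine_hash_arrays folds per-column hashes together CPython-tuple-style, and hash_pandas_object is the public entry point (exposed as pd.util.hash_pandas_object / pd.util.hash_array). A minimal usage sketch of that public API; the sample DataFrame here is illustrative only:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "x"]})

    # One stable uint64 per row; index=True folds the index hash in as well.
    row_hashes = pd.util.hash_pandas_object(df, index=True)

    # Hash a raw ndarray directly; categorize=True (the default) is faster
    # when values repeat.
    arr_hashes = pd.util.hash_array(np.array(["x", "y", "x"], dtype=object))

    # Hashes are deterministic for a fixed hash_key, so they can fingerprint
    # data or detect changed rows across runs.
    assert row_hashes.equals(pd.util.hash_pandas_object(df, index=True))
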
98  lib/python3.13/site-packages/pandas/core/util/numba_.py  Normal file
@@ -0,0 +1,98 @@
"""Common utilities for Numba operations"""
from __future__ import annotations

import types
from typing import (
    TYPE_CHECKING,
    Callable,
)

import numpy as np

from pandas.compat._optional import import_optional_dependency
from pandas.errors import NumbaUtilError

GLOBAL_USE_NUMBA: bool = False


def maybe_use_numba(engine: str | None) -> bool:
    """Signal whether to use numba routines."""
    return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA)


def set_use_numba(enable: bool = False) -> None:
    global GLOBAL_USE_NUMBA
    if enable:
        import_optional_dependency("numba")
    GLOBAL_USE_NUMBA = enable


def get_jit_arguments(
    engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None
) -> dict[str, bool]:
    """
    Return arguments to pass to numba.JIT, falling back on pandas default JIT settings.

    Parameters
    ----------
    engine_kwargs : dict, default None
        user passed keyword arguments for numba.JIT
    kwargs : dict, default None
        user passed keyword arguments to pass into the JITed function

    Returns
    -------
    dict[str, bool]
        nopython, nogil, parallel

    Raises
    ------
    NumbaUtilError
    """
    if engine_kwargs is None:
        engine_kwargs = {}

    nopython = engine_kwargs.get("nopython", True)
    if kwargs and nopython:
        raise NumbaUtilError(
            "numba does not support kwargs with nopython=True: "
            "https://github.com/numba/numba/issues/2916"
        )
    nogil = engine_kwargs.get("nogil", False)
    parallel = engine_kwargs.get("parallel", False)
    return {"nopython": nopython, "nogil": nogil, "parallel": parallel}


def jit_user_function(func: Callable) -> Callable:
    """
    If the user's function is not jitted already, mark it as jitable.

    Parameters
    ----------
    func : function
        user defined function

    Returns
    -------
    function
        Numba JITed function, or function marked as JITable by numba
    """
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    if numba.extending.is_jitted(func):
        # Don't jit a user passed jitted function
        numba_func = func
    elif getattr(np, func.__name__, False) is func or isinstance(
        func, types.BuiltinFunctionType
    ):
        # Not necessary to jit builtins or np functions
        # This will mess up register_jitable
        numba_func = func
    else:
        numba_func = numba.extending.register_jitable(func)

    return numba_func
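
Note: these helpers back pandas' engine="numba" code paths. A minimal sketch of how they compose; importing from this internal module directly is an assumption, and numba must be installed for jit_user_function to succeed:

    from pandas.core.util.numba_ import (
        get_jit_arguments,
        jit_user_function,
        maybe_use_numba,
    )

    # engine="numba" opts in explicitly; engine=None defers to GLOBAL_USE_NUMBA.
    assert maybe_use_numba("numba")

    # Normalize user engine_kwargs into the nopython/nogil/parallel dict that
    # pandas forwards to numba; unspecified keys fall back to the defaults
    # nopython=True, nogil=False, parallel=False.
    jit_kwargs = get_jit_arguments({"nogil": True})

    def add_one(x):
        return x + 1

    # Mark a plain Python function as jitable so numba can compile it when it
    # is called from a jitted kernel; already-jitted functions pass through,
    # and a missing numba raises via import_optional_dependency.
    jitted = jit_user_function(add_one)
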