Updated script that can be controlled by the Node.js web app
Binary file not shown.
Binary file not shown.
Binary file not shown.
339  lib/python3.13/site-packages/pandas/core/util/hashing.py  Normal file
@@ -0,0 +1,339 @@
"""
data hash pandas / numpy objects
"""
from __future__ import annotations

import itertools
from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.hashing import hash_object_array

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Iterator,
    )

    from pandas._typing import (
        ArrayLike,
        npt,
    )

    from pandas import (
        DataFrame,
        Index,
        MultiIndex,
        Series,
    )


# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Parameters
    ----------
    arrays : Iterator[np.ndarray]
    num_items : int

    Returns
    -------
    np.ndarray[uint64]

    Should be the same as CPython's tupleobject.c
    """
    try:
        first = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    arrays = itertools.chain([first], arrays)

    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    last_i = 0
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
        last_i = i
    assert last_i + 1 == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out


def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Examples
    --------
    >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
    0    14639053686158035780
    1     3869563279212530728
    2      393322362522515241
    dtype: uint64
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser


def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    if not isinstance(vals, ABCMultiIndex):
        mi = MultiIndex.from_tuples(vals)
    else:
        mi = vals

    # create a list-of-Categoricals
    cat_vals = [
        Categorical._simple_new(
            mi.codes[level],
            CategoricalDtype(categories=mi.levels[level], ordered=False),
        )
        for level in range(mi.nlevels)
    ]

    # hash the list-of-ndarrays
    hashes = (
        cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
        for cat in cat_vals
    )
    h = combine_hash_arrays(hashes, len(cat_vals))

    return h


def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.

    Examples
    --------
    >>> pd.util.hash_array(np.array([1, 2, 3]))
    array([ 6238072747940578789, 15839785061582574730,  2185194620014831856],
          dtype=uint64)
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    if isinstance(vals, ABCExtensionArray):
        return vals._hash_pandas_object(
            encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    if not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)


def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.
    """
    dtype = vals.dtype

    # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts
    if np.issubdtype(dtype, np.complex128):
        hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize)
        hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize)
        return hash_real + 23 * hash_imag

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            codes, categories = factorize(vals, sort=False)
            dtype = CategoricalDtype(categories=Index(categories), ordered=False)
            cat = Categorical._simple_new(codes, dtype)
            return cat._hash_pandas_object(
                encoding=encoding, hash_key=hash_key, categorize=False
            )

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
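
Note: hash_array turns any 1d array into uint64 hashes, combine_hash_arrays folds per-column hashes together CPython-tuple-style, and hash_pandas_object is the public entry point (exposed as pd.util.hash_pandas_object / pd.util.hash_array). A minimal usage sketch of that public API; the sample DataFrame here is illustrative only:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "x"]})

    # One stable uint64 per row; index=True folds the index hash in as well.
    row_hashes = pd.util.hash_pandas_object(df, index=True)

    # Hash a raw ndarray directly; categorize=True (the default) is faster
    # when values repeat.
    arr_hashes = pd.util.hash_array(np.array(["x", "y", "x"], dtype=object))

    # Hashes are deterministic for a fixed hash_key, so they can fingerprint
    # data or detect changed rows across runs.
    assert row_hashes.equals(pd.util.hash_pandas_object(df, index=True))
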
98  lib/python3.13/site-packages/pandas/core/util/numba_.py  Normal file
@@ -0,0 +1,98 @@
"""Common utilities for Numba operations"""
from __future__ import annotations

import types
from typing import (
    TYPE_CHECKING,
    Callable,
)

import numpy as np

from pandas.compat._optional import import_optional_dependency
from pandas.errors import NumbaUtilError

GLOBAL_USE_NUMBA: bool = False


def maybe_use_numba(engine: str | None) -> bool:
    """Signal whether to use numba routines."""
    return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA)


def set_use_numba(enable: bool = False) -> None:
    global GLOBAL_USE_NUMBA
    if enable:
        import_optional_dependency("numba")
    GLOBAL_USE_NUMBA = enable


def get_jit_arguments(
    engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None
) -> dict[str, bool]:
    """
    Return arguments to pass to numba.JIT, falling back on pandas default JIT settings.

    Parameters
    ----------
    engine_kwargs : dict, default None
        user passed keyword arguments for numba.JIT
    kwargs : dict, default None
        user passed keyword arguments to pass into the JITed function

    Returns
    -------
    dict[str, bool]
        nopython, nogil, parallel

    Raises
    ------
    NumbaUtilError
    """
    if engine_kwargs is None:
        engine_kwargs = {}

    nopython = engine_kwargs.get("nopython", True)
    if kwargs and nopython:
        raise NumbaUtilError(
            "numba does not support kwargs with nopython=True: "
            "https://github.com/numba/numba/issues/2916"
        )
    nogil = engine_kwargs.get("nogil", False)
    parallel = engine_kwargs.get("parallel", False)
    return {"nopython": nopython, "nogil": nogil, "parallel": parallel}


def jit_user_function(func: Callable) -> Callable:
    """
    If the user's function is not jitted already, mark it as jitable.

    Parameters
    ----------
    func : function
        user defined function

    Returns
    -------
    function
        Numba JITed function, or function marked as JITable by numba
    """
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    if numba.extending.is_jitted(func):
        # Don't jit a user passed jitted function
        numba_func = func
    elif getattr(np, func.__name__, False) is func or isinstance(
        func, types.BuiltinFunctionType
    ):
        # Not necessary to jit builtins or np functions
        # This will mess up register_jitable
        numba_func = func
    else:
        numba_func = numba.extending.register_jitable(func)

    return numba_func
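
Note: these helpers back pandas' engine="numba" code paths. A minimal sketch of how they compose; importing from this internal module directly is an assumption, and numba must be installed for jit_user_function to succeed:

    from pandas.core.util.numba_ import (
        get_jit_arguments,
        jit_user_function,
        maybe_use_numba,
    )

    # engine="numba" opts in explicitly; engine=None defers to GLOBAL_USE_NUMBA.
    assert maybe_use_numba("numba")

    # Normalize user engine_kwargs into the nopython/nogil/parallel dict that
    # pandas forwards to numba; unspecified keys fall back to the defaults
    # nopython=True, nogil=False, parallel=False.
    jit_kwargs = get_jit_arguments({"nogil": True})

    def add_one(x):
        return x + 1

    # Mark a plain Python function as jitable so numba can compile it when it
    # is called from a jitted kernel; already-jitted functions pass through,
    # and a missing numba raises via import_optional_dependency.
    jitted = jit_user_function(add_one)
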