Updated script so that it can be controlled by a Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,15 @@
from pandas.core.groupby.generic import (
DataFrameGroupBy,
NamedAgg,
SeriesGroupBy,
)
from pandas.core.groupby.groupby import GroupBy
from pandas.core.groupby.grouper import Grouper
# Public names of this package: a star-import of the package exposes exactly
# the five names listed here, all of which are imported above.
__all__ = [
    "DataFrameGroupBy",
    "NamedAgg",
    "SeriesGroupBy",
    "GroupBy",
    "Grouper",
]

View File

@ -0,0 +1,121 @@
"""
Provide basic components for groupby.
"""
from __future__ import annotations
import dataclasses
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Hashable
@dataclasses.dataclass(order=True, frozen=True)
class OutputKey:
    """
    Immutable, orderable (label, position) pair.

    ``order=True`` generates the comparison methods, comparing ``label``
    first and ``position`` second; ``frozen=True`` makes instances read-only
    and therefore hashable.
    """

    # Any hashable label.
    label: Hashable
    # Integer position paired with the label.
    position: int
# Plotting entry points, singled out so that forwarding a method from
# NDFrame and catching an exception does not produce duplicate plots.
plotting_methods = frozenset({"plot", "hist"})

# Cythonized transformations (or canned "agg+broadcast") whose result needs
# no postprocessing by transform.
cythonized_kernels = frozenset(
    {
        "cumprod",
        "cumsum",
        "shift",
        "cummin",
        "cummax",
    }
)
# Names of the aggregation/reduction functions.
# Each of these maps a group to a single value.
reduction_kernels = frozenset(
    {
        "all",
        "any",
        "corrwith",
        "count",
        "first",
        "idxmax",
        "idxmin",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "nunique",
        "prod",
        # `quantile` counts as a reduction only as long as its signature
        # accepts a single quantile value; GH#27526 might change that.
        "quantile",
        "sem",
        "size",
        "skew",
        "std",
        "sum",
        "var",
    }
)
# Names of the transformation functions.
# A transformation produces, for each group, a result with the same shape
# as that group.
transformation_kernels = frozenset(
    {
        "bfill",
        "cumcount",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "diff",
        "ffill",
        "fillna",
        "ngroup",
        "pct_change",
        "rank",
        "shift",
    }
)
# The remaining public groupby methods: everything that fits neither of the
# two kernel sets defined above.
groupby_other_methods = frozenset(
    {
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        # corr and cov return ngroups*ncolumns rows, which makes them
        # neither a transformation nor a reduction
        "corr",
        "cov",
        "describe",
        "dtypes",
        "expanding",
        "ewm",
        "filter",
        "get_group",
        "groups",
        "head",
        "hist",
        "indices",
        "ndim",
        "ngroups",
        "nth",
        "ohlc",
        "pipe",
        "plot",
        "resample",
        "rolling",
        "tail",
        "take",
        "transform",
        "sample",
        "value_counts",
    }
)
# Every value of `name` accepted by `groupby.transform(name)`.
# NOTE: do NOT edit this directly. It is derived: add new names to
# `reduction_kernels` or `transformation_kernels`, whichever is appropriate.
transform_kernel_allowlist = reduction_kernels.union(transformation_kernels)

View File

@ -0,0 +1,87 @@
from __future__ import annotations
import numpy as np
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
Categorical,
CategoricalDtype,
recode_for_categories,
)
def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    observed : bool
        Account only for the observed values

    Returns
    -------
    Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    # we only care about observed values
    if observed:
        # In cases with c.ordered, this is equivalent to
        #  return c.remove_unused_categories(), c

        unique_codes = unique1d(c.codes)

        # -1 is the code used for missing values; it maps to no category
        take_codes = unique_codes[unique_codes != -1]
        if sort:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = recode_for_categories(c.codes, c.categories, categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical._simple_new(codes, dtype=dtype), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # From here on sort is always False, so the codes are deliberately left
    # in as-encountered order (GH-8868); no sorting step is needed.

    # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
    all_codes = np.arange(c.categories.nunique())
    # GH 38140: exclude nan from indexer for categories
    unique_notnan_codes = unique1d(c.codes[c.codes != -1])
    if len(all_codes) > len(unique_notnan_codes):
        # GH 13179: All categories need to be present, even if missing from the data
        missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
        take_codes = np.concatenate((unique_notnan_codes, missing_codes))
    else:
        take_codes = unique_notnan_codes

    return Categorical(c, c.unique().categories.take(take_codes)), None

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,304 @@
from __future__ import annotations
from collections.abc import Iterable
from typing import (
TYPE_CHECKING,
Literal,
cast,
)
import numpy as np
from pandas.util._decorators import (
cache_readonly,
doc,
)
from pandas.core.dtypes.common import (
is_integer,
is_list_like,
)
if TYPE_CHECKING:
from pandas._typing import PositionalIndexer
from pandas import (
DataFrame,
Series,
)
from pandas.core.groupby import groupby
class GroupByIndexingMixin:
    """
    Mixin for adding ._positional_selector to GroupBy.
    """

    @cache_readonly
    def _positional_selector(self) -> GroupByPositionalSelector:
        """
        Return positional selection for each group.

        ``groupby._positional_selector[i:j]`` is similar to
        ``groupby.apply(lambda x: x.iloc[i:j])``
        but much faster and preserves the original index and order.

        ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
        and :meth:`~GroupBy.tail`. For example:

        - ``head(5)``
        - ``_positional_selector[5:-5]``
        - ``tail(5)``

        together return all the rows.

        Allowed inputs for the index are:

        - An integer valued iterable, e.g. ``range(2, 4)``.
        - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.

        The output format is the same as :meth:`~GroupBy.head` and
        :meth:`~GroupBy.tail`, namely
        a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.

        Returns
        -------
        Series
            The filtered subset of the original Series.
        DataFrame
            The filtered subset of the original DataFrame.

        See Also
        --------
        DataFrame.iloc : Purely integer-location based indexing for selection by
            position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.

        Notes
        -----
        - The slice step cannot be negative.
        - If the index specification results in overlaps, the item is not duplicated.
        - If the index specification changes the order of items, then
          they are returned in their original order.
          By contrast, ``DataFrame.iloc`` can change the row order.
        - ``groupby()`` parameters such as as_index and dropna are ignored.

        The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
        with ``as_index=False`` are:

        - Input to ``_positional_selector`` can include
          one or more slices whereas ``nth``
          just handles an integer or a list of integers.
        - ``_positional_selector`` can accept a slice relative to the
          last row of each group.
        - ``_positional_selector`` does not have an equivalent to the
          ``nth()`` ``dropna`` parameter.

        Examples
        --------
        >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
        ...                   columns=["A", "B"])
        >>> df.groupby("A")._positional_selector[1:2]
           A  B
        1  a  2
        4  b  5

        >>> df.groupby("A")._positional_selector[1, -1]
           A  B
        1  a  2
        2  a  3
        4  b  5
        """
        if TYPE_CHECKING:
            # pylint: disable-next=used-before-assignment
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return GroupByPositionalSelector(groupby_self)

    def _make_mask_from_positional_indexer(
        self,
        arg: PositionalIndexer | tuple,
    ) -> np.ndarray:
        """
        Build a boolean row mask from a positional indexer.

        Parameters
        ----------
        arg : int, slice, list-like of int, or tuple of ints and slices
            Positions to select within every group.

        Returns
        -------
        np.ndarray
            Boolean mask with one entry per row.

        Raises
        ------
        TypeError
            If ``arg`` is not an integer, a slice or list-like.
        ValueError
            If a list-like mixes in something other than ints and slices,
            or a slice has a negative step.
        """
        if is_list_like(arg):
            # Materialize once: a one-shot iterator would be exhausted by the
            # all-integers check below and then silently select nothing.
            items = list(cast(Iterable, arg))
            if all(is_integer(i) for i in items):
                mask = self._make_mask_from_list(cast(Iterable[int], items))
            else:
                mask = self._make_mask_from_tuple(tuple(items))

        elif isinstance(arg, slice):
            mask = self._make_mask_from_slice(arg)
        elif is_integer(arg):
            mask = self._make_mask_from_int(cast(int, arg))
        else:
            raise TypeError(
                f"Invalid index {type(arg)}. "
                "Must be integer, list-like, slice or a tuple of "
                "integers and slices"
            )

        # The helpers return a plain bool when no array condition was applied
        # (e.g. an empty list or ``slice(None)``); expand it to a full-length
        # all-True / all-False array.
        if isinstance(mask, bool):
            if mask:
                mask = self._ascending_count >= 0
            else:
                mask = self._ascending_count < 0

        return cast(np.ndarray, mask)

    def _make_mask_from_int(self, arg: int) -> np.ndarray:
        """Mask the rows at position ``arg``; negative values count from the end."""
        if arg >= 0:
            return self._ascending_count == arg
        else:
            # -1 is the last row, i.e. descending count 0
            return self._descending_count == (-arg - 1)

    def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
        """
        Mask the rows whose position is in ``args``.

        Returns the plain bool ``False`` when ``args`` is empty.
        """
        # args is traversed twice below, so make sure it is re-iterable
        args = list(args)
        positive = [arg for arg in args if arg >= 0]
        negative = [-arg - 1 for arg in args if arg < 0]

        mask: bool | np.ndarray = False

        if positive:
            mask |= np.isin(self._ascending_count, positive)

        if negative:
            mask |= np.isin(self._descending_count, negative)

        return mask

    def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
        """
        OR together the masks of every int / slice in ``args``.

        Raises
        ------
        ValueError
            If an element is neither an integer nor a slice.
        """
        mask: bool | np.ndarray = False

        for arg in args:
            if is_integer(arg):
                mask |= self._make_mask_from_int(cast(int, arg))
            elif isinstance(arg, slice):
                mask |= self._make_mask_from_slice(arg)
            else:
                raise ValueError(
                    f"Invalid argument {type(arg)}. Should be int or slice."
                )

        return mask

    def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
        """
        Mask the rows selected by ``arg`` within every group.

        Returns the plain bool ``True`` when the slice imposes no condition.

        Raises
        ------
        ValueError
            If the slice step is negative.
        """
        start = arg.start
        stop = arg.stop
        step = arg.step

        if step is not None and step < 0:
            raise ValueError(f"Invalid step {step}. Must be non-negative")

        mask: bool | np.ndarray = True

        if step is None:
            step = 1

        if start is None:
            if step > 1:
                mask &= self._ascending_count % step == 0

        elif start >= 0:
            mask &= self._ascending_count >= start

            if step > 1:
                mask &= (self._ascending_count - start) % step == 0

        else:
            mask &= self._descending_count < -start

            # Offset of each row from the (negative) start position.
            offset_array = self._descending_count + start + 1
            # Rows where ascending + descending + start + 1 is negative fall
            # back to the ascending count as their offset.
            limit_array = (
                self._ascending_count + self._descending_count + (start + 1)
            ) < 0
            offset_array = np.where(limit_array, self._ascending_count, offset_array)

            mask &= offset_array % step == 0

        if stop is not None:
            if stop >= 0:
                mask &= self._ascending_count < stop
            else:
                mask &= self._descending_count >= -stop

        return mask

    @cache_readonly
    def _ascending_count(self) -> np.ndarray:
        """Cached result of ``self._cumcount_array()``."""
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array()

    @cache_readonly
    def _descending_count(self) -> np.ndarray:
        """Cached result of ``self._cumcount_array(ascending=False)``."""
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array(ascending=False)
@doc(GroupByIndexingMixin._positional_selector)
class GroupByPositionalSelector:
    # The class docstring is supplied by the ``@doc`` decorator above.

    def __init__(self, groupby_object: groupby.GroupBy) -> None:
        # Object that provides ``_make_mask_from_positional_indexer`` and
        # ``_mask_selected_obj``; both are used by ``__getitem__``.
        self.groupby_object = groupby_object

    def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
        """
        Select by positional index per group.

        Implements GroupBy._positional_selector

        Parameters
        ----------
        arg : PositionalIndexer | tuple
            Allowed values are:
            - int
            - int valued iterable such as list or range
            - slice with step either None or positive
            - tuple of integers and slices

        Returns
        -------
        Series
            The filtered subset of the original groupby Series.
        DataFrame
            The filtered subset of the original groupby DataFrame.

        See Also
        --------
        DataFrame.iloc : Integer-location based indexing for selection by position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy._positional_selector : Return positional selection for each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.
        """
        # Turn the indexer into a boolean mask, then let the groupby object
        # apply that mask to its selected object.
        mask = self.groupby_object._make_mask_from_positional_indexer(arg)
        return self.groupby_object._mask_selected_obj(mask)
class GroupByNthSelector:
    """
    Dynamically substituted for GroupBy.nth to enable both call and index
    """

    def __init__(self, groupby_object: groupby.GroupBy) -> None:
        # Object whose ``_nth`` method performs the actual selection.
        self.groupby_object = groupby_object

    def __call__(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> DataFrame | Series:
        """Call form: forwards ``n`` and ``dropna`` to ``groupby_object._nth``."""
        return self.groupby_object._nth(n, dropna)

    def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
        """Index form: forwards ``n`` to ``groupby_object._nth`` without ``dropna``."""
        return self.groupby_object._nth(n)

View File

@ -0,0 +1,181 @@
"""Common utilities for Numba operations with groupby ops"""
from __future__ import annotations
import functools
import inspect
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.util.numba_ import (
NumbaUtilError,
jit_user_function,
)
if TYPE_CHECKING:
from pandas._typing import Scalar
def validate_udf(func: Callable) -> None:
    """
    Validate user defined function for ops when using Numba with groupby ops.

    The first signature arguments should include:

    def f(values, index, ...):
        ...

    Parameters
    ----------
    func : function
        user defined function

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        If ``func`` is not callable.
    NumbaUtilError
        If the leading parameters of ``func`` are not ``values, index``.
    """
    if not callable(func):
        raise NotImplementedError(
            "Numba engine can only be used with a single function."
        )
    udf_signature = list(inspect.signature(func).parameters.keys())
    expected_args = ["values", "index"]
    min_number_args = len(expected_args)
    if (
        len(udf_signature) < min_number_args
        or udf_signature[:min_number_args] != expected_args
    ):
        # Not every callable has ``__name__`` (e.g. functools.partial); fall
        # back to its repr so the intended NumbaUtilError is still raised.
        func_name = getattr(func, "__name__", repr(func))
        raise NumbaUtilError(
            f"The first {min_number_args} arguments to {func_name} must be "
            f"{expected_args}"
        )
@functools.cache
def generate_numba_agg_func(
    func: Callable[..., Scalar],
    nopython: bool,
    nogil: bool,
    parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
    """
    Generate a numba jitted agg function specified by values from engine_kwargs.

    1. jit the user's function
    2. Return a groupby agg function with the jitted function inline

    Configurations specified in engine_kwargs apply to both the user's
    function _AND_ the groupby evaluation loop.

    Results are memoized on ``(func, nopython, nogil, parallel)`` through
    ``functools.cache``.

    Parameters
    ----------
    func : function
        function to be applied to each group and will be JITed
    nopython : bool
        nopython to be passed into numba.jit
    nogil : bool
        nogil to be passed into numba.jit
    parallel : bool
        parallel to be passed into numba.jit

    Returns
    -------
    Numba function
    """
    numba_func = jit_user_function(func)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def group_agg(
        values: np.ndarray,
        index: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        num_columns: int,
        *args: Any,
    ) -> np.ndarray:
        assert len(begin) == len(end)
        num_groups = len(begin)

        # One output row per group, one output column per input column.
        result = np.empty((num_groups, num_columns))
        for i in numba.prange(num_groups):
            # Rows begin[i]:end[i] make up group i.
            group_index = index[begin[i] : end[i]]
            for j in numba.prange(num_columns):
                group = values[begin[i] : end[i], j]
                result[i, j] = numba_func(group, group_index, *args)
        return result

    return group_agg
@functools.cache
def generate_numba_transform_func(
    func: Callable[..., np.ndarray],
    nopython: bool,
    nogil: bool,
    parallel: bool,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
    """
    Generate a numba jitted transform function specified by values from engine_kwargs.

    1. jit the user's function
    2. Return a groupby transform function with the jitted function inline

    Configurations specified in engine_kwargs apply to both the user's
    function _AND_ the groupby evaluation loop.

    Results are memoized on ``(func, nopython, nogil, parallel)`` through
    ``functools.cache``.

    Parameters
    ----------
    func : function
        function to be applied to each group and will be JITed
    nopython : bool
        nopython to be passed into numba.jit
    nogil : bool
        nogil to be passed into numba.jit
    parallel : bool
        parallel to be passed into numba.jit

    Returns
    -------
    Numba function
    """
    numba_func = jit_user_function(func)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def group_transform(
        values: np.ndarray,
        index: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        num_columns: int,
        *args: Any,
    ) -> np.ndarray:
        assert len(begin) == len(end)
        num_groups = len(begin)

        # Same number of rows as the input: each group's result is written
        # back into the rows that group occupies.
        result = np.empty((len(values), num_columns))
        for i in numba.prange(num_groups):
            # Rows begin[i]:end[i] make up group i.
            group_index = index[begin[i] : end[i]]
            for j in numba.prange(num_columns):
                group = values[begin[i] : end[i], j]
                result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
        return result

    return group_transform

File diff suppressed because it is too large Load Diff