Updated script so it can be controlled by the Node.js web app

mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions


@@ -0,0 +1,23 @@
from pandas.core.window.ewm import (
ExponentialMovingWindow,
ExponentialMovingWindowGroupby,
)
from pandas.core.window.expanding import (
Expanding,
ExpandingGroupby,
)
from pandas.core.window.rolling import (
Rolling,
RollingGroupby,
Window,
)
__all__ = [
"Expanding",
"ExpandingGroupby",
"ExponentialMovingWindow",
"ExponentialMovingWindowGroupby",
"Rolling",
"RollingGroupby",
"Window",
]
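A quick illustrative sketch (not part of the commit) of how the classes re-exported above surface in user code; each windowing accessor returns one of them:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
type(s.rolling(2))    # pandas.core.window.rolling.Rolling
type(s.expanding())   # pandas.core.window.expanding.Expanding
type(s.ewm(com=1.0))  # pandas.core.window.ewm.ExponentialMovingWindow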


@@ -0,0 +1,169 @@
"""Common utility functions for rolling operations"""
from __future__ import annotations
from collections import defaultdict
from typing import cast
import numpy as np
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.indexes.api import MultiIndex
def flex_binary_moment(arg1, arg2, f, pairwise: bool = False):
if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries):
X, Y = prep_binary(arg1, arg2)
return f(X, Y)
elif isinstance(arg1, ABCDataFrame):
from pandas import DataFrame
def dataframe_from_int_dict(data, frame_template) -> DataFrame:
result = DataFrame(data, index=frame_template.index)
if len(result.columns) > 0:
result.columns = frame_template.columns[result.columns]
else:
result.columns = frame_template.columns.copy()
return result
results = {}
if isinstance(arg2, ABCDataFrame):
if pairwise is False:
if arg1 is arg2:
# special case in order to handle duplicate column names
for i in range(len(arg1.columns)):
results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
return dataframe_from_int_dict(results, arg1)
else:
if not arg1.columns.is_unique:
raise ValueError("'arg1' columns are not unique")
if not arg2.columns.is_unique:
raise ValueError("'arg2' columns are not unique")
X, Y = arg1.align(arg2, join="outer")
X, Y = prep_binary(X, Y)
res_columns = arg1.columns.union(arg2.columns)
for col in res_columns:
if col in X and col in Y:
results[col] = f(X[col], Y[col])
return DataFrame(results, index=X.index, columns=res_columns)
elif pairwise is True:
results = defaultdict(dict)
for i in range(len(arg1.columns)):
for j in range(len(arg2.columns)):
if j < i and arg2 is arg1:
# Symmetric case
results[i][j] = results[j][i]
else:
results[i][j] = f(
*prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])
)
from pandas import concat
result_index = arg1.index.union(arg2.index)
if len(result_index):
# construct result frame
result = concat(
[
concat(
[results[i][j] for j in range(len(arg2.columns))],
ignore_index=True,
)
for i in range(len(arg1.columns))
],
ignore_index=True,
axis=1,
)
result.columns = arg1.columns
# set the index and reorder
if arg2.columns.nlevels > 1:
# mypy needs to know columns is a MultiIndex, Index doesn't
# have levels attribute
arg2.columns = cast(MultiIndex, arg2.columns)
# GH 21157: Equivalent to MultiIndex.from_product(
# [result_index], <unique combinations of arg2.columns.levels>,
# )
# A normal MultiIndex.from_product will produce too many
# combinations.
result_level = np.tile(
result_index, len(result) // len(result_index)
)
arg2_levels = (
np.repeat(
arg2.columns.get_level_values(i),
len(result) // len(arg2.columns),
)
for i in range(arg2.columns.nlevels)
)
result_names = list(arg2.columns.names) + [result_index.name]
result.index = MultiIndex.from_arrays(
[*arg2_levels, result_level], names=result_names
)
# GH 34440
num_levels = len(result.index.levels)
new_order = [num_levels - 1] + list(range(num_levels - 1))
result = result.reorder_levels(new_order).sort_index()
else:
result.index = MultiIndex.from_product(
[range(len(arg2.columns)), range(len(result_index))]
)
result = result.swaplevel(1, 0).sort_index()
result.index = MultiIndex.from_product(
[result_index] + [arg2.columns]
)
else:
# empty result
result = DataFrame(
index=MultiIndex(
levels=[arg1.index, arg2.columns], codes=[[], []]
),
columns=arg2.columns,
dtype="float64",
)
# reset our index names to arg1 names
# reset our column names to arg2 names
# careful not to mutate the original names
result.columns = result.columns.set_names(arg1.columns.names)
result.index = result.index.set_names(
result_index.names + arg2.columns.names
)
return result
else:
results = {
i: f(*prep_binary(arg1.iloc[:, i], arg2))
for i in range(len(arg1.columns))
}
return dataframe_from_int_dict(results, arg1)
else:
return flex_binary_moment(arg2, arg1, f)
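# Dispatch summary for flex_binary_moment (descriptive comment only):
#   Series x Series                       -> f on the aligned, NaN-masked pair
#   DataFrame x DataFrame, pairwise=False -> f column-by-column on matching names
#   DataFrame x DataFrame, pairwise=True  -> f on every column pair, MultiIndexed result
#   DataFrame x Series                    -> f between each column and the Series
#   Series x DataFrame                    -> recurse with the arguments swapped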
def zsqrt(x):
with np.errstate(all="ignore"):
result = np.sqrt(x)
mask = x < 0
if isinstance(x, ABCDataFrame):
if mask._values.any():
result[mask] = 0
else:
if mask.any():
result[mask] = 0
return result
def prep_binary(arg1, arg2):
# mask out values, this also makes a common index...
X = arg1 + 0 * arg2
Y = arg2 + 0 * arg1
return X, Y
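A short sketch (illustrative, not part of the diff) of what the two small helpers above do:

import numpy as np
import pandas as pd

s1 = pd.Series([1.0, 2.0, np.nan, 4.0])
s2 = pd.Series([np.nan, 1.0, 1.0, 1.0])

# prep_binary aligns both inputs on a common index and masks a position to
# NaN whenever either side is NaN there.
X, Y = prep_binary(s1, s2)
# X: [NaN, 2.0, NaN, 4.0]    Y: [NaN, 1.0, NaN, 1.0]

# zsqrt is a square root that clamps negative inputs to 0.0 instead of NaN.
zsqrt(pd.Series([4.0, -1.0]))  # [2.0, 0.0]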


@@ -0,0 +1,116 @@
"""Any shareable docstring components for rolling/expanding/ewm"""
from __future__ import annotations
from textwrap import dedent
from pandas.core.shared_docs import _shared_docs
_shared_docs = dict(**_shared_docs)
def create_section_header(header: str) -> str:
"""Create numpydoc section header"""
return f"{header}\n{'-' * len(header)}\n"
template_header = "\nCalculate the {window_method} {aggregation_description}.\n\n"
template_returns = dedent(
"""
Series or DataFrame
Return type is the same as the original object with ``np.float64`` dtype.\n
"""
).replace("\n", "", 1)
template_see_also = dedent(
"""
pandas.Series.{window_method} : Calling {window_method} with Series data.
pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames.
pandas.Series.{agg_method} : Aggregating {agg_method} for Series.
pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n
"""
).replace("\n", "", 1)
kwargs_numeric_only = dedent(
"""
numeric_only : bool, default False
Include only float, int, boolean columns.
.. versionadded:: 1.5.0\n
"""
).replace("\n", "", 1)
kwargs_scipy = dedent(
"""
**kwargs
Keyword arguments to configure the ``SciPy`` weighted window type.\n
"""
).replace("\n", "", 1)
window_apply_parameters = dedent(
"""
func : function
Must produce a single value from an ndarray input if ``raw=True``
or a single value from a Series if ``raw=False``. Can also accept a
Numba JIT function with ``engine='numba'`` specified.
raw : bool, default False
* ``False`` : passes each row or column as a Series to the
function.
* ``True`` : the passed function will receive ndarray
objects instead.
If you are just applying a NumPy reduction function this will
achieve much better performance.
engine : str, default None
* ``'cython'`` : Runs rolling apply through C-extensions from cython.
* ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
Only available when ``raw`` is set to ``True``.
* ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
engine_kwargs : dict, default None
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
applied to both the ``func`` and the ``apply`` rolling aggregation.
args : tuple, default None
Positional arguments to be passed into func.
kwargs : dict, default None
Keyword arguments to be passed into func.\n
"""
).replace("\n", "", 1)
numba_notes = (
"See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
"extended documentation and performance considerations for the Numba engine.\n\n"
)
def window_agg_numba_parameters(version: str = "1.3") -> str:
return (
dedent(
"""
engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
* ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
.. versionadded:: {version}.0
engine_kwargs : dict, default None
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{{'nopython': True, 'nogil': False, 'parallel': False}}``
.. versionadded:: {version}.0\n
"""
)
.replace("\n", "", 1)
.replace("{version}", version)
)
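These fragments are stitched together by the @doc decorator on each window method. A small sketch of what the pieces render to:

create_section_header("Returns")
# 'Returns\n-------\n'

template_header.format(window_method="expanding", aggregation_description="sum")
# '\nCalculate the expanding sum.\n\n'

# window_agg_numba_parameters("1.4") returns the engine/engine_kwargs text
# above with '.. versionadded:: 1.4.0' substituted for the version marker.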

File diff suppressed because it is too large


@@ -0,0 +1,964 @@
from __future__ import annotations
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
)
from pandas.util._decorators import (
deprecate_kwarg,
doc,
)
from pandas.core.indexers.objects import (
BaseIndexer,
ExpandingIndexer,
GroupbyIndexer,
)
from pandas.core.window.doc import (
_shared_docs,
create_section_header,
kwargs_numeric_only,
numba_notes,
template_header,
template_returns,
template_see_also,
window_agg_numba_parameters,
window_apply_parameters,
)
from pandas.core.window.rolling import (
BaseWindowGroupby,
RollingAndExpandingMixin,
)
if TYPE_CHECKING:
from pandas._typing import (
Axis,
QuantileInterpolation,
WindowingRankType,
)
from pandas import (
DataFrame,
Series,
)
from pandas.core.generic import NDFrame
class Expanding(RollingAndExpandingMixin):
"""
Provide expanding window calculations.
Parameters
----------
min_periods : int, default 1
Minimum number of observations in window required to have a value;
otherwise, result is ``np.nan``.
axis : int or str, default 0
If ``0`` or ``'index'``, roll across the rows.
If ``1`` or ``'columns'``, roll across the columns.
For `Series` this parameter is unused and defaults to 0.
method : str {'single', 'table'}, default 'single'
Execute the expanding operation per single column or row (``'single'``)
or over the entire object (``'table'``).
This argument is only implemented when specifying ``engine='numba'``
in the method call.
.. versionadded:: 1.3.0
Returns
-------
pandas.api.typing.Expanding
See Also
--------
rolling : Provides rolling window calculations.
ewm : Provides exponentially weighted functions.
Notes
-----
See :ref:`Windowing Operations <window.expanding>` for further usage details
and examples.
Examples
--------
>>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
>>> df
B
0 0.0
1 1.0
2 2.0
3 NaN
4 4.0
**min_periods**
Expanding sum with 1 vs 3 observations needed to calculate a value.
>>> df.expanding(1).sum()
B
0 0.0
1 1.0
2 3.0
3 3.0
4 7.0
>>> df.expanding(3).sum()
B
0 NaN
1 NaN
2 3.0
3 3.0
4 7.0
"""
_attributes: list[str] = ["min_periods", "axis", "method"]
def __init__(
self,
obj: NDFrame,
min_periods: int = 1,
axis: Axis = 0,
method: str = "single",
selection=None,
) -> None:
super().__init__(
obj=obj,
min_periods=min_periods,
axis=axis,
method=method,
selection=selection,
)
def _get_window_indexer(self) -> BaseIndexer:
"""
Return an indexer class that will compute the window start and end bounds
"""
return ExpandingIndexer()
@doc(
_shared_docs["aggregate"],
see_also=dedent(
"""
See Also
--------
pandas.DataFrame.aggregate : Similar DataFrame method.
pandas.Series.aggregate : Similar Series method.
"""
),
examples=dedent(
"""
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
>>> df
A B C
0 1 4 7
1 2 5 8
2 3 6 9
>>> df.ewm(alpha=0.5).mean()
A B C
0 1.000000 4.000000 7.000000
1 1.666667 4.666667 7.666667
2 2.428571 5.428571 8.428571
"""
),
klass="Series/DataFrame",
axis="",
)
def aggregate(self, func, *args, **kwargs):
return super().aggregate(func, *args, **kwargs)
agg = aggregate
@doc(
template_header,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().count()
a 1.0
b 2.0
c 3.0
d 4.0
dtype: float64
"""
),
window_method="expanding",
aggregation_description="count of non-NaN observations",
agg_method="count",
)
def count(self, numeric_only: bool = False):
return super().count(numeric_only=numeric_only)
@doc(
template_header,
create_section_header("Parameters"),
window_apply_parameters,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().apply(lambda s: s.max() - 2 * s.min())
a -1.0
b 0.0
c 1.0
d 2.0
dtype: float64
"""
),
window_method="expanding",
aggregation_description="custom aggregation function",
agg_method="apply",
)
def apply(
self,
func: Callable[..., Any],
raw: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
args: tuple[Any, ...] | None = None,
kwargs: dict[str, Any] | None = None,
):
return super().apply(
func,
raw=raw,
engine=engine,
engine_kwargs=engine_kwargs,
args=args,
kwargs=kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
window_agg_numba_parameters(),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
numba_notes,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().sum()
a 1.0
b 3.0
c 6.0
d 10.0
dtype: float64
"""
),
window_method="expanding",
aggregation_description="sum",
agg_method="sum",
)
def sum(
self,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().sum(
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
window_agg_numba_parameters(),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
numba_notes,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([3, 2, 1, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().max()
a 3.0
b 3.0
c 3.0
d 4.0
dtype: float64
"""
),
window_method="expanding",
aggregation_description="maximum",
agg_method="max",
)
def max(
self,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().max(
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
window_agg_numba_parameters(),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
numba_notes,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([2, 3, 4, 1], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().min()
a 2.0
b 2.0
c 2.0
d 1.0
dtype: float64
"""
),
window_method="expanding",
aggregation_description="minimum",
agg_method="min",
)
def min(
self,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().min(
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
window_agg_numba_parameters(),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
numba_notes,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().mean()
a 1.0
b 1.5
c 2.0
d 2.5
dtype: float64
"""
),
window_method="expanding",
aggregation_description="mean",
agg_method="mean",
)
def mean(
self,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().mean(
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
window_agg_numba_parameters(),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
numba_notes,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser.expanding().median()
a 1.0
b 1.5
c 2.0
d 2.5
dtype: float64
"""
),
window_method="expanding",
aggregation_description="median",
agg_method="median",
)
def median(
self,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().median(
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.\n
"""
).replace("\n", "", 1),
kwargs_numeric_only,
window_agg_numba_parameters("1.4"),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
"numpy.std : Equivalent method for NumPy array.\n",
template_see_also,
create_section_header("Notes"),
dedent(
"""
The default ``ddof`` of 1 used in :meth:`Series.std` is different
than the default ``ddof`` of 0 in :func:`numpy.std`.
A minimum of one period is required for the calculation.\n
"""
).replace("\n", "", 1),
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.expanding(3).std()
0 NaN
1 NaN
2 0.577350
3 0.957427
4 0.894427
5 0.836660
6 0.786796
dtype: float64
"""
).replace("\n", "", 1),
window_method="expanding",
aggregation_description="standard deviation",
agg_method="std",
)
def std(
self,
ddof: int = 1,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().std(
ddof=ddof,
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.\n
"""
).replace("\n", "", 1),
kwargs_numeric_only,
window_agg_numba_parameters("1.4"),
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
"numpy.var : Equivalent method for NumPy array.\n",
template_see_also,
create_section_header("Notes"),
dedent(
"""
The default ``ddof`` of 1 used in :meth:`Series.var` is different
than the default ``ddof`` of 0 in :func:`numpy.var`.
A minimum of one period is required for the calculation.\n
"""
).replace("\n", "", 1),
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.expanding(3).var()
0 NaN
1 NaN
2 0.333333
3 0.916667
4 0.800000
5 0.700000
6 0.619048
dtype: float64
"""
).replace("\n", "", 1),
window_method="expanding",
aggregation_description="variance",
agg_method="var",
)
def var(
self,
ddof: int = 1,
numeric_only: bool = False,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
return super().var(
ddof=ddof,
numeric_only=numeric_only,
engine=engine,
engine_kwargs=engine_kwargs,
)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.\n
"""
).replace("\n", "", 1),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Notes"),
"A minimum of one period is required for the calculation.\n\n",
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([0, 1, 2, 3])
>>> s.expanding().sem()
0 NaN
1 0.707107
2 0.707107
3 0.745356
dtype: float64
"""
).replace("\n", "", 1),
window_method="expanding",
aggregation_description="standard error of mean",
agg_method="sem",
)
def sem(self, ddof: int = 1, numeric_only: bool = False):
return super().sem(ddof=ddof, numeric_only=numeric_only)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
"scipy.stats.skew : Third moment of a probability density.\n",
template_see_also,
create_section_header("Notes"),
"A minimum of three periods is required for the rolling calculation.\n\n",
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([-1, 0, 2, -1, 2], index=['a', 'b', 'c', 'd', 'e'])
>>> ser.expanding().skew()
a NaN
b NaN
c 0.935220
d 1.414214
e 0.315356
dtype: float64
"""
),
window_method="expanding",
aggregation_description="unbiased skewness",
agg_method="skew",
)
def skew(self, numeric_only: bool = False):
return super().skew(numeric_only=numeric_only)
@doc(
template_header,
create_section_header("Parameters"),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
"scipy.stats.kurtosis : Reference SciPy method.\n",
template_see_also,
create_section_header("Notes"),
"A minimum of four periods is required for the calculation.\n\n",
create_section_header("Examples"),
dedent(
"""
The example below shows an expanding calculation with a minimum of
four periods matching the equivalent function call using `scipy.stats`.
>>> arr = [1, 2, 3, 4, 999]
>>> import scipy.stats
>>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
-1.200000
>>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}")
4.999874
>>> s = pd.Series(arr)
>>> s.expanding(4).kurt()
0 NaN
1 NaN
2 NaN
3 -1.200000
4 4.999874
dtype: float64
"""
).replace("\n", "", 1),
window_method="expanding",
aggregation_description="Fisher's definition of kurtosis without bias",
agg_method="kurt",
)
def kurt(self, numeric_only: bool = False):
return super().kurt(numeric_only=numeric_only)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
quantile : float
Quantile to compute. 0 <= quantile <= 1.
.. deprecated:: 2.1.0
This will be renamed to 'q' in a future version.
interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
This optional parameter specifies the interpolation method to use,
when the desired quantile lies between two data points `i` and `j`:
* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j` whichever is nearest.
* midpoint: (`i` + `j`) / 2.
"""
).replace("\n", "", 1),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Examples"),
dedent(
"""\
>>> ser = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'])
>>> ser.expanding(min_periods=4).quantile(.25)
a NaN
b NaN
c NaN
d 1.75
e 2.00
f 2.25
dtype: float64
"""
),
window_method="expanding",
aggregation_description="quantile",
agg_method="quantile",
)
@deprecate_kwarg(old_arg_name="quantile", new_arg_name="q")
def quantile(
self,
q: float,
interpolation: QuantileInterpolation = "linear",
numeric_only: bool = False,
):
return super().quantile(
q=q,
interpolation=interpolation,
numeric_only=numeric_only,
)
@doc(
template_header,
".. versionadded:: 1.4.0 \n\n",
create_section_header("Parameters"),
dedent(
"""
method : {{'average', 'min', 'max'}}, default 'average'
How to rank the group of records that have the same value (i.e. ties):
* average: average rank of the group
* min: lowest rank in the group
* max: highest rank in the group
ascending : bool, default True
Whether or not the elements should be ranked in ascending order.
pct : bool, default False
Whether or not to display the returned rankings in percentile
form.
"""
).replace("\n", "", 1),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([1, 4, 2, 3, 5, 3])
>>> s.expanding().rank()
0 1.0
1 2.0
2 2.0
3 3.0
4 5.0
5 3.5
dtype: float64
>>> s.expanding().rank(method="max")
0 1.0
1 2.0
2 2.0
3 3.0
4 5.0
5 4.0
dtype: float64
>>> s.expanding().rank(method="min")
0 1.0
1 2.0
2 2.0
3 3.0
4 5.0
5 3.0
dtype: float64
"""
).replace("\n", "", 1),
window_method="expanding",
aggregation_description="rank",
agg_method="rank",
)
def rank(
self,
method: WindowingRankType = "average",
ascending: bool = True,
pct: bool = False,
numeric_only: bool = False,
):
return super().rank(
method=method,
ascending=ascending,
pct=pct,
numeric_only=numeric_only,
)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
other : Series or DataFrame, optional
If not supplied then will default to self and produce pairwise
output.
pairwise : bool, default None
If False then only matching columns between self and other will be
used and the output will be a DataFrame.
If True then all pairwise combinations will be calculated and the
output will be a MultiIndexed DataFrame in the case of DataFrame
inputs. In the case of missing elements, only complete pairwise
observations will be used.
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
"""
).replace("\n", "", 1),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also,
create_section_header("Examples"),
dedent(
"""\
>>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd'])
>>> ser1.expanding().cov(ser2)
a NaN
b 0.500000
c 1.500000
d 3.333333
dtype: float64
"""
),
window_method="expanding",
aggregation_description="sample covariance",
agg_method="cov",
)
def cov(
self,
other: DataFrame | Series | None = None,
pairwise: bool | None = None,
ddof: int = 1,
numeric_only: bool = False,
):
return super().cov(
other=other,
pairwise=pairwise,
ddof=ddof,
numeric_only=numeric_only,
)
@doc(
template_header,
create_section_header("Parameters"),
dedent(
"""
other : Series or DataFrame, optional
If not supplied then will default to self and produce pairwise
output.
pairwise : bool, default None
If False then only matching columns between self and other will be
used and the output will be a DataFrame.
If True then all pairwise combinations will be calculated and the
output will be a MultiIndexed DataFrame in the case of DataFrame
inputs. In the case of missing elements, only complete pairwise
observations will be used.
"""
).replace("\n", "", 1),
kwargs_numeric_only,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
dedent(
"""
cov : Similar method to calculate covariance.
numpy.corrcoef : NumPy Pearson's correlation calculation.
"""
).replace("\n", "", 1),
template_see_also,
create_section_header("Notes"),
dedent(
"""
This function uses Pearson's definition of correlation
(https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
When `other` is not specified, the output will be self correlation (e.g.
all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
set to `True`.
Function will return ``NaN`` for correlations of equal valued sequences;
this is the result of a 0/0 division error.
When `pairwise` is set to `False`, only matching columns between `self` and
`other` will be used.
When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
with the original index on the first level, and the `other` DataFrame
columns on the second level.
In the case of missing elements, only complete pairwise observations
will be used.\n
"""
),
create_section_header("Examples"),
dedent(
"""\
>>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
>>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd'])
>>> ser1.expanding().corr(ser2)
a NaN
b 1.000000
c 0.981981
d 0.975900
dtype: float64
"""
),
window_method="expanding",
aggregation_description="correlation",
agg_method="corr",
)
def corr(
self,
other: DataFrame | Series | None = None,
pairwise: bool | None = None,
ddof: int = 1,
numeric_only: bool = False,
):
return super().corr(
other=other,
pairwise=pairwise,
ddof=ddof,
numeric_only=numeric_only,
)
class ExpandingGroupby(BaseWindowGroupby, Expanding):
"""
Provide an expanding groupby implementation.
"""
_attributes = Expanding._attributes + BaseWindowGroupby._attributes
def _get_window_indexer(self) -> GroupbyIndexer:
"""
Return an indexer class that will compute the window start and end bounds
Returns
-------
GroupbyIndexer
"""
window_indexer = GroupbyIndexer(
groupby_indices=self._grouper.indices,
window_indexer=ExpandingIndexer,
)
return window_indexer
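ExpandingGroupby carries no usage example of its own; a minimal sketch of how it is reached from user code:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 2.0, 3.0, 4.0]})
df.groupby("g")["v"].expanding().sum()
# g
# a  0    1.0
#    1    3.0
# b  2    3.0
#    3    7.0
# Name: v, dtype: float64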


@@ -0,0 +1,351 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.util.numba_ import jit_user_function
if TYPE_CHECKING:
from pandas._typing import Scalar
@functools.cache
def generate_numba_apply_func(
func: Callable[..., Scalar],
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a numba jitted apply function specified by values from engine_kwargs.
1. jit the user's function
2. Return a rolling apply function with the jitted function inline
Configurations specified in engine_kwargs apply to both the user's
function _AND_ the rolling apply function.
Parameters
----------
func : function
Function to be applied to each window; it will be JITed.
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def roll_apply(
values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
*args: Any,
) -> np.ndarray:
result = np.empty(len(begin))
for i in numba.prange(len(result)):
start = begin[i]
stop = end[i]
window = values[start:stop]
count_nan = np.sum(np.isnan(window))
if len(window) - count_nan >= minimum_periods:
result[i] = numba_func(window, *args)
else:
result[i] = np.nan
return result
return roll_apply
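# Illustrative only (assumes numba is installed): an expanding mean through the
# generated kernel; begin/end hold the per-row window bounds.
#   apply = generate_numba_apply_func(lambda w: w.mean(), True, False, False)
#   vals = np.array([1.0, 2.0, 4.0])
#   apply(vals, np.zeros(3, np.int64), np.arange(1, 4), 1)
#   # -> array([1.0, 1.5, 2.333...])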
@functools.cache
def generate_numba_ewm_func(
nopython: bool,
nogil: bool,
parallel: bool,
com: float,
adjust: bool,
ignore_na: bool,
deltas: tuple,
normalize: bool,
):
"""
Generate a numba jitted ewm mean or sum function specified by values
from engine_kwargs.
Parameters
----------
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
com : float
adjust : bool
ignore_na : bool
deltas : tuple
normalize : bool
Returns
-------
Numba function
"""
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def ewm(
values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
) -> np.ndarray:
result = np.empty(len(values))
alpha = 1.0 / (1.0 + com)
old_wt_factor = 1.0 - alpha
new_wt = 1.0 if adjust else alpha
for i in numba.prange(len(begin)):
start = begin[i]
stop = end[i]
window = values[start:stop]
sub_result = np.empty(len(window))
weighted = window[0]
nobs = int(not np.isnan(weighted))
sub_result[0] = weighted if nobs >= minimum_periods else np.nan
old_wt = 1.0
for j in range(1, len(window)):
cur = window[j]
is_observation = not np.isnan(cur)
nobs += is_observation
if not np.isnan(weighted):
if is_observation or not ignore_na:
if normalize:
# note that len(deltas) = len(vals) - 1 and deltas[i]
# is to be used in conjunction with vals[i+1]
old_wt *= old_wt_factor ** deltas[start + j - 1]
else:
weighted = old_wt_factor * weighted
if is_observation:
if normalize:
# avoid numerical errors on constant series
if weighted != cur:
weighted = old_wt * weighted + new_wt * cur
if normalize:
weighted = weighted / (old_wt + new_wt)
if adjust:
old_wt += new_wt
else:
old_wt = 1.0
else:
weighted += cur
elif is_observation:
weighted = cur
sub_result[j] = weighted if nobs >= minimum_periods else np.nan
result[start:stop] = sub_result
return result
return ewm
@functools.cache
def generate_numba_table_func(
func: Callable[..., np.ndarray],
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a numba jitted function to apply window calculations table-wise.
Func will be passed a M window size x N number of columns array, and
must return a 1 x N number of columns array. Func is intended to operate
row-wise, but the result will be transposed for axis=1.
1. jit the user's function
2. Return a rolling apply function with the jitted function inline
Parameters
----------
func : function
Function to be applied to each window; it will be JITed.
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
numba_func = jit_user_function(func)
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def roll_table(
values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
*args: Any,
):
result = np.empty((len(begin), values.shape[1]))
min_periods_mask = np.empty(result.shape)
for i in numba.prange(len(result)):
start = begin[i]
stop = end[i]
window = values[start:stop]
count_nan = np.sum(np.isnan(window), axis=0)
sub_result = numba_func(window, *args)
nan_mask = len(window) - count_nan >= minimum_periods
min_periods_mask[i, :] = nan_mask
result[i, :] = sub_result
result = np.where(min_periods_mask, result, np.nan)
return result
return roll_table
# This function will no longer be needed once numba supports
# axis for all np.nan* agg functions
# https://github.com/numba/numba/issues/1269
@functools.cache
def generate_manual_numpy_nan_agg_with_axis(nan_func):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=True, nogil=True, parallel=True)
def nan_agg_with_axis(table):
result = np.empty(table.shape[1])
for i in numba.prange(table.shape[1]):
partition = table[:, i]
result[i] = nan_func(partition)
return result
return nan_agg_with_axis
@functools.cache
def generate_numba_ewm_table_func(
nopython: bool,
nogil: bool,
parallel: bool,
com: float,
adjust: bool,
ignore_na: bool,
deltas: tuple,
normalize: bool,
):
"""
Generate a numba jitted ewm mean or sum function applied table wise specified
by values from engine_kwargs.
Parameters
----------
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
com : float
adjust : bool
ignore_na : bool
deltas : tuple
normalize: bool
Returns
-------
Numba function
"""
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def ewm_table(
values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
) -> np.ndarray:
alpha = 1.0 / (1.0 + com)
old_wt_factor = 1.0 - alpha
new_wt = 1.0 if adjust else alpha
old_wt = np.ones(values.shape[1])
result = np.empty(values.shape)
weighted = values[0].copy()
nobs = (~np.isnan(weighted)).astype(np.int64)
result[0] = np.where(nobs >= minimum_periods, weighted, np.nan)
for i in range(1, len(values)):
cur = values[i]
is_observations = ~np.isnan(cur)
nobs += is_observations.astype(np.int64)
for j in numba.prange(len(cur)):
if not np.isnan(weighted[j]):
if is_observations[j] or not ignore_na:
if normalize:
# note that len(deltas) = len(vals) - 1 and deltas[i]
# is to be used in conjunction with vals[i+1]
old_wt[j] *= old_wt_factor ** deltas[i - 1]
else:
weighted[j] = old_wt_factor * weighted[j]
if is_observations[j]:
if normalize:
# avoid numerical errors on constant series
if weighted[j] != cur[j]:
weighted[j] = (
old_wt[j] * weighted[j] + new_wt * cur[j]
)
if normalize:
weighted[j] = weighted[j] / (old_wt[j] + new_wt)
if adjust:
old_wt[j] += new_wt
else:
old_wt[j] = 1.0
else:
weighted[j] += cur[j]
elif is_observations[j]:
weighted[j] = cur[j]
result[i] = np.where(nobs >= minimum_periods, weighted, np.nan)
return result
return ewm_table
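A hedged usage sketch for the ewm kernel above (requires numba; the argument values are illustrative). With unit deltas and a single window spanning all rows it reproduces Series.ewm(com=1.0).mean():

import numpy as np

ewm = generate_numba_ewm_func(
    nopython=True, nogil=False, parallel=False,
    com=1.0, adjust=True, ignore_na=False,
    deltas=(1.0, 1.0, 1.0), normalize=True,
)
vals = np.array([1.0, 2.0, 3.0, 4.0])
ewm(vals, np.array([0]), np.array([4]), 1)
# -> array([1.0, 1.6667, 2.4286, 3.2667]), matching
# pd.Series(vals).ewm(com=1.0).mean()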


@@ -0,0 +1,118 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.compat._optional import import_optional_dependency
def generate_online_numba_ewma_func(
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a numba jitted online ewma function specified by values
from engine_kwargs.
Parameters
----------
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def online_ewma(
values: np.ndarray,
deltas: np.ndarray,
minimum_periods: int,
old_wt_factor: float,
new_wt: float,
old_wt: np.ndarray,
adjust: bool,
ignore_na: bool,
):
"""
Compute online exponentially weighted mean per column over 2D values.
Takes the first observation as is, then computes the subsequent
exponentially weighted mean, accounting for minimum periods.
"""
result = np.empty(values.shape)
weighted_avg = values[0].copy()
nobs = (~np.isnan(weighted_avg)).astype(np.int64)
result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
for i in range(1, len(values)):
cur = values[i]
is_observations = ~np.isnan(cur)
nobs += is_observations.astype(np.int64)
for j in numba.prange(len(cur)):
if not np.isnan(weighted_avg[j]):
if is_observations[j] or not ignore_na:
# note that len(deltas) = len(vals) - 1 and deltas[i] is to be
# used in conjunction with vals[i+1]
old_wt[j] *= old_wt_factor ** deltas[j - 1]
if is_observations[j]:
# avoid numerical errors on constant series
if weighted_avg[j] != cur[j]:
weighted_avg[j] = (
(old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
) / (old_wt[j] + new_wt)
if adjust:
old_wt[j] += new_wt
else:
old_wt[j] = 1.0
elif is_observations[j]:
weighted_avg[j] = cur[j]
result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
return result, old_wt
return online_ewma
class EWMMeanState:
def __init__(self, com, adjust, ignore_na, axis, shape) -> None:
alpha = 1.0 / (1.0 + com)
self.axis = axis
self.shape = shape
self.adjust = adjust
self.ignore_na = ignore_na
self.new_wt = 1.0 if adjust else alpha
self.old_wt_factor = 1.0 - alpha
self.old_wt = np.ones(self.shape[self.axis - 1])
self.last_ewm = None
def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func):
result, old_wt = ewm_func(
weighted_avg,
deltas,
min_periods,
self.old_wt_factor,
self.new_wt,
self.old_wt,
self.adjust,
self.ignore_na,
)
self.old_wt = old_wt
self.last_ewm = result[-1]
return result
def reset(self) -> None:
self.old_wt = np.ones(self.shape[self.axis - 1])
self.last_ewm = None
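A minimal sketch (assumes numba is installed; shape, deltas, and min_periods are illustrative) of how EWMMeanState drives the online kernel:

import numpy as np

ewma_func = generate_online_numba_ewma_func(nopython=True, nogil=False, parallel=False)
state = EWMMeanState(com=1.0, adjust=True, ignore_na=False, axis=0, shape=(4, 1))
vals = np.array([[1.0], [2.0], [3.0], [4.0]])

state.run_ewm(vals, np.ones(len(vals) - 1), 1, ewma_func)
# -> one column holding [1.0, 1.6667, 2.4286, 3.2667]; state.last_ewm keeps the
# final row so a later batch can be seeded from it, and reset() starts over.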

File diff suppressed because it is too large