Updated script that can be controlled by Node.js web app

Author: mac OS
Date: 2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions


@@ -0,0 +1,239 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
if TYPE_CHECKING:
from pandas._typing import Scalar
import numpy as np
from pandas.compat._optional import import_optional_dependency
@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
nb_compat_func = numba.extending.register_jitable(func)
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def nb_looper(values, axis):
# Operate on the first row/col in order to get
# the output shape
if axis == 0:
first_elem = values[:, 0]
dim0 = values.shape[1]
else:
first_elem = values[0]
dim0 = values.shape[0]
res0 = nb_compat_func(first_elem)
# Use np.asarray to get shape for
# https://github.com/numba/numba/issues/4202#issuecomment-1185981507
buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
if axis == 0:
buf_shape = buf_shape[::-1]
buff = np.empty(buf_shape)
if axis == 1:
buff[0] = res0
for i in numba.prange(1, values.shape[0]):
buff[i] = nb_compat_func(values[i])
else:
buff[:, 0] = res0
for j in numba.prange(1, values.shape[1]):
buff[:, j] = nb_compat_func(values[:, j])
return buff
return nb_looper
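# A quick usage sketch (illustrative only; the kernel and data below are
# hypothetical, and numba must be installed):
import numpy as np

def row_total(window):
    # plain scalar kernel; register_jitable above lets numba inline it
    return window.sum()

looper = generate_apply_looper(row_total)
data = np.arange(12.0).reshape(3, 4)
looper(data, axis=1)  # shape (3, 1): row sums [[6.], [22.], [38.]]
looper(data, axis=0)  # shape (1, 4): column sums [[12., 15., 18., 21.]]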
@functools.cache
def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
if is_grouped_kernel:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
labels: np.ndarray,
ngroups: int,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
else:
@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def column_looper(
values: np.ndarray,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
*args,
):
result = np.empty((values.shape[0], len(start)), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, start, end, min_periods, *args
)
result[i] = output
if len(na_pos) > 0:
na_positions[i] = np.array(na_pos)
return result, na_positions
return column_looper
default_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int64,
np.dtype("int16"): np.int64,
np.dtype("int32"): np.int64,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint64,
np.dtype("uint16"): np.uint64,
np.dtype("uint32"): np.uint64,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex128,
np.dtype("complex128"): np.complex128,
}
# TODO: Preserve complex dtypes
float_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.float64,
np.dtype("int16"): np.float64,
np.dtype("int32"): np.float64,
np.dtype("int64"): np.float64,
np.dtype("uint8"): np.float64,
np.dtype("uint16"): np.float64,
np.dtype("uint32"): np.float64,
np.dtype("uint64"): np.float64,
np.dtype("float32"): np.float64,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.float64,
np.dtype("complex128"): np.float64,
}
identity_dtype_mapping: dict[np.dtype, Any] = {
np.dtype("int8"): np.int8,
np.dtype("int16"): np.int16,
np.dtype("int32"): np.int32,
np.dtype("int64"): np.int64,
np.dtype("uint8"): np.uint8,
np.dtype("uint16"): np.uint16,
np.dtype("uint32"): np.uint32,
np.dtype("uint64"): np.uint64,
np.dtype("float32"): np.float32,
np.dtype("float64"): np.float64,
np.dtype("complex64"): np.complex64,
np.dtype("complex128"): np.complex128,
}
def generate_shared_aggregator(
func: Callable[..., Scalar],
dtype_mapping: dict[np.dtype, np.dtype],
is_grouped_kernel: bool,
nopython: bool,
nogil: bool,
parallel: bool,
):
"""
Generate a Numba function that loops over the columns of a 2D object and
applies a 1D numba kernel over each column.
Parameters
----------
func : function
aggregation function to be applied to each column
dtype_mapping: dict or None
If not None, maps a dtype to a result dtype.
Otherwise, will fall back to default mapping.
is_grouped_kernel: bool, default False
Whether func operates using the group labels (True)
or using starts/ends arrays.
If True, you also need to pass the number of groups to this function.
nopython : bool
nopython to be passed into numba.jit
nogil : bool
nogil to be passed into numba.jit
parallel : bool
parallel to be passed into numba.jit
Returns
-------
Numba function
"""
# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode
# It also post-processes the values by inserting nans where number of observations
# is less than min_periods
# Cannot do this in numba nopython mode
# (you'll run into type-unification error when you cast int -> float)
def looper_wrapper(
values,
start=None,
end=None,
labels=None,
ngroups=None,
min_periods: int = 0,
**kwargs,
):
result_dtype = dtype_mapping[values.dtype]
column_looper = make_looper(
func, result_dtype, is_grouped_kernel, nopython, nogil, parallel
)
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
)
if result.dtype.kind == "i":
# Check whether na_positions is non-empty
# If so, convert the whole block
# This is OK since int dtype cannot hold nan,
# so if min_periods not satisfied for 1 col, it is not satisfied for
# all columns at that index
for na_pos in na_positions.values():
if len(na_pos) > 0:
result = result.astype("float64")
break
# TODO: Optimize this
for i, na_pos in na_positions.items():
if len(na_pos) > 0:
result[i, na_pos] = np.nan
return result
return looper_wrapper
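# A rough sketch of how the factory above is wired to a 1D kernel, here the
# sliding_mean kernel from the kernels package added later in this commit.
# Illustrative only; assumes numba is installed and the vendored layout
# matches upstream pandas.
import numpy as np
from pandas.core._numba.kernels import sliding_mean  # assumed import path

agg = generate_shared_aggregator(
    sliding_mean,
    float_dtype_mapping,
    is_grouped_kernel=False,
    nopython=True,
    nogil=True,
    parallel=False,
)
values = np.array([[1.0, 2.0, np.nan, 4.0]])    # one row per column of the frame
start = np.array([0, 0, 0, 0], dtype=np.int64)  # expanding-window bounds
end = np.array([1, 2, 3, 4], dtype=np.int64)
agg(values, start=start, end=end, min_periods=1)  # -> [[1.0, 1.5, 1.5, 2.333...]]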


@@ -0,0 +1,584 @@
# Disable type checking for this module since numba's internals
# are not typed, and we use numba's internals via its extension API
# mypy: ignore-errors
"""
Utility classes/functions to let numba recognize
pandas Index/Series/DataFrame
Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py
"""
from __future__ import annotations
from contextlib import contextmanager
import operator
import numba
from numba import types
from numba.core import cgutils
from numba.core.datamodel import models
from numba.core.extending import (
NativeValue,
box,
lower_builtin,
make_attribute_wrapper,
overload,
overload_attribute,
overload_method,
register_model,
type_callable,
typeof_impl,
unbox,
)
from numba.core.imputils import impl_ret_borrowed
import numpy as np
from pandas._libs import lib
from pandas.core.indexes.base import Index
from pandas.core.indexing import _iLocIndexer
from pandas.core.internals import SingleBlockManager
from pandas.core.series import Series
# Helper function to hack around the fact that Index casts numpy string dtype to object
#
# The idea is to set an attribute on an Index called _numba_data
# that is the original data, or the object data cast to numpy string dtype,
# with a context manager that unsets it afterwards
@contextmanager
def set_numba_data(index: Index):
numba_data = index._data
if numba_data.dtype == object:
if not lib.is_string_array(numba_data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
)
numba_data = numba_data.astype("U")
try:
index._numba_data = numba_data
yield index
finally:
del index._numba_data
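# For example (illustrative; the index below is hypothetical and pandas is
# assumed importable as pd):
import pandas as pd

idx = pd.Index(["a", "bc", "d"])        # object dtype holding strings
with set_numba_data(idx) as numba_idx:
    numba_idx._numba_data.dtype         # dtype('<U2'), which numba can type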
# TODO: Range index support
# (this currently lowers OK, but does not round-trip)
class IndexType(types.Type):
"""
The type class for Index objects.
"""
def __init__(self, dtype, layout, pyclass: any) -> None:
self.pyclass = pyclass
name = f"index({dtype}, {layout})"
self.dtype = dtype
self.layout = layout
super().__init__(name)
@property
def key(self):
return self.pyclass, self.dtype, self.layout
@property
def as_array(self):
return types.Array(self.dtype, 1, self.layout)
def copy(self, dtype=None, ndim: int = 1, layout=None):
assert ndim == 1
if dtype is None:
dtype = self.dtype
layout = layout or self.layout
return type(self)(dtype, layout, self.pyclass)
class SeriesType(types.Type):
"""
The type class for Series objects.
"""
def __init__(self, dtype, index, namety) -> None:
assert isinstance(index, IndexType)
self.dtype = dtype
self.index = index
self.values = types.Array(self.dtype, 1, "C")
self.namety = namety
name = f"series({dtype}, {index}, {namety})"
super().__init__(name)
@property
def key(self):
return self.dtype, self.index, self.namety
@property
def as_array(self):
return self.values
def copy(self, dtype=None, ndim: int = 1, layout: str = "C"):
assert ndim == 1
assert layout == "C"
if dtype is None:
dtype = self.dtype
return type(self)(dtype, self.index, self.namety)
@typeof_impl.register(Index)
def typeof_index(val, c):
"""
This will assume that only strings are in object dtype
index.
(you should check this before this gets lowered down to numba)
"""
# arrty = typeof_impl(val._data, c)
arrty = typeof_impl(val._numba_data, c)
assert arrty.ndim == 1
return IndexType(arrty.dtype, arrty.layout, type(val))
@typeof_impl.register(Series)
def typeof_series(val, c):
index = typeof_impl(val.index, c)
arrty = typeof_impl(val.values, c)
namety = typeof_impl(val.name, c)
assert arrty.ndim == 1
assert arrty.layout == "C"
return SeriesType(arrty.dtype, index, namety)
@type_callable(Series)
def type_series_constructor(context):
def typer(data, index, name=None):
if isinstance(index, IndexType) and isinstance(data, types.Array):
assert data.ndim == 1
if name is None:
name = types.intp
return SeriesType(data.dtype, index, name)
return typer
@type_callable(Index)
def type_index_constructor(context):
def typer(data, hashmap=None):
if isinstance(data, types.Array):
assert data.layout == "C"
assert data.ndim == 1
assert hashmap is None or isinstance(hashmap, types.DictType)
return IndexType(data.dtype, layout=data.layout, pyclass=Index)
return typer
# Backend extensions for Index and Series and Frame
@register_model(IndexType)
class IndexModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
# We don't want the numpy string scalar type in our hashmap
members = [
("data", fe_type.as_array),
# This is an attempt to emulate our hashtable code with a numba
# typed dict
# It maps from values in the index to their integer positions in the array
("hashmap", types.DictType(fe_type.dtype, types.intp)),
# Pointer to the Index object this was created from, or that it
# boxes to
# https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
("parent", types.pyobject),
]
models.StructModel.__init__(self, dmm, fe_type, members)
@register_model(SeriesType)
class SeriesModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [
("index", fe_type.index),
("values", fe_type.as_array),
("name", fe_type.namety),
]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IndexType, "data", "_data")
make_attribute_wrapper(IndexType, "hashmap", "hashmap")
make_attribute_wrapper(SeriesType, "index", "index")
make_attribute_wrapper(SeriesType, "values", "values")
make_attribute_wrapper(SeriesType, "name", "name")
@lower_builtin(Series, types.Array, IndexType)
def pdseries_constructor(context, builder, sig, args):
data, index = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = context.get_constant(types.intp, 0)
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Series, types.Array, IndexType, types.intp)
@lower_builtin(Series, types.Array, IndexType, types.float64)
@lower_builtin(Series, types.Array, IndexType, types.unicode_type)
def pdseries_constructor_with_name(context, builder, sig, args):
data, index, name = args
series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
series.index = index
series.values = data
series.name = name
return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())
@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
(data, hashmap, parent) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
index.parent = parent
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
# Basically same as index_constructor_1arg, but also lets you specify the
# parent object
(data, hashmap) = args
index = cgutils.create_struct_proxy(sig.return_type)(context, builder)
index.data = data
index.hashmap = hashmap
return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())
@lower_builtin(Index, types.Array)
def index_constructor_1arg(context, builder, sig, args):
from numba.typed import Dict
key_type = sig.return_type.dtype
value_type = types.intp
def index_impl(data):
return Index(data, Dict.empty(key_type, value_type))
return context.compile_internal(builder, index_impl, sig, args)
# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
# (regular string)
def maybe_cast_str(x):
# Dummy function that numba can overload
pass
@overload(maybe_cast_str)
def maybe_cast_str_impl(x):
"""Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
Is a no-op for other types."""
if isinstance(x, types.UnicodeCharSeq):
return lambda x: str(x)
else:
return lambda x: x
@unbox(IndexType)
def unbox_index(typ, obj, c):
"""
Convert a Index object to a native structure.
Note: Object dtype is not allowed here
"""
data_obj = c.pyapi.object_getattr_string(obj, "_numba_data")
index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
# If we see an object array, assume it's been validated as only containing strings
# We still need to do the conversion though
index.data = c.unbox(typ.as_array, data_obj).value
typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
# Create an empty typed dict in numba for the hashmap for indexing
# equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
hashmap_obj = c.pyapi.call_method(
typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
)
index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
# Set the parent for speedy boxing.
index.parent = obj
# Decrefs
c.pyapi.decref(data_obj)
c.pyapi.decref(arr_type_obj)
c.pyapi.decref(intp_type_obj)
c.pyapi.decref(typed_dict_obj)
return NativeValue(index._getvalue())
@unbox(SeriesType)
def unbox_series(typ, obj, c):
"""
Convert a Series object to a native structure.
"""
index_obj = c.pyapi.object_getattr_string(obj, "index")
values_obj = c.pyapi.object_getattr_string(obj, "values")
name_obj = c.pyapi.object_getattr_string(obj, "name")
series = cgutils.create_struct_proxy(typ)(c.context, c.builder)
series.index = c.unbox(typ.index, index_obj).value
series.values = c.unbox(typ.values, values_obj).value
series.name = c.unbox(typ.namety, name_obj).value
# Decrefs
c.pyapi.decref(index_obj)
c.pyapi.decref(values_obj)
c.pyapi.decref(name_obj)
return NativeValue(series._getvalue())
@box(IndexType)
def box_index(typ, val, c):
"""
Convert a native index structure to a Index object.
If our native index is of a numpy string dtype, we'll cast it to
object.
"""
# First build a Numpy array object, then wrap it in a Index
index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
res = cgutils.alloca_once_value(c.builder, index.parent)
# Does parent exist?
# (it means already boxed once, or Index same as original df.index or df.columns)
# xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (
has_parent,
otherwise,
):
with has_parent:
c.pyapi.incref(index.parent)
with otherwise:
# TODO: preserve the original class for the index
# Also need to preserve the name of the Index
# class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
array_obj = c.box(typ.as_array, index.data)
if isinstance(typ.dtype, types.UnicodeCharSeq):
# We converted to numpy string dtype, convert back
# to object since _simple_new won't do that for us
object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
c.pyapi.decref(object_str_obj)
# this is basically Index._simple_new(array_obj, name_obj) in python
index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
index.parent = index_obj
c.builder.store(index_obj, res)
# Decrefs
c.pyapi.decref(class_obj)
c.pyapi.decref(array_obj)
return c.builder.load(res)
@box(SeriesType)
def box_series(typ, val, c):
"""
Convert a native series structure to a Series object.
"""
series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr))
mgr_const_obj = c.pyapi.unserialize(
c.pyapi.serialize_object(SingleBlockManager.from_array)
)
index_obj = c.box(typ.index, series.index)
array_obj = c.box(typ.as_array, series.values)
name_obj = c.box(typ.namety, series.name)
# This is basically equivalent of
# pd.Series(data=array_obj, index=index_obj)
# To improve perf, we will construct the Series from a manager
# object to avoid checks.
# We'll also set the name attribute manually to avoid validation
mgr_obj = c.pyapi.call_function_objargs(
mgr_const_obj,
(
array_obj,
index_obj,
),
)
mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes")
# Series._constructor_from_mgr(mgr, axes)
series_obj = c.pyapi.call_function_objargs(
series_const_obj, (mgr_obj, mgr_axes_obj)
)
c.pyapi.object_setattr_string(series_obj, "_name", name_obj)
# Decrefs
c.pyapi.decref(series_const_obj)
c.pyapi.decref(mgr_axes_obj)
c.pyapi.decref(mgr_obj)
c.pyapi.decref(mgr_const_obj)
c.pyapi.decref(index_obj)
c.pyapi.decref(array_obj)
c.pyapi.decref(name_obj)
return series_obj
# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)
def generate_series_reduction(ser_reduction, ser_method):
@overload_method(SeriesType, ser_reduction)
def series_reduction(series):
def series_reduction_impl(series):
return ser_method(series.values)
return series_reduction_impl
return series_reduction
def generate_series_binop(binop):
@overload(binop)
def series_binop(series1, value):
if isinstance(series1, SeriesType):
if isinstance(value, SeriesType):
def series_binop_impl(series1, series2):
# TODO: Check index matching?
return Series(
binop(series1.values, series2.values),
series1.index,
series1.name,
)
return series_binop_impl
else:
def series_binop_impl(series1, value):
return Series(
binop(series1.values, value), series1.index, series1.name
)
return series_binop_impl
return series_binop
series_reductions = [
("sum", np.sum),
("mean", np.mean),
# Disabled due to discrepancies between numba std. dev
# and pandas std. dev (no way to specify dof)
# ("std", np.std),
# ("var", np.var),
("min", np.min),
("max", np.max),
]
for reduction, reduction_method in series_reductions:
generate_series_reduction(reduction, reduction_method)
series_binops = [operator.add, operator.sub, operator.mul, operator.truediv]
for ser_binop in series_binops:
generate_series_binop(ser_binop)
# get_loc on Index
@overload_method(IndexType, "get_loc")
def index_get_loc(index, item):
def index_get_loc_impl(index, item):
# Initialize the hash table if not initialized
if len(index.hashmap) == 0:
for i, val in enumerate(index._data):
index.hashmap[val] = i
return index.hashmap[item]
return index_get_loc_impl
# Indexing for Series/Index
@overload(operator.getitem)
def series_indexing(series, item):
if isinstance(series, SeriesType):
def series_getitem(series, item):
loc = series.index.get_loc(item)
return series.iloc[loc]
return series_getitem
@overload(operator.getitem)
def index_indexing(index, idx):
if isinstance(index, IndexType):
def index_getitem(index, idx):
return index._data[idx]
return index_getitem
class IlocType(types.Type):
def __init__(self, obj_type) -> None:
self.obj_type = obj_type
name = f"iLocIndexer({obj_type})"
super().__init__(name=name)
@property
def key(self):
return self.obj_type
@typeof_impl.register(_iLocIndexer)
def typeof_iloc(val, c):
objtype = typeof_impl(val.obj, c)
return IlocType(objtype)
@type_callable(_iLocIndexer)
def type_iloc_constructor(context):
def typer(obj):
if isinstance(obj, SeriesType):
return IlocType(obj)
return typer
@lower_builtin(_iLocIndexer, SeriesType)
def iloc_constructor(context, builder, sig, args):
(obj,) = args
iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder)
iloc_indexer.obj = obj
return impl_ret_borrowed(
context, builder, sig.return_type, iloc_indexer._getvalue()
)
@register_model(IlocType)
class ILocModel(models.StructModel):
def __init__(self, dmm, fe_type) -> None:
members = [("obj", fe_type.obj_type)]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(IlocType, "obj", "obj")
@overload_attribute(SeriesType, "iloc")
def series_iloc(series):
def get(series):
return _iLocIndexer(series)
return get
@overload(operator.getitem)
def iloc_getitem(iloc_indexer, i):
if isinstance(iloc_indexer, IlocType):
def getitem_impl(iloc_indexer, i):
return iloc_indexer.obj.values[i]
return getitem_impl
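# Taken together, the typing, data models and boxing/unboxing above are what
# let a numba-compiled user function receive and return real pandas objects.
# A rough sketch of the public entry point that exercises this machinery
# (illustrative; assumes pandas >= 2.2 semantics for engine="numba" and that
# numba is installed):
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
# Each row is unboxed into the native Series model defined above, s.sum()
# resolves to the overload registered by generate_series_reduction, and the
# result is boxed back into pandas objects.
df.apply(lambda s: s.sum(), axis=1, engine="numba")
# -> a Series of row sums [5.0, 7.0, 9.0] under the default RangeIndex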


@@ -0,0 +1,27 @@
from pandas.core._numba.kernels.mean_ import (
grouped_mean,
sliding_mean,
)
from pandas.core._numba.kernels.min_max_ import (
grouped_min_max,
sliding_min_max,
)
from pandas.core._numba.kernels.sum_ import (
grouped_sum,
sliding_sum,
)
from pandas.core._numba.kernels.var_ import (
grouped_var,
sliding_var,
)
__all__ = [
"sliding_mean",
"grouped_mean",
"sliding_sum",
"grouped_sum",
"sliding_var",
"grouped_var",
"sliding_min_max",
"grouped_min_max",
]


@@ -0,0 +1,196 @@
"""
Numba 1D mean kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
from pandas.core._numba.kernels.shared import is_monotonic_increasing
from pandas.core._numba.kernels.sum_ import grouped_kahan_sum
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
val: float,
nobs: int,
sum_x: float,
neg_ct: int,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, int, float, int, float]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct += 1
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val < 0:
neg_ct -= 1
return nobs, sum_x, neg_ct, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_add = 0.0
compensation_remove = 0.0
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, neg_ct, compensation_remove = remove_mean(
val, nobs, sum_x, neg_ct, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs >= min_periods and nobs > 0:
result = sum_x / nobs
if num_consecutive_same_value >= nobs:
result = prev_value
elif neg_ct == 0 and result < 0:
result = 0
elif neg_ct == nobs and result > 0:
result = 0
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0.0
neg_ct = 0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_mean(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = np.nan
result /= nobs
output[lab] = result
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
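# A small worked example of sliding_mean with expanding-window bounds
# (illustrative; assumes numba is installed):
import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0])
start = np.array([0, 0, 0, 0], dtype=np.int64)  # window always starts at 0
end = np.array([1, 2, 3, 4], dtype=np.int64)    # ...and grows by one each step
out, na_pos = sliding_mean(values, np.float64, start, end, 1)
# out -> [1.0, 1.5, 1.5, 2.333...]; the NaN is skipped, and na_pos stays empty
# because a float result can hold missing values directly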


@@ -0,0 +1,125 @@
"""
Numba 1D min/max kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
output = np.empty(N, dtype=result_dtype)
na_pos = []
# Use deque once numba supports it
# https://github.com/numba/numba/issues/7417
Q: list = []
W: list = []
for i in range(N):
curr_win_size = end[i] - start[i]
if i == 0:
st = start[i]
else:
st = end[i - 1]
for k in range(st, end[i]):
ai = values[k]
if not np.isnan(ai):
nobs += 1
elif is_max:
ai = -np.inf
else:
ai = np.inf
# Discard previous entries if we find new min or max
if is_max:
while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
else:
while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
Q.pop()
Q.append(k)
W.append(k)
# Discard entries outside and left of current window
while Q and Q[0] <= start[i] - 1:
Q.pop(0)
while W and W[0] <= start[i] - 1:
if not np.isnan(values[W[0]]):
nobs -= 1
W.pop(0)
# Save output based on index in input value array
if Q and curr_win_size > 0 and nobs >= min_periods:
output[i] = values[Q[0]]
else:
if values.dtype.kind != "i":
output[i] = np.nan
else:
na_pos.append(i)
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
is_max: bool,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
na_pos = []
output = np.empty(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
continue
if nobs[lab] == 1:
# First element in group, set output equal to this
output[lab] = val
continue
if is_max:
if val > output[lab]:
output[lab] = val
else:
if val < output[lab]:
output[lab] = val
# Set labels that don't satisfy min_periods as np.nan
for lab, count in enumerate(nobs):
if count < min_periods:
na_pos.append(lab)
return output, na_pos
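# A small worked example of the grouped kernel above (illustrative; assumes
# numba is installed):
import numpy as np

values = np.array([3.0, 1.0, np.nan, 5.0])
labels = np.array([0, 0, 1, 1], dtype=np.intp)  # group id of each value
out, na_pos = grouped_min_max(
    values, np.float64, labels, ngroups=2, min_periods=1, is_max=True
)
# out -> [3.0, 5.0]; the NaN never becomes a candidate and, with min_periods
# satisfied for both groups, na_pos stays empty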


@@ -0,0 +1,29 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
if TYPE_CHECKING:
import numpy as np
@numba.jit(
# error: Any? not callable
numba.boolean(numba.int64[:]), # type: ignore[misc]
nopython=True,
nogil=True,
parallel=False,
)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
"""Check if int64 values are monotonically increasing."""
n = len(bounds)
if n < 2:
return True
prev = bounds[0]
for i in range(1, n):
cur = bounds[i]
if cur < prev:
return False
prev = cur
return True


@@ -0,0 +1,244 @@
"""
Numba 1D sum kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numba
from numba.extending import register_jitable
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
val: Any,
nobs: int,
sum_x: Any,
compensation: Any,
num_consecutive_same_value: int,
prev_value: Any,
) -> tuple[int, Any, Any, int, Any]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_sum(
val: Any, nobs: int, sum_x: Any, compensation: Any
) -> tuple[int, Any, Any]:
if not np.isnan(val):
nobs -= 1
y = -val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
return nobs, sum_x, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_sum(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
dtype = values.dtype
na_val: object = np.nan
if dtype.kind == "i":
na_val = 0
N = len(start)
nobs = 0
sum_x = 0
compensation_add = 0
compensation_remove = 0
na_pos = []
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, sum_x, compensation_remove = remove_sum(
val, nobs, sum_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs == 0 == min_periods:
result: object = 0
elif nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = na_val
if dtype.kind == "i":
na_pos.append(i)
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
sum_x = 0
compensation_remove = 0
return output, na_pos
# Mypy/pyright don't like the fact that the decorator is untyped
@register_jitable # type: ignore[misc]
def grouped_kahan_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
sum_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = sum_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
return output, nobs_arr, comp_arr, consecutive_counts, prev_vals
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_sum(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
) -> tuple[np.ndarray, list[int]]:
na_pos = []
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
)
# Post-processing, replace sums that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
sum_x = output[lab]
if nobs >= min_periods:
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = sum_x # Don't change val, will be replaced by nan later
na_pos.append(lab)
output[lab] = result
return output, na_pos
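# A small worked example of grouped_sum (illustrative; assumes numba is
# installed). The per-group Kahan compensation carried by grouped_kahan_sum
# keeps floating-point error bounded even for long groups.
import numpy as np

values = np.array([1.0, 2.0, 3.0, 4.0])
labels = np.array([0, 1, 0, 1], dtype=np.intp)
out, na_pos = grouped_sum(values, np.float64, labels, ngroups=2, min_periods=1)
# out -> [4.0, 6.0]: values 1 and 3 land in group 0, values 2 and 4 in group 1;
# with min_periods=3 instead, both groups would be flagged in na_pos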


@@ -0,0 +1,245 @@
"""
Numba 1D var kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding
Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numba
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
from pandas.core._numba.kernels.shared import is_monotonic_increasing
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
val: float,
nobs: int,
mean_x: float,
ssqdm_x: float,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, float, float, int, float]:
if not np.isnan(val):
if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val
nobs += 1
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
if nobs:
mean_x += delta / nobs
else:
mean_x = 0
ssqdm_x += (val - prev_mean) * (val - mean_x)
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_var(
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
if not np.isnan(val):
nobs -= 1
if nobs:
prev_mean = mean_x - compensation
y = val - compensation
t = y - mean_x
compensation = t + mean_x - y
delta = t
mean_x -= delta / nobs
ssqdm_x -= (val - prev_mean) * (val - mean_x)
else:
mean_x = 0
ssqdm_x = 0
return nobs, mean_x, ssqdm_x, compensation
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_var(
values: np.ndarray,
result_dtype: np.dtype,
start: np.ndarray,
end: np.ndarray,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(start)
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_add = 0.0
compensation_remove = 0.0
min_periods = max(min_periods, 1)
is_monotonic_increasing_bounds = is_monotonic_increasing(
start
) and is_monotonic_increasing(end)
output = np.empty(N, dtype=result_dtype)
for i in range(N):
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
val = values[j]
nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
val, nobs, mean_x, ssqdm_x, compensation_remove
)
for j in range(end[i - 1], e):
val = values[j]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[i] = result
if not is_monotonic_increasing_bounds:
nobs = 0
mean_x = 0.0
ssqdm_x = 0.0
compensation_remove = 0.0
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_var(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs_arr = np.zeros(ngroups, dtype=np.int64)
comp_arr = np.zeros(ngroups, dtype=values.dtype)
consecutive_counts = np.zeros(ngroups, dtype=np.int64)
prev_vals = np.zeros(ngroups, dtype=values.dtype)
output = np.zeros(ngroups, dtype=result_dtype)
means = np.zeros(ngroups, dtype=result_dtype)
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
continue
mean_x = means[lab]
ssqdm_x = output[lab]
nobs = nobs_arr[lab]
compensation_add = comp_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
prev_value = prev_vals[lab]
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
output[lab] = ssqdm_x
means[lab] = mean_x
consecutive_counts[lab] = num_consecutive_same_value
prev_vals[lab] = prev_value
comp_arr[lab] = compensation_add
nobs_arr[lab] = nobs
# Post-processing, replace vars that don't satisfy min_periods
for lab in range(ngroups):
nobs = nobs_arr[lab]
num_consecutive_same_value = consecutive_counts[lab]
ssqdm_x = output[lab]
if nobs >= min_periods and nobs > ddof:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
else:
result = np.nan
output[lab] = result
# Second pass to get the std.dev
# na_position is empty list since float64 can already hold nans
# Do list comprehension, since numba cannot figure out that na_pos is
# empty list of ints on its own
na_pos = [0 for i in range(0)]
return output, na_pos
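# A small worked example of grouped_var (illustrative; assumes numba is
# installed):
import numpy as np

values = np.array([1.0, 2.0, 3.0, 4.0, 10.0])
labels = np.array([0, 0, 0, 0, 1], dtype=np.intp)
out, na_pos = grouped_var(
    values, np.float64, labels, ngroups=2, min_periods=1, ddof=1
)
# out -> [1.666..., nan]: group 0 is the sample variance of [1, 2, 3, 4],
# while group 1 has a single observation, so nobs <= ddof and it becomes NaN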


@@ -0,0 +1,340 @@
"""
accessor.py contains base classes for implementing accessor properties
that can be mixed into or pinned onto other pandas classes.
"""
from __future__ import annotations
from typing import (
Callable,
final,
)
import warnings
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
class DirNamesMixin:
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset()
@final
def _dir_deletions(self) -> set[str]:
"""
Delete unwanted __dir__ for this object.
"""
return self._accessors | self._hidden_attrs
def _dir_additions(self) -> set[str]:
"""
Add additional __dir__ for this object.
"""
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
def __dir__(self) -> list[str]:
"""
Provide method name lookup and completion.
Notes
-----
Only provide 'public' methods.
"""
rv = set(super().__dir__())
rv = (rv - self._dir_deletions()) | self._dir_additions()
return sorted(rv)
class PandasDelegate:
"""
Abstract base class for delegating methods/properties.
"""
def _delegate_property_get(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot access the property {name}")
def _delegate_property_set(self, name: str, value, *args, **kwargs):
raise TypeError(f"The property {name} cannot be set")
def _delegate_method(self, name: str, *args, **kwargs):
raise TypeError(f"You cannot call method {name}")
@classmethod
def _add_delegate_accessors(
cls,
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
) -> None:
"""
Add accessors to cls from the delegate class.
Parameters
----------
cls
Class to add the methods/properties to.
delegate
Class to get methods/properties and doc-strings.
accessors : list of str
List of accessors to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
"""
def _create_delegator_property(name: str):
def _getter(self):
return self._delegate_property_get(name)
def _setter(self, new_values):
return self._delegate_property_set(name, new_values)
_getter.__name__ = name
_setter.__name__ = name
return property(
fget=_getter,
fset=_setter,
doc=getattr(delegate, accessor_mapping(name)).__doc__,
)
def _create_delegator_method(name: str):
def f(self, *args, **kwargs):
return self._delegate_method(name, *args, **kwargs)
f.__name__ = name
f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
return f
for name in accessors:
if (
not raise_on_missing
and getattr(delegate, accessor_mapping(name), None) is None
):
continue
if typ == "property":
f = _create_delegator_property(name)
else:
f = _create_delegator_method(name)
# don't overwrite existing methods/properties
if overwrite or not hasattr(cls, name):
setattr(cls, name, f)
def delegate_names(
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
):
"""
Add delegated names to a class using a class decorator. This provides
an alternative usage to directly calling `_add_delegate_accessors`
below a class definition.
Parameters
----------
delegate : object
The class to get methods/properties & doc-strings.
accessors : Sequence[str]
List of accessor to add.
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
Returns
-------
callable
A class decorator.
Examples
--------
@delegate_names(Categorical, ["categories", "ordered"], "property")
class CategoricalAccessor(PandasDelegate):
[...]
"""
def add_delegate_accessors(cls):
cls._add_delegate_accessors(
delegate,
accessors,
typ,
overwrite=overwrite,
accessor_mapping=accessor_mapping,
raise_on_missing=raise_on_missing,
)
return cls
return add_delegate_accessors
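# A minimal sketch of the decorator in use (illustrative; the accessor class
# below is hypothetical):
import pandas as pd

@delegate_names(pd.Categorical, ["categories", "ordered"], typ="property")
class ToyCategoricalAccessor(PandasDelegate):
    def __init__(self, cat) -> None:
        self._cat = cat

    def _delegate_property_get(self, name, *args, **kwargs):
        return getattr(self._cat, name)

acc = ToyCategoricalAccessor(pd.Categorical(["a", "b", "a"]))
acc.categories  # forwarded to the wrapped Categorical -> Index(['a', 'b'], ...)
acc.ordered     # False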
# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning
class CachedAccessor:
"""
Custom property-like object.
A descriptor for caching accessors.
Parameters
----------
name : str
Namespace that will be accessed under, e.g. ``df.foo``.
accessor : cls
Class with the extension methods.
Notes
-----
The accessor class's __init__ method assumes that one of
``Series``, ``DataFrame`` or ``Index`` is passed as the
single argument ``data``.
"""
def __init__(self, name: str, accessor) -> None:
self._name = name
self._accessor = accessor
def __get__(self, obj, cls):
if obj is None:
# we're accessing the attribute of the class, i.e., Dataset.geo
return self._accessor
accessor_obj = self._accessor(obj)
# Replace the property with the accessor object. Inspired by:
# https://www.pydanny.com/cached-property.html
# We need to use object.__setattr__ because we overwrite __setattr__ on
# NDFrame
object.__setattr__(obj, self._name, accessor_obj)
return accessor_obj
@doc(klass="", others="")
def _register_accessor(name: str, cls):
"""
Register a custom accessor on {klass} objects.
Parameters
----------
name : str
Name under which the accessor should be registered. A warning is issued
if this name conflicts with a preexisting attribute.
Returns
-------
callable
A class decorator.
See Also
--------
register_dataframe_accessor : Register a custom accessor on DataFrame objects.
register_series_accessor : Register a custom accessor on Series objects.
register_index_accessor : Register a custom accessor on Index objects.
Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be
.. code-block:: python
def __init__(self, pandas_object): # noqa: E999
...
For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.
>>> pd.Series(['a', 'b']).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values
Examples
--------
In your library code::
import pandas as pd
@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))
def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass
Back in an interactive IPython session:
.. code-block:: ipython
In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
...: "latitude": np.linspace(0, 20)}})
In [2]: ds.geo.center
Out[2]: (5.0, 10.0)
In [3]: ds.geo.plot() # plots data on a map
"""
def decorator(accessor):
if hasattr(cls, name):
warnings.warn(
f"registration of accessor {repr(accessor)} under name "
f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
f"attribute with the same name.",
UserWarning,
stacklevel=find_stack_level(),
)
setattr(cls, name, CachedAccessor(name, accessor))
cls._accessors.add(name)
return accessor
return decorator
@doc(_register_accessor, klass="DataFrame")
def register_dataframe_accessor(name: str):
from pandas import DataFrame
return _register_accessor(name, DataFrame)
@doc(_register_accessor, klass="Series")
def register_series_accessor(name: str):
from pandas import Series
return _register_accessor(name, Series)
@doc(_register_accessor, klass="Index")
def register_index_accessor(name: str):
from pandas import Index
return _register_accessor(name, Index)

File diff suppressed because it is too large


@@ -0,0 +1,140 @@
from pandas._libs import (
NaT,
Period,
Timedelta,
Timestamp,
)
from pandas._libs.missing import NA
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import (
isna,
isnull,
notna,
notnull,
)
from pandas.core.algorithms import (
factorize,
unique,
value_counts,
)
from pandas.core.arrays import Categorical
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import array
from pandas.core.flags import Flags
from pandas.core.groupby import (
Grouper,
NamedAgg,
)
from pandas.core.indexes.api import (
CategoricalIndex,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
PeriodIndex,
RangeIndex,
TimedeltaIndex,
)
from pandas.core.indexes.datetimes import (
bdate_range,
date_range,
)
from pandas.core.indexes.interval import (
Interval,
interval_range,
)
from pandas.core.indexes.period import period_range
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.indexing import IndexSlice
from pandas.core.series import Series
from pandas.core.tools.datetimes import to_datetime
from pandas.core.tools.numeric import to_numeric
from pandas.core.tools.timedeltas import to_timedelta
from pandas.io.formats.format import set_eng_float_format
from pandas.tseries.offsets import DateOffset
# DataFrame needs to be imported after NamedAgg to avoid a circular import
from pandas.core.frame import DataFrame # isort:skip
__all__ = [
"array",
"ArrowDtype",
"bdate_range",
"BooleanDtype",
"Categorical",
"CategoricalDtype",
"CategoricalIndex",
"DataFrame",
"DateOffset",
"date_range",
"DatetimeIndex",
"DatetimeTZDtype",
"factorize",
"Flags",
"Float32Dtype",
"Float64Dtype",
"Grouper",
"Index",
"IndexSlice",
"Int16Dtype",
"Int32Dtype",
"Int64Dtype",
"Int8Dtype",
"Interval",
"IntervalDtype",
"IntervalIndex",
"interval_range",
"isna",
"isnull",
"MultiIndex",
"NA",
"NamedAgg",
"NaT",
"notna",
"notnull",
"Period",
"PeriodDtype",
"PeriodIndex",
"period_range",
"RangeIndex",
"Series",
"set_eng_float_format",
"StringDtype",
"Timedelta",
"TimedeltaIndex",
"timedelta_range",
"Timestamp",
"to_datetime",
"to_numeric",
"to_timedelta",
"UInt16Dtype",
"UInt32Dtype",
"UInt64Dtype",
"UInt8Dtype",
"unique",
"value_counts",
]

File diff suppressed because it is too large


@@ -0,0 +1,9 @@
"""
core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
These should:
- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
- Assume that any list arguments have already been cast to ndarray/EA.
- Not depend on Index, Series, or DataFrame, nor import any of these.
- May dispatch to ExtensionArray methods, but should not import from core.arrays.
"""


@@ -0,0 +1,67 @@
"""
datetimelke_accumulations.py is for accumulations of datetimelike extension arrays
"""
from __future__ import annotations
from typing import Callable
import numpy as np
from pandas._libs import iNaT
from pandas.core.dtypes.missing import isna
def _cum_func(
func: Callable,
values: np.ndarray,
*,
skipna: bool = True,
):
"""
Accumulations for 1D datetimelike arrays.
Parameters
----------
func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation). The array is modified in place.
skipna : bool, default True
Whether to skip NA.
"""
try:
fill_value = {
np.maximum.accumulate: np.iinfo(np.int64).min,
np.cumsum: 0,
np.minimum.accumulate: np.iinfo(np.int64).max,
}[func]
except KeyError:
raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")
mask = isna(values)
y = values.view("i8")
y[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
result = func(y)
result[mask] = iNaT
if values.dtype.kind in "mM":
return result.view(values.dtype.base)
return result
def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
return _cum_func(np.cumsum, values, skipna=skipna)
def cummin(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, skipna=skipna)
def cummax(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, skipna=skipna)
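# A small worked example (illustrative). The helpers operate on the i8 view of
# the input, so callers pass a copy when the original array must be preserved:
import numpy as np

stamps = np.array(["2024-01-02", "NaT", "2024-01-01"], dtype="datetime64[ns]")
cummax(stamps.copy(), skipna=True)
# -> ['2024-01-02', 'NaT', '2024-01-02']: the NaT slot is restored after the
#    accumulation, and later positions still see the running maximum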


@@ -0,0 +1,90 @@
"""
masked_accumulations.py is for accumulation algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import numpy as np
if TYPE_CHECKING:
from pandas._typing import npt
def _cum_func(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
):
"""
Accumulations for 1D masked array.
We will modify values in place to replace NAs with the appropriate fill value.
Parameters
----------
func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
"""
dtype_info: np.iinfo | np.finfo
if values.dtype.kind == "f":
dtype_info = np.finfo(values.dtype.type)
elif values.dtype.kind in "iu":
dtype_info = np.iinfo(values.dtype.type)
elif values.dtype.kind == "b":
# Max value of bool is 1, but since we are setting into a boolean
# array, 255 is fine as well. Min value has to be 0 when setting
# into the boolean array.
dtype_info = np.iinfo(np.uint8)
else:
raise NotImplementedError(
f"No masked accumulation defined for dtype {values.dtype.type}"
)
try:
fill_value = {
np.cumprod: 1,
np.maximum.accumulate: dtype_info.min,
np.cumsum: 0,
np.minimum.accumulate: dtype_info.max,
}[func]
except KeyError:
raise NotImplementedError(
f"No accumulation for {func} implemented on BaseMaskedArray"
)
values[mask] = fill_value
if not skipna:
mask = np.maximum.accumulate(mask)
values = func(values)
return values, mask
def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumsum, values, mask, skipna=skipna)
def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.cumprod, values, mask, skipna=skipna)
def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
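# A small worked example (illustrative); this mirrors how the masked extension
# arrays call these helpers, with values filled in place and the mask carried
# alongside the data:
import numpy as np

vals = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, True, False, False])  # True marks a missing value
out, out_mask = cumsum(vals.copy(), mask.copy(), skipna=True)
# out -> [1, 1, 4, 8]; the masked slot holds the fill value (0) but stays
# masked, so downstream code still treats it as missing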


@@ -0,0 +1,197 @@
"""
masked_reductions.py is for reduction algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Callable,
)
import warnings
import numpy as np
from pandas._libs import missing as libmissing
from pandas.core.nanops import check_below_min_count
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
npt,
)
def _reductions(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
**kwargs,
):
"""
Sum, mean or product for 1D masked array.
Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values, axis=axis, **kwargs)
else:
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA
return func(values, where=~mask, axis=axis, **kwargs)
def sum(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def prod(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
min_count: int = 0,
axis: AxisInt | None = None,
):
return _reductions(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)
def _minmax(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
"""
Reduction for 1D masked array.
Parameters
----------
func : np.min or np.max
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
mask : np.ndarray[bool]
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
return func(values, axis=axis)
else:
subset = values[~mask]
if subset.size:
return func(subset, axis=axis)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
def min(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
def max(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
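# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical): min/max drop masked values
# when skipna=True and return NA as soon as any value is masked otherwise.
def _demo_masked_minmax() -> None:
    values = np.array([3.0, 7.0, 1.0])
    mask = np.array([False, True, False])
    assert max(values, mask, skipna=True) == 3.0
    assert min(values, mask, skipna=True) == 1.0
    assert max(values, mask, skipna=False) is libmissing.NA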
def mean(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
):
if not values.size or mask.all():
return libmissing.NA
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
def var(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)
def std(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: AxisInt | None = None,
ddof: int = 1,
):
if not values.size or mask.all():
return libmissing.NA
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
return _reductions(
np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
)

View File

@ -0,0 +1,149 @@
"""
EA-compatible analogue to np.putmask
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.cast import infer_dtype_from
from pandas.core.dtypes.common import is_list_like
from pandas.core.arrays import ExtensionArray
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
from pandas import MultiIndex
def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
"""
ExtensionArray-compatible implementation of np.putmask. The main
difference is we do not handle repeating or truncating like numpy.
Parameters
----------
values: np.ndarray or ExtensionArray
mask : np.ndarray[bool]
We assume extract_bool_array has already been called.
value : Any
"""
if (
not isinstance(values, np.ndarray)
or (values.dtype == object and not lib.is_scalar(value))
# GH#43424: np.putmask raises TypeError if we cannot cast between types with
# rule = "safe", a stricter guarantee we may not have here
or (
isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
)
):
# GH#19266 using np.putmask gives unexpected results with listlike value
# along with object dtype
if is_list_like(value) and len(value) == len(values):
values[mask] = value[mask]
else:
values[mask] = value
else:
# GH#37833 np.putmask is more performant than __setitem__
np.putmask(values, mask, value)
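# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). Scalar values on a numeric
# array go through np.putmask, while listlike values on object dtype are
# assigned elementwise at the masked positions only.
def _demo_putmask_inplace() -> None:
    values = np.array([1.0, 2.0, 3.0, 4.0])
    putmask_inplace(values, np.array([False, True, False, True]), 0.0)
    assert values.tolist() == [1.0, 0.0, 3.0, 0.0]
    obj = np.array(["a", "b", "c"], dtype=object)
    repl = np.array(["x", "y", "z"], dtype=object)
    putmask_inplace(obj, np.array([True, False, True]), repl)
    # only the masked slots are replaced, elementwise
    assert obj.tolist() == ["x", "b", "z"]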
def putmask_without_repeat(
values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
) -> None:
"""
np.putmask will truncate or repeat if `new` is a listlike with
len(new) != len(values). We require an exact match.
Parameters
----------
values : np.ndarray
mask : np.ndarray[bool]
new : Any
"""
if getattr(new, "ndim", 0) >= 1:
new = new.astype(values.dtype, copy=False)
# TODO: this prob needs some better checking for 2D cases
nlocs = mask.sum()
if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
shape = np.shape(new)
# np.shape compat in case setitem_datetimelike_compat
# changed the arraylike to a list, e.g. test_where_dt64_2d
if nlocs == shape[-1]:
# GH#30567
# If the length of ``new`` is less than the length of ``values``,
# `np.putmask` would first repeat the ``new`` array and then
# assign the masked values, hence producing an incorrect result.
# `np.place`, on the other hand, uses the ``new`` values as they are
# to fill the masked locations of ``values``.
np.place(values, mask, new)
# i.e. values[mask] = new
elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
np.putmask(values, mask, new)
else:
raise ValueError("cannot assign mismatch length to masked array")
else:
np.putmask(values, mask, new)
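# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). ``new`` must line up with the
# masked positions (or the full mask); mismatched lengths raise instead of
# being silently repeated the way bare np.putmask would.
def _demo_putmask_without_repeat() -> None:
    values = np.array([1, 2, 3, 4], dtype=np.int64)
    mask = np.array([True, False, True, False])
    putmask_without_repeat(values, mask, np.array([10, 30], dtype=np.int64))
    assert values.tolist() == [10, 2, 30, 4]
    raised = False
    try:
        putmask_without_repeat(values, mask, np.array([10, 20, 30], dtype=np.int64))
    except ValueError:
        raised = True
    assert raised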
def validate_putmask(
values: ArrayLike | MultiIndex, mask: np.ndarray
) -> tuple[npt.NDArray[np.bool_], bool]:
"""
Validate mask and check if this putmask operation is a no-op.
"""
mask = extract_bool_array(mask)
if mask.shape != values.shape:
raise ValueError("putmask: mask and data must be the same size")
noop = not mask.any()
return mask, noop
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
# Except for BooleanArray, this is equivalent to just
# np.asarray(mask, dtype=bool)
mask = mask.to_numpy(dtype=bool, na_value=False)
mask = np.asarray(mask, dtype=bool)
return mask
def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
"""
Parameters
----------
values : np.ndarray
num_set : int
For putmask, this is mask.sum()
other : Any
"""
if values.dtype == object:
dtype, _ = infer_dtype_from(other)
if lib.is_np_dtype(dtype, "mM"):
# https://github.com/numpy/numpy/issues/12550
# timedelta64 will incorrectly cast to int
if not is_list_like(other):
other = [other] * num_set
else:
other = list(other)
return other
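# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). For object-dtype targets a
# datetime-like scalar is expanded to a list so np.putmask cannot cast it to
# an integer (see the numpy issue referenced above).
def _demo_setitem_datetimelike_compat() -> None:
    values = np.array(["x", "y", "z"], dtype=object)
    other = np.datetime64("2024-01-01", "ns")
    result = setitem_datetimelike_compat(values, 2, other)
    assert isinstance(result, list) and len(result) == 2
    # non-datetimelike scalars are passed through unchanged
    assert setitem_datetimelike_compat(values, 2, 1.5) == 1.5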

View File

@ -0,0 +1,226 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def quantile_compat(
values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
) -> ArrayLike:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray or ExtensionArray
qs : np.ndarray[float64]
interpolation : str
Returns
-------
np.ndarray or ExtensionArray
"""
if isinstance(values, np.ndarray):
fill_value = na_value_for_dtype(values.dtype, compat=False)
mask = isna(values)
return quantile_with_mask(values, mask, fill_value, qs, interpolation)
else:
return values._quantile(qs, interpolation)
def quantile_with_mask(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
fill_value,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> np.ndarray:
"""
Compute the quantiles of the given values for each quantile in `qs`.
Parameters
----------
values : np.ndarray
For ExtensionArray, this is _values_for_factorize()[0]
mask : np.ndarray[bool]
mask = isna(values)
For ExtensionArray, this is computed before calling _values_for_factorize
fill_value : Scalar
The value used to fill NA entries
For ExtensionArray, this is _values_for_factorize()[1]
qs : np.ndarray[float64]
interpolation : str
Type of interpolation
Returns
-------
np.ndarray
Notes
-----
Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
has been called on _values_for_factorize()[0]
Quantile is computed along axis=1.
"""
assert values.shape == mask.shape
if values.ndim == 1:
# unsqueeze, operate, re-squeeze
values = np.atleast_2d(values)
mask = np.atleast_2d(mask)
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
return res_values[0]
assert values.ndim == 2
is_empty = values.shape[1] == 0
if is_empty:
# create the array of na_values
# 2d len(values) * len(qs)
flat = np.array([fill_value] * len(qs))
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
else:
result = _nanpercentile(
values,
qs * 100.0,
na_value=fill_value,
mask=mask,
interpolation=interpolation,
)
result = np.asarray(result)
result = result.T
return result
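# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). The input is 2D with
# quantiles taken along axis=1; masked entries are dropped per row and the
# result comes back with shape (len(values), len(qs)).
def _demo_quantile_with_mask() -> None:
    values = np.array([[1.0, 2.0, 3.0, 100.0]])
    mask = np.array([[False, False, False, True]])
    qs = np.array([0.5])
    result = quantile_with_mask(values, mask, np.nan, qs, interpolation="linear")
    # the masked 100.0 is ignored, so the row median is 2.0
    assert result.shape == (1, 1)
    assert result[0, 0] == 2.0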
def _nanpercentile_1d(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
qs: npt.NDArray[np.float64],
na_value: Scalar,
interpolation: str,
) -> Scalar | np.ndarray:
"""
Wrapper for np.percentile that skips missing values, specialized to
1-dimensional case.
Parameters
----------
values : array over which to find quantiles
mask : ndarray[bool]
locations in values that should be considered missing
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
interpolation : str
Returns
-------
quantiles : scalar or array
"""
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]
if len(values) == 0:
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
# with values.dtype=int64 see test_quantile_empty
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)
return np.percentile(
values,
qs,
# error: No overload variant of "percentile" matches argument
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
# , "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)
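# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). Note that callers pass
# percentages (qs * 100.0), matching np.percentile's convention.
def _demo_nanpercentile_1d() -> None:
    values = np.array([1.0, 2.0, 3.0, 4.0])
    mask = np.array([False, True, False, False])
    out = _nanpercentile_1d(
        values, mask, np.array([25.0, 75.0]), na_value=np.nan, interpolation="linear"
    )
    # quantiles are computed over the unmasked values [1, 3, 4]
    assert out.tolist() == [2.0, 3.5]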
def _nanpercentile(
values: np.ndarray,
qs: npt.NDArray[np.float64],
*,
na_value,
mask: npt.NDArray[np.bool_],
interpolation: str,
):
"""
Wrapper for np.percentile that skips missing values.
Parameters
----------
values : np.ndarray[ndim=2] over which to find quantiles
qs : np.ndarray[float64] of quantile indices to find
na_value : scalar
value to return for empty or all-null values
mask : np.ndarray[bool]
locations in values that should be considered missing
interpolation : str
Returns
-------
quantiles : scalar or array
"""
if values.dtype.kind in "mM":
# need to cast to integer to avoid rounding errors in numpy
result = _nanpercentile(
values.view("i8"),
qs=qs,
na_value=na_value.view("i8"),
mask=mask,
interpolation=interpolation,
)
# Note: we have to do `astype` and not view because in general we
# have float result at this point, not i8
return result.astype(values.dtype)
if mask.any():
# Caller is responsible for ensuring the mask and values shapes match
assert mask.shape == values.shape
result = [
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
# preserve itemsize
result = np.asarray(result, dtype=values.dtype).T
else:
result = np.asarray(result).T
if (
result.dtype != values.dtype
and not mask.all()
and (result == result.astype(values.dtype, copy=False)).all()
):
# mask.all() will never get cast back to int
# e.g. values is integer dtype and result is floating dtype,
# only cast back to integer dtype if result values are all-integer.
result = result.astype(values.dtype, copy=False)
return result
else:
return np.percentile(
values,
qs,
axis=1,
# error: No overload variant of "percentile" matches argument types
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
# "int", "Dict[str, str]" [call-overload]
method=interpolation, # type: ignore[call-overload]
)

View File

@ -0,0 +1,152 @@
"""
Methods used by Block.replace and related methods.
"""
from __future__ import annotations
import operator
import re
from re import Pattern
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas.core.dtypes.common import (
is_bool,
is_re,
is_re_compilable,
)
from pandas.core.dtypes.missing import isna
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Scalar,
npt,
)
def should_use_regex(regex: bool, to_replace: Any) -> bool:
"""
Decide whether to treat `to_replace` as a regular expression.
"""
if is_re(to_replace):
regex = True
regex = regex and is_re_compilable(to_replace)
# Don't use regex if the pattern is empty.
regex = regex and re.compile(to_replace).pattern != ""
return regex
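# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical).
def _demo_should_use_regex() -> None:
    # a pre-compiled pattern always opts in to regex matching
    assert should_use_regex(False, re.compile(r"\d+"))
    # a plain string is only treated as a regex when regex=True ...
    assert not should_use_regex(False, "abc")
    assert should_use_regex(True, "abc")
    # ... and an empty pattern never uses the regex path
    assert not should_use_regex(True, "")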
def compare_or_regex_search(
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
) -> ArrayLike:
"""
Compare two array-like inputs of the same shape or two scalar values
Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.
Parameters
----------
a : array-like
b : scalar or regex pattern
regex : bool
mask : np.ndarray[bool]
Returns
-------
mask : array-like of bool
"""
if isna(b):
return ~mask
def _check_comparison_types(
result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_bool(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]
type_names[0] = f"ndarray(dtype={a.dtype})"
raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)
if not regex or not should_use_regex(regex, b):
# TODO: should use missing.mask_missing?
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)
# GH#32621 use mask to avoid comparing to NAs
if isinstance(a, np.ndarray):
a = a[mask]
result = op(a)
if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ from that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
np.place(tmp, mask, result)
result = tmp
_check_comparison_types(result, a, b)
return result
def replace_regex(
values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
) -> None:
"""
Parameters
----------
values : ArrayLike
Object dtype.
rx : re.Pattern
value : Any
mask : np.ndarray[bool], optional
Notes
-----
Alters values in-place.
"""
# deal with replacing values with objects (strings) that match but
# whose replacement is not a string (numeric, nan, object)
if isna(value) or not isinstance(value, str):
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return value if rx.search(s) is not None else s
else:
return s
else:
# value is guaranteed to be a string here; s can be either a string
# or null, and if it's null it gets returned as-is
def re_replacer(s):
if is_re(rx) and isinstance(s, str):
return rx.sub(value, s)
else:
return s
f = np.vectorize(re_replacer, otypes=[np.object_])
if mask is None:
values[:] = f(values)
else:
values[mask] = f(values[mask])
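# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical): the replacement is applied
# in place and non-string entries pass through untouched.
def _demo_replace_regex() -> None:
    values = np.array(["foo-1", "bar-2", None], dtype=object)
    replace_regex(values, re.compile(r"-\d"), "", mask=None)
    assert values.tolist() == ["foo", "bar", None]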

View File

@ -0,0 +1,594 @@
from __future__ import annotations
import functools
from typing import (
TYPE_CHECKING,
cast,
overload,
)
import numpy as np
from pandas._libs import (
algos as libalgos,
lib,
)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
)
from pandas.core.dtypes.missing import na_value_for_dtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
AxisInt,
npt,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.base import ExtensionArray
@overload
def take_nd(
arr: np.ndarray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> np.ndarray:
...
@overload
def take_nd(
arr: ExtensionArray,
indexer,
axis: AxisInt = ...,
fill_value=...,
allow_fill: bool = ...,
) -> ArrayLike:
...
def take_nd(
arr: ArrayLike,
indexer,
axis: AxisInt = 0,
fill_value=lib.no_default,
allow_fill: bool = True,
) -> ArrayLike:
"""
Specialized Cython take which sets NaN values in one pass
This dispatches to ``take`` defined on ExtensionArrays.
Note: this function assumes that the indexer is a valid(ated) indexer with
no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
1-D array of indices to take; subarrays corresponding to -1 value
indices are filled with fill_value
axis : int, default 0
Axis to take from
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
Returns
-------
subarray : np.ndarray or ExtensionArray
May be the same type as the input, or cast to an ndarray.
"""
if fill_value is lib.no_default:
fill_value = na_value_for_dtype(arr.dtype, compat=False)
elif lib.is_np_dtype(arr.dtype, "mM"):
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if arr.dtype != dtype:
# EA.take is strict about returning a new object of the same type
# so for that case cast upfront
arr = arr.astype(dtype)
if not isinstance(arr, np.ndarray):
# i.e. ExtensionArray;
# this branch also catches DatetimeArray and TimedeltaArray
if not is_1d_only_ea_dtype(arr.dtype):
# i.e. DatetimeArray, TimedeltaArray
arr = cast("NDArrayBackedExtensionArray", arr)
return arr.take(
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
)
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
arr = np.asarray(arr)
return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
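# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). With the default fill value,
# an int64 array is promoted to float64 so the -1 positions can hold NaN.
def _demo_take_nd() -> None:
    arr = np.array([10, 20, 30], dtype=np.int64)
    out = take_nd(arr, np.array([0, 2, -1], dtype=np.intp))
    assert out.dtype == np.float64
    assert out[0] == 10 and out[1] == 30 and np.isnan(out[2])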
def _take_nd_ndarray(
arr: np.ndarray,
indexer: npt.NDArray[np.intp] | None,
axis: AxisInt,
fill_value,
allow_fill: bool,
) -> np.ndarray:
if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.intp)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = ensure_platform_int(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, allow_fill
)
flip_order = False
if arr.ndim == 2 and arr.flags.f_contiguous:
flip_order = True
if flip_order:
arr = arr.T
axis = arr.ndim - axis - 1
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape_ = list(arr.shape)
out_shape_[axis] = len(indexer)
out_shape = tuple(out_shape_)
if arr.flags.f_contiguous and axis == arr.ndim - 1:
# minor tweak that can make an order-of-magnitude difference
# for dataframes initialized directly from 2-d ndarrays
# (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
# f-contiguous transpose)
out = np.empty(out_shape, dtype=dtype, order="F")
else:
out = np.empty(out_shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
if flip_order:
out = out.T
return out
def take_1d(
arr: ArrayLike,
indexer: npt.NDArray[np.intp],
fill_value=None,
allow_fill: bool = True,
mask: npt.NDArray[np.bool_] | None = None,
) -> ArrayLike:
"""
Specialized version for 1D arrays. Differences compared to `take_nd`:
- Assumes input array has already been converted to numpy array / EA
- Assumes indexer is already guaranteed to be intp dtype ndarray
- Only works for 1D arrays
To ensure the lowest possible overhead.
Note: similarly to `take_nd`, this function assumes that the indexer is
a valid(ated) indexer with no out of bound indices.
Parameters
----------
arr : np.ndarray or ExtensionArray
Input array.
indexer : ndarray
1-D array of indices to take (validated indices, intp dtype).
fill_value : any, default np.nan
Fill value to replace -1 values with
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
mask : np.ndarray, optional, default None
If `allow_fill` is True, and the mask (where indexer == -1) is already
known, it can be passed to avoid recomputation.
"""
if not isinstance(arr, np.ndarray):
# ExtensionArray -> dispatch to their method
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
if not allow_fill:
return arr.take(indexer)
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, fill_value, True, mask
)
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out = np.empty(indexer.shape, dtype=dtype)
func = _get_take_nd_function(
arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
)
func(arr, indexer, out, fill_value)
return out
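# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). take_1d trades safety checks
# for speed: the indexer must already be a validated intp array.
def _demo_take_1d() -> None:
    arr = np.array([1.5, 2.5, 3.5])
    out = take_1d(arr, np.array([2, 0, -1], dtype=np.intp), fill_value=np.nan)
    assert out[0] == 3.5 and out[1] == 1.5 and np.isnan(out[2])
    # with allow_fill=False a plain ndarray.take is used (no -1 handling)
    out2 = take_1d(arr, np.array([1, 1, 0], dtype=np.intp), allow_fill=False)
    assert out2.tolist() == [2.5, 2.5, 1.5]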
def take_2d_multi(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
fill_value=np.nan,
) -> np.ndarray:
"""
Specialized Cython take which sets NaN values in one pass.
"""
# This is only called from one place in DataFrame._reindex_multi,
# so we know indexer is well-behaved.
assert indexer is not None
assert indexer[0] is not None
assert indexer[1] is not None
row_idx, col_idx = indexer
row_idx = ensure_platform_int(row_idx)
col_idx = ensure_platform_int(col_idx)
indexer = row_idx, col_idx
mask_info = None
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
mask_info = (row_mask, col_mask), (row_needs, col_needs)
if not (row_needs or col_needs):
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
# at this point, it's guaranteed that dtype can hold both the arr values
# and the fill_value
out_shape = len(row_idx), len(col_idx)
out = np.empty(out_shape, dtype=dtype)
func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
if func is None and arr.dtype != out.dtype:
func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
if func is not None:
func = _convert_wrapper(func, out.dtype)
if func is not None:
func(arr, indexer, out=out, fill_value=fill_value)
else:
# test_reindex_multi
_take_2d_multi_object(
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
)
return out
@functools.lru_cache
def _get_take_nd_function_cached(
ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
):
"""
Part of _get_take_nd_function below that doesn't need `mask_info` and thus
can be cached (mask_info potentially contains a numpy ndarray which is not
hashable and thus cannot be used as argument for cached function).
"""
tup = (arr_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
return func
# We get here with string, uint, float16, and complex dtypes that could
# potentially be handled in algos_take_helper.
# Also a couple with (M8[ns], object) and (m8[ns], object)
tup = (out_dtype.name, out_dtype.name)
if ndim == 1:
func = _take_1d_dict.get(tup, None)
elif ndim == 2:
if axis == 0:
func = _take_2d_axis0_dict.get(tup, None)
else:
func = _take_2d_axis1_dict.get(tup, None)
if func is not None:
func = _convert_wrapper(func, out_dtype)
return func
return None
def _get_take_nd_function(
ndim: int,
arr_dtype: np.dtype,
out_dtype: np.dtype,
axis: AxisInt = 0,
mask_info=None,
):
"""
Get the appropriate "take" implementation for the given dimension, axis
and dtypes.
"""
func = None
if ndim <= 2:
# for this part we don't need `mask_info` -> use the cached algo lookup
func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
if func is None:
def func(arr, indexer, out, fill_value=np.nan) -> None:
indexer = ensure_platform_int(indexer)
_take_nd_object(
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
)
return func
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if arr_dtype is not None:
arr = arr.view(arr_dtype)
if out_dtype is not None:
out = out.view(out_dtype)
if fill_wrap is not None:
# FIXME: if we get here with dt64/td64 we need to be sure we have
# matching resos
if fill_value.dtype.kind == "m":
fill_value = fill_value.astype("m8[ns]")
else:
fill_value = fill_value.astype("M8[ns]")
fill_value = fill_wrap(fill_value)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
def _convert_wrapper(f, conv_dtype):
def wrapper(
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
) -> None:
if conv_dtype == object:
# GH#39755 avoid casting dt64/td64 to integers
arr = ensure_wrapped_if_datetimelike(arr)
arr = arr.astype(conv_dtype)
f(arr, indexer, out, fill_value=fill_value)
return wrapper
_take_1d_dict = {
("int8", "int8"): libalgos.take_1d_int8_int8,
("int8", "int32"): libalgos.take_1d_int8_int32,
("int8", "int64"): libalgos.take_1d_int8_int64,
("int8", "float64"): libalgos.take_1d_int8_float64,
("int16", "int16"): libalgos.take_1d_int16_int16,
("int16", "int32"): libalgos.take_1d_int16_int32,
("int16", "int64"): libalgos.take_1d_int16_int64,
("int16", "float64"): libalgos.take_1d_int16_float64,
("int32", "int32"): libalgos.take_1d_int32_int32,
("int32", "int64"): libalgos.take_1d_int32_int64,
("int32", "float64"): libalgos.take_1d_int32_float64,
("int64", "int64"): libalgos.take_1d_int64_int64,
("int64", "float64"): libalgos.take_1d_int64_float64,
("float32", "float32"): libalgos.take_1d_float32_float32,
("float32", "float64"): libalgos.take_1d_float32_float64,
("float64", "float64"): libalgos.take_1d_float64_float64,
("object", "object"): libalgos.take_1d_object_object,
("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
),
}
_take_2d_axis0_dict = {
("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
("object", "object"): libalgos.take_2d_axis0_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis0_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_axis1_dict = {
("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
("object", "object"): libalgos.take_2d_axis1_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_axis1_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
_take_2d_multi_dict = {
("int8", "int8"): libalgos.take_2d_multi_int8_int8,
("int8", "int32"): libalgos.take_2d_multi_int8_int32,
("int8", "int64"): libalgos.take_2d_multi_int8_int64,
("int8", "float64"): libalgos.take_2d_multi_int8_float64,
("int16", "int16"): libalgos.take_2d_multi_int16_int16,
("int16", "int32"): libalgos.take_2d_multi_int16_int32,
("int16", "int64"): libalgos.take_2d_multi_int16_int64,
("int16", "float64"): libalgos.take_2d_multi_int16_float64,
("int32", "int32"): libalgos.take_2d_multi_int32_int32,
("int32", "int64"): libalgos.take_2d_multi_int32_int64,
("int32", "float64"): libalgos.take_2d_multi_int32_float64,
("int64", "int64"): libalgos.take_2d_multi_int64_int64,
("int64", "float64"): libalgos.take_2d_multi_int64_float64,
("float32", "float32"): libalgos.take_2d_multi_float32_float32,
("float32", "float64"): libalgos.take_2d_multi_float32_float64,
("float64", "float64"): libalgos.take_2d_multi_float64_float64,
("object", "object"): libalgos.take_2d_multi_object_object,
("bool", "bool"): _view_wrapper(
libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
),
("bool", "object"): _view_wrapper(
libalgos.take_2d_multi_bool_object, np.uint8, None
),
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
),
}
def _take_nd_object(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
out: np.ndarray,
axis: AxisInt,
fill_value,
mask_info,
) -> None:
if mask_info is not None:
mask, needs_masking = mask_info
else:
mask = indexer == -1
needs_masking = mask.any()
if arr.dtype != out.dtype:
arr = arr.astype(out.dtype)
if arr.shape[axis] > 0:
arr.take(indexer, axis=axis, out=out)
if needs_masking:
outindexer = [slice(None)] * arr.ndim
outindexer[axis] = mask
out[tuple(outindexer)] = fill_value
def _take_2d_multi_object(
arr: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value,
mask_info,
) -> None:
# this is not ideal, performance-wise, but it's better than raising
# an exception (best to optimize in Cython to avoid getting here)
row_idx, col_idx = indexer # both np.intp
if mask_info is not None:
(row_mask, col_mask), (row_needs, col_needs) = mask_info
else:
row_mask = row_idx == -1
col_mask = col_idx == -1
row_needs = row_mask.any()
col_needs = col_mask.any()
if fill_value is not None:
if row_needs:
out[row_mask, :] = fill_value
if col_needs:
out[:, col_mask] = fill_value
for i, u_ in enumerate(row_idx):
if u_ != -1:
for j, v in enumerate(col_idx):
if v != -1:
out[i, j] = arr[u_, v]
def _take_preprocess_indexer_and_fill_value(
arr: np.ndarray,
indexer: npt.NDArray[np.intp],
fill_value,
allow_fill: bool,
mask: npt.NDArray[np.bool_] | None = None,
):
mask_info: tuple[np.ndarray | None, bool] | None = None
if not allow_fill:
dtype, fill_value = arr.dtype, arr.dtype.type()
mask_info = None, False
else:
# check for promotion based on types only (do this first because
# it's faster than computing a mask)
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
if dtype != arr.dtype:
# check if promotion is actually required based on indexer
if mask is not None:
needs_masking = True
else:
mask = indexer == -1
needs_masking = bool(mask.any())
mask_info = mask, needs_masking
if not needs_masking:
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()
return dtype, fill_value, mask_info

View File

@ -0,0 +1,50 @@
"""
transforms.py is for shape-preserving functions.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from pandas._typing import (
AxisInt,
Scalar,
)
def shift(
values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar
) -> np.ndarray:
new_values = values
if periods == 0 or values.size == 0:
return new_values.copy()
# make sure array sent to np.roll is c_contiguous
f_ordered = values.flags.f_contiguous
if f_ordered:
new_values = new_values.T
axis = new_values.ndim - axis - 1
if new_values.size:
new_values = np.roll(
new_values,
np.intp(periods),
axis=axis,
)
axis_indexer = [slice(None)] * values.ndim
if periods > 0:
axis_indexer[axis] = slice(None, periods)
else:
axis_indexer[axis] = slice(periods, None)
new_values[tuple(axis_indexer)] = fill_value
# restore original order
if f_ordered:
new_values = new_values.T
return new_values
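# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical): values are rolled along the
# requested axis and the vacated slots are overwritten with fill_value.
def _demo_shift() -> None:
    values = np.arange(6, dtype=np.float64).reshape(2, 3)
    out = shift(values, periods=1, axis=1, fill_value=np.nan)
    assert np.isnan(out[0, 0])
    assert out[0, 1] == 0.0 and out[0, 2] == 1.0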

View File

@ -0,0 +1,530 @@
"""
Methods that can be shared by many array-like classes or subclasses:
Series
Index
ExtensionArray
"""
from __future__ import annotations
import operator
from typing import Any
import numpy as np
from pandas._libs import lib
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
from pandas.core.dtypes.generic import ABCNDFrame
from pandas.core import roperator
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer
REDUCTION_ALIASES = {
"maximum": "max",
"minimum": "min",
"add": "sum",
"multiply": "prod",
}
class OpsMixin:
# -------------------------------------------------------------
# Comparisons
def _cmp_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__eq__")
def __eq__(self, other):
return self._cmp_method(other, operator.eq)
@unpack_zerodim_and_defer("__ne__")
def __ne__(self, other):
return self._cmp_method(other, operator.ne)
@unpack_zerodim_and_defer("__lt__")
def __lt__(self, other):
return self._cmp_method(other, operator.lt)
@unpack_zerodim_and_defer("__le__")
def __le__(self, other):
return self._cmp_method(other, operator.le)
@unpack_zerodim_and_defer("__gt__")
def __gt__(self, other):
return self._cmp_method(other, operator.gt)
@unpack_zerodim_and_defer("__ge__")
def __ge__(self, other):
return self._cmp_method(other, operator.ge)
# -------------------------------------------------------------
# Logical Methods
def _logical_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__and__")
def __and__(self, other):
return self._logical_method(other, operator.and_)
@unpack_zerodim_and_defer("__rand__")
def __rand__(self, other):
return self._logical_method(other, roperator.rand_)
@unpack_zerodim_and_defer("__or__")
def __or__(self, other):
return self._logical_method(other, operator.or_)
@unpack_zerodim_and_defer("__ror__")
def __ror__(self, other):
return self._logical_method(other, roperator.ror_)
@unpack_zerodim_and_defer("__xor__")
def __xor__(self, other):
return self._logical_method(other, operator.xor)
@unpack_zerodim_and_defer("__rxor__")
def __rxor__(self, other):
return self._logical_method(other, roperator.rxor)
# -------------------------------------------------------------
# Arithmetic Methods
def _arith_method(self, other, op):
return NotImplemented
@unpack_zerodim_and_defer("__add__")
def __add__(self, other):
"""
Get Addition of DataFrame and other, column-wise.
Equivalent to ``DataFrame.add(other)``.
Parameters
----------
other : scalar, sequence, Series, dict or DataFrame
Object to be added to the DataFrame.
Returns
-------
DataFrame
The result of adding ``other`` to DataFrame.
See Also
--------
DataFrame.add : Add a DataFrame and another object, with option for index-
or column-oriented addition.
Examples
--------
>>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
... index=['elk', 'moose'])
>>> df
height weight
elk 1.5 500
moose 2.6 800
Adding a scalar affects all rows and columns.
>>> df[['height', 'weight']] + 1.5
height weight
elk 3.0 501.5
moose 4.1 801.5
Each element of a list is added to a column of the DataFrame, in order.
>>> df[['height', 'weight']] + [0.5, 1.5]
height weight
elk 2.0 501.5
moose 3.1 801.5
Keys of a dictionary are aligned to the DataFrame, based on column names;
each value in the dictionary is added to the corresponding column.
>>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
height weight
elk 2.0 501.5
moose 3.1 801.5
When `other` is a :class:`Series`, the index of `other` is aligned with the
columns of the DataFrame.
>>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
>>> df[['height', 'weight']] + s1
height weight
elk 3.0 500.5
moose 4.1 800.5
Even when the index of `other` is the same as the index of the DataFrame,
the :class:`Series` will not be reoriented. If index-wise alignment is desired,
:meth:`DataFrame.add` should be used with `axis='index'`.
>>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
>>> df[['height', 'weight']] + s2
elk height moose weight
elk NaN NaN NaN NaN
moose NaN NaN NaN NaN
>>> df[['height', 'weight']].add(s2, axis='index')
height weight
elk 2.0 500.5
moose 4.1 801.5
When `other` is a :class:`DataFrame`, both columns names and the
index are aligned.
>>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
... index=['elk', 'moose', 'deer'])
>>> df[['height', 'weight']] + other
height weight
deer NaN NaN
elk 1.7 NaN
moose 3.0 NaN
"""
return self._arith_method(other, operator.add)
@unpack_zerodim_and_defer("__radd__")
def __radd__(self, other):
return self._arith_method(other, roperator.radd)
@unpack_zerodim_and_defer("__sub__")
def __sub__(self, other):
return self._arith_method(other, operator.sub)
@unpack_zerodim_and_defer("__rsub__")
def __rsub__(self, other):
return self._arith_method(other, roperator.rsub)
@unpack_zerodim_and_defer("__mul__")
def __mul__(self, other):
return self._arith_method(other, operator.mul)
@unpack_zerodim_and_defer("__rmul__")
def __rmul__(self, other):
return self._arith_method(other, roperator.rmul)
@unpack_zerodim_and_defer("__truediv__")
def __truediv__(self, other):
return self._arith_method(other, operator.truediv)
@unpack_zerodim_and_defer("__rtruediv__")
def __rtruediv__(self, other):
return self._arith_method(other, roperator.rtruediv)
@unpack_zerodim_and_defer("__floordiv__")
def __floordiv__(self, other):
return self._arith_method(other, operator.floordiv)
@unpack_zerodim_and_defer("__rfloordiv")
def __rfloordiv__(self, other):
return self._arith_method(other, roperator.rfloordiv)
@unpack_zerodim_and_defer("__mod__")
def __mod__(self, other):
return self._arith_method(other, operator.mod)
@unpack_zerodim_and_defer("__rmod__")
def __rmod__(self, other):
return self._arith_method(other, roperator.rmod)
@unpack_zerodim_and_defer("__divmod__")
def __divmod__(self, other):
return self._arith_method(other, divmod)
@unpack_zerodim_and_defer("__rdivmod__")
def __rdivmod__(self, other):
return self._arith_method(other, roperator.rdivmod)
@unpack_zerodim_and_defer("__pow__")
def __pow__(self, other):
return self._arith_method(other, operator.pow)
@unpack_zerodim_and_defer("__rpow__")
def __rpow__(self, other):
return self._arith_method(other, roperator.rpow)
# -----------------------------------------------------------------------------
# Helpers to implement __array_ufunc__
def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
"""
Compatibility with numpy ufuncs.
See also
--------
numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
"""
from pandas.core.frame import (
DataFrame,
Series,
)
from pandas.core.generic import NDFrame
from pandas.core.internals import (
ArrayManager,
BlockManager,
)
cls = type(self)
kwargs = _standardize_out_kwarg(**kwargs)
# for binary ops, use our custom dunder methods
result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# Determine if we should defer.
no_defer = (
np.ndarray.__array_ufunc__,
cls.__array_ufunc__,
)
for item in inputs:
higher_priority = (
hasattr(item, "__array_priority__")
and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
hasattr(item, "__array_ufunc__")
and type(item).__array_ufunc__ not in no_defer
and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented
# align all the inputs.
types = tuple(type(x) for x in inputs)
alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
if len(alignable) > 1:
# This triggers alignment.
# At the moment, there aren't any ufuncs with more than two inputs
# so this ends up just being x1.index | x2.index, but we write
# it to handle *args.
set_types = set(types)
if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
# We currently don't handle ufunc(DataFrame, Series)
# well. Previously this raised an internal ValueError. We might
# support it someday, so raise a NotImplementedError.
raise NotImplementedError(
f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
)
axes = self.axes
for obj in alignable[1:]:
# this relies on the fact that we aren't handling mixed
# series / frame ufuncs.
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
axes[i] = ax1.union(ax2)
reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
inputs = tuple(
x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
for x, t in zip(inputs, types)
)
else:
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
if self.ndim == 1:
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
name = names[0] if len(set(names)) == 1 else None
reconstruct_kwargs = {"name": name}
else:
reconstruct_kwargs = {}
def reconstruct(result):
if ufunc.nout > 1:
# np.modf, np.frexp, np.divmod
return tuple(_reconstruct(x) for x in result)
return _reconstruct(result)
def _reconstruct(result):
if lib.is_scalar(result):
return result
if result.ndim != self.ndim:
if method == "outer":
raise NotImplementedError
return result
if isinstance(result, (BlockManager, ArrayManager)):
# we went through BlockManager.apply e.g. np.sqrt
result = self._constructor_from_mgr(result, axes=result.axes)
else:
# we converted an array, lost our axes
result = self._constructor(
result, **reconstruct_axes, **reconstruct_kwargs, copy=False
)
# TODO: When we support multiple values in __finalize__, this
# should pass alignable to `__finalize__` instead of self.
# Then `np.add(a, b)` would consider attrs from both a and b
# when a and b are NDFrames.
if len(alignable) == 1:
result = result.__finalize__(self)
return result
if "out" in kwargs:
# e.g. test_multiindex_get_loc
result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
return reconstruct(result)
if method == "reduce":
# e.g. test.series.test_ufunc.test_reduce
result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
if result is not NotImplemented:
return result
# We still get here with kwargs `axis` for e.g. np.maximum.accumulate
# and `dtype` and `keepdims` for np.ptp
if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
# Just give up on preserving types in the complex case.
# In theory we could preserve them in these cases:
# * nout>1 is doable if BlockManager.apply took nout and
# returned a Tuple[BlockManager].
# * len(inputs) > 1 is doable when we know that we have
# aligned blocks / dtypes.
# e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
inputs = tuple(np.asarray(x) for x in inputs)
# Note: we can't use default_array_ufunc here bc reindexing means
# that `self` may not be among `inputs`
result = getattr(ufunc, method)(*inputs, **kwargs)
elif self.ndim == 1:
# ufunc(series, ...)
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
else:
# ufunc(dataframe)
if method == "__call__" and not kwargs:
# for np.<ufunc>(..) calls
# kwargs cannot necessarily be handled block-by-block, so only
# take this path if there are no kwargs
mgr = inputs[0]._mgr
result = mgr.apply(getattr(ufunc, method))
else:
# otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
# Those can have an axis keyword and thus can't be called block-by-block
result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
# e.g. np.negative (only one reached), with "where" and "out" in kwargs
result = reconstruct(result)
return result
def _standardize_out_kwarg(**kwargs) -> dict:
"""
If kwargs contain "out1" and "out2", replace that with a tuple "out"
np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
`out1=out1, out2=out2`
"""
if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
out1 = kwargs.pop("out1")
out2 = kwargs.pop("out2")
out = (out1, out2)
kwargs["out"] = out
return kwargs
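# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical).
def _demo_standardize_out_kwarg() -> None:
    out1 = np.empty(3)
    out2 = np.empty(3)
    kwargs = _standardize_out_kwarg(out1=out1, out2=out2)
    # the pair is repackaged under a single "out" tuple
    assert kwargs["out"][0] is out1 and kwargs["out"][1] is out2
    assert "out1" not in kwargs and "out2" not in kwargs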
def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
If we have an `out` keyword, then call the ufunc without `out` and then
set the result into the given `out`.
"""
# Note: we assume _standardize_out_kwarg has already been called.
out = kwargs.pop("out")
where = kwargs.pop("where", None)
result = getattr(ufunc, method)(*inputs, **kwargs)
if result is NotImplemented:
return NotImplemented
if isinstance(result, tuple):
# i.e. np.divmod, np.modf, np.frexp
if not isinstance(out, tuple) or len(out) != len(result):
raise NotImplementedError
for arr, res in zip(out, result):
_assign_where(arr, res, where)
return out
if isinstance(out, tuple):
if len(out) == 1:
out = out[0]
else:
raise NotImplementedError
_assign_where(out, result, where)
return out
def _assign_where(out, result, where) -> None:
"""
Set a ufunc result into 'out', masking with a 'where' argument if necessary.
"""
if where is None:
# no 'where' arg passed to ufunc
out[:] = result
else:
np.putmask(out, where, result)
def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Fallback to the behavior we would get if we did not define __array_ufunc__.
Notes
-----
We are assuming that `self` is among `inputs`.
"""
if not any(x is self for x in inputs):
raise NotImplementedError
new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
return getattr(ufunc, method)(*new_inputs, **kwargs)
def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
"""
Dispatch ufunc reductions to self's reduction methods.
"""
assert method == "reduce"
if len(inputs) != 1 or inputs[0] is not self:
return NotImplemented
if ufunc.__name__ not in REDUCTION_ALIASES:
return NotImplemented
method_name = REDUCTION_ALIASES[ufunc.__name__]
# NB: we are assuming that min/max represent minimum/maximum methods,
# which would not be accurate for e.g. Timestamp.min
if not hasattr(self, method_name):
return NotImplemented
if self.ndim > 1:
if isinstance(self, ABCNDFrame):
# TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
kwargs["numeric_only"] = False
if "axis" not in kwargs:
# For DataFrame reductions we don't want the default axis=0
# Note: np.min is not a ufunc, but uses array_function_dispatch,
# so calls DataFrame.min (without ever getting here) with the np.min
# default of axis=None, which DataFrame.min catches and changes to axis=0.
# np.minimum.reduce(df) gets here bc axis is not in kwargs,
# so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
kwargs["axis"] = 0
# By default, numpy's reductions do not skip NaNs, so we have to
# pass skipna=False
return getattr(self, method_name)(skipna=False, **kwargs)
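# Illustrative usage sketch (added for exposition, not part of the original
# file; the helper name below is hypothetical). pandas is imported lazily
# inside the helper to avoid a circular import at module load time.
def _demo_ufunc_dispatch() -> None:
    import pandas as pd

    s1 = pd.Series([1, 2], index=["a", "b"])
    s2 = pd.Series([10, 20], index=["b", "c"])
    # binary ufuncs on two pandas objects align indexes first
    out = np.add(s1, s2)
    assert list(out.index) == ["a", "b", "c"]
    assert out["b"] == 12
    # reduction ufuncs are rerouted to the matching pandas reduction
    assert np.minimum.reduce(pd.Series([3, 1, 2])) == 1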

View File

@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
PeriodArray,
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
__all__ = [
"ArrowExtensionArray",
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
"FloatingArray",
"IntegerArray",
"IntervalArray",
"NumpyExtensionArray",
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"TimedeltaArray",
]

View File

@ -0,0 +1,84 @@
from __future__ import annotations
from typing import Literal
import numpy as np
from pandas.compat import pa_version_under10p1
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
class ArrowStringArrayMixin:
_pa_array = None
def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
def _str_pad(
self,
width: int,
side: Literal["left", "right", "both"] = "left",
fillchar: str = " ",
):
if side == "left":
pa_pad = pc.utf8_lpad
elif side == "right":
pa_pad = pc.utf8_rpad
elif side == "both":
pa_pad = pc.utf8_center
else:
raise ValueError(
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
)
return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
def _str_get(self, i: int):
lengths = pc.utf8_length(self._pa_array)
if i >= 0:
out_of_bounds = pc.greater_equal(i, lengths)
start = i
stop = i + 1
step = 1
else:
out_of_bounds = pc.greater(-i, lengths)
start = i
stop = i - 1
step = -1
not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(
None, type=self._pa_array.type # type: ignore[attr-defined]
)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)
def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
if repl is None:
repl = ""
if start is None:
start = 0
if stop is None:
stop = np.iinfo(np.int64).max
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
def _str_capitalize(self):
return type(self)(pc.utf8_capitalize(self._pa_array))
def _str_title(self):
return type(self)(pc.utf8_title(self._pa_array))
def _str_swapcase(self):
return type(self)(pc.utf8_swapcase(self._pa_array))
def _str_removesuffix(self, suffix: str):
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
result = pc.if_else(ends_with, removed, self._pa_array)
return type(self)(result)

View File

@ -0,0 +1,547 @@
from __future__ import annotations
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Literal,
cast,
overload,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
ArrayLike,
AxisInt,
Dtype,
F,
FillnaOptions,
PositionalIndexer2D,
PositionalIndexerTuple,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
TakeIndexer,
npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_insert_loc,
)
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core import missing
from pandas.core.algorithms import (
take,
unique,
value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)
from pandas import Series
def ravel_compat(meth: F) -> F:
"""
Decorator to ravel a 2D array before passing it to a cython operation,
then reshape the result to our own shape.
"""
@wraps(meth)
def method(self, *args, **kwargs):
if self.ndim == 1:
return meth(self, *args, **kwargs)
flags = self._ndarray.flags
flat = self.ravel("K")
result = meth(flat, *args, **kwargs)
order = "F" if flags.f_contiguous else "C"
return result.reshape(self.shape, order=order)
return cast(F, method)
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
"""
ExtensionArray that is backed by a single NumPy ndarray.
"""
_ndarray: np.ndarray
# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any
def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
"""
return x
def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
raise AbstractMethodError(self)
# ------------------------------------------------------------------------
def view(self, dtype: Dtype | None = None) -> ArrayLike:
# We handle datetime64, datetime64tz, timedelta64, and period
# dtypes here. Everything else we pass through to the underlying
# ndarray.
if dtype is None or dtype is self.dtype:
return self._from_backing_data(self._ndarray)
if isinstance(dtype, type):
# we sometimes pass non-dtype objects, e.g. np.ndarray;
# pass those through to the underlying ndarray
return self._ndarray.view(dtype)
dtype = pandas_dtype(dtype)
arr = self._ndarray
if isinstance(dtype, PeriodDtype):
cls = dtype.construct_array_type()
return cls(arr.view("i8"), dtype=dtype)
elif isinstance(dtype, DatetimeTZDtype):
dt_cls = dtype.construct_array_type()
dt64_values = arr.view(f"M8[{dtype.unit}]")
return dt_cls._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
from pandas.core.arrays import DatetimeArray
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
return arr.view(dtype=dtype) # type: ignore[arg-type]
def take(
self,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: AxisInt = 0,
) -> Self:
if allow_fill:
fill_value = self._validate_scalar(fill_value)
new_data = take(
self._ndarray,
indices,
allow_fill=allow_fill,
fill_value=fill_value,
axis=axis,
)
return self._from_backing_data(new_data)
# ------------------------------------------------------------------------
def equals(self, other) -> bool:
if type(self) is not type(other):
return False
if self.dtype != other.dtype:
return False
return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))
@classmethod
def _from_factorized(cls, values, original):
assert values.dtype == original._ndarray.dtype
return original._from_backing_data(values)
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray
def _values_for_factorize(self):
return self._ndarray, self._internal_fill_value
def _hash_pandas_object(
self, *, encoding: str, hash_key: str, categorize: bool
) -> npt.NDArray[np.uint64]:
from pandas.core.util.hashing import hash_array
values = self._ndarray
return hash_array(
values, encoding=encoding, hash_key=hash_key, categorize=categorize
)
# Signature of "argmin" incompatible with supertype "ExtensionArray"
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmin", axis=axis)
# Signature of "argmax" incompatible with supertype "ExtensionArray"
def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self._hasna:
raise NotImplementedError
return nargminmax(self, "argmax", axis=axis)
def unique(self) -> Self:
new_data = unique(self._ndarray)
return self._from_backing_data(new_data)
@classmethod
@doc(ExtensionArray._concat_same_type)
def _concat_same_type(
cls,
to_concat: Sequence[Self],
axis: AxisInt = 0,
) -> Self:
if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
dtypes = {str(x.dtype) for x in to_concat}
raise ValueError("to_concat must have the same dtype", dtypes)
return super()._concat_same_type(to_concat, axis=axis)
@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
npvalue = self._validate_setitem_value(value)
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
@doc(ExtensionArray.shift)
def shift(self, periods: int = 1, fill_value=None):
# NB: shift is always along axis=0
axis = 0
fill_value = self._validate_scalar(fill_value)
new_values = shift(self._ndarray, periods, axis, fill_value)
return self._from_backing_data(new_values)
def __setitem__(self, key, value) -> None:
key = check_array_indexer(self, key)
value = self._validate_setitem_value(value)
self._ndarray[key] = value
def _validate_setitem_value(self, value):
return value
@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
...
@overload
def __getitem__(
self,
key: SequenceIndexer | PositionalIndexerTuple,
) -> Self:
...
def __getitem__(
self,
key: PositionalIndexer2D,
) -> Self | Any:
if lib.is_integer(key):
# fast-path
result = self._ndarray[key]
if self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# error: Incompatible types in assignment (expression has type "ExtensionArray",
# variable has type "Union[int, slice, ndarray]")
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
key = check_array_indexer(self, key)
result = self._ndarray[key]
if lib.is_scalar(result):
return self._box_func(result)
result = self._from_backing_data(result)
return result
def _fill_mask_inplace(
self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
) -> None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
func(self._ndarray.T, limit=limit, mask=mask.T)
def _pad_or_backfill(
self,
*,
method: FillnaOptions,
limit: int | None = None,
limit_area: Literal["inside", "outside"] | None = None,
copy: bool = True,
) -> Self:
mask = self.isna()
if mask.any():
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
npvalues = npvalues.T
if copy:
new_values = self._from_backing_data(npvalues)
else:
new_values = self
else:
if copy:
new_values = self.copy()
else:
new_values = self
return new_values
@doc(ExtensionArray.fillna)
def fillna(
self, value=None, method=None, limit: int | None = None, copy: bool = True
) -> Self:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
mask = self.isna()
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
value, mask, len(self) # type: ignore[arg-type]
)
if mask.any():
if method is not None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
npvalues = self._ndarray.T
if copy:
npvalues = npvalues.copy()
func(npvalues, limit=limit, mask=mask.T)
npvalues = npvalues.T
                # TODO: NumpyExtensionArray didn't use to copy, need tests
# for this
new_values = self._from_backing_data(npvalues)
else:
# fill with value
if copy:
new_values = self.copy()
else:
new_values = self[:]
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)
if not copy:
new_values = self[:]
else:
new_values = self.copy()
return new_values
# ------------------------------------------------------------------------
# Reductions
def _wrap_reduction_result(self, axis: AxisInt | None, result):
if axis is None or self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# ------------------------------------------------------------------------
# __array_function__ methods
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
"""
Analogue to np.putmask(self, mask, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
np.putmask(self._ndarray, mask, value)
def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
"""
Analogue to np.where(mask, self, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
res_values = np.where(mask, self._ndarray, value)
if res_values.dtype != self._ndarray.dtype:
raise AssertionError(
# GH#56410
"Something has gone wrong, please report a bug at "
"github.com/pandas-dev/pandas/"
)
return self._from_backing_data(res_values)
# ------------------------------------------------------------------------
# Index compat methods
def insert(self, loc: int, item) -> Self:
"""
Make new ExtensionArray inserting new item at location. Follows
Python list.append semantics for negative values.
Parameters
----------
loc : int
item : object
Returns
-------
type(self)
"""
loc = validate_insert_loc(loc, len(self))
code = self._validate_scalar(item)
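        # _validate_scalar coerces `item` to a value that can be stored in the
        # backing ndarray's dtype; incompatible items raise before we concatenate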
new_vals = np.concatenate(
(
self._ndarray[:loc],
np.asarray([code], dtype=self._ndarray.dtype),
self._ndarray[loc:],
)
)
return self._from_backing_data(new_vals)
# ------------------------------------------------------------------------
# Additional array methods
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, default True
Don't include counts of NA values.
Returns
-------
Series
"""
if self.ndim != 1:
raise NotImplementedError
from pandas import (
Index,
Series,
)
if dropna:
# error: Unsupported operand type for ~ ("ExtensionArray")
values = self[~self.isna()]._ndarray # type: ignore[operator]
else:
values = self._ndarray
result = value_counts(values, sort=False, dropna=dropna)
index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name, copy=False)
def _quantile(
self,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> Self:
# TODO: disable for Categorical if not ordered?
mask = np.asarray(self.isna())
arr = self._ndarray
fill_value = self._internal_fill_value
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
res_values = self._cast_quantile_result(res_values)
return self._from_backing_data(res_values)
# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
"""
Cast the result of quantile_with_mask to an appropriate dtype
to pass to _from_backing_data in _quantile.
"""
return res_values
# ------------------------------------------------------------------------
# numpy-like methods
@classmethod
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
"""
Analogous to np.empty(shape, dtype=dtype)
Parameters
----------
shape : tuple[int]
dtype : ExtensionDtype
"""
# The base implementation uses a naive approach to find the dtype
# for the backing ndarray
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)

View File

@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
Timedelta,
Timestamp,
iNaT,
)
if TYPE_CHECKING:
from pandas._typing import npt
def generate_regular_range(
start: Timestamp | Timedelta | None,
end: Timestamp | Timedelta | None,
periods: int | None,
freq: BaseOffset,
unit: str = "ns",
) -> npt.NDArray[np.intp]:
"""
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.
Parameters
----------
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int or None
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
unit : str, default "ns"
The resolution the output is meant to represent.
Returns
-------
ndarray[np.int64]
Representing the given resolution.
"""
istart = start._value if start is not None else None
iend = end._value if end is not None else None
freq.nanos # raises if non-fixed frequency
td = Timedelta(freq)
b: int
e: int
try:
td = td.as_unit(unit, round_ok=False)
except ValueError as err:
raise ValueError(
f"freq={freq} is incompatible with unit={unit}. "
"Use a lower freq or a higher unit instead."
) from err
stride = int(td._value)
if periods is None and istart is not None and iend is not None:
b = istart
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (iend - b) // stride * stride + stride // 2 + 1
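        # e is an exclusive endpoint just past the last stride-aligned value not
        # exceeding iend, so np.arange(b, e, stride) includes that value without
        # producing an extra point beyond it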
elif istart is not None and periods is not None:
b = istart
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif iend is not None and periods is not None:
e = iend + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
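            # np.arange overflowed: accumulate the values one stride at a time
            # using Python ints, then drop the final element (== e) so the
            # endpoint stays exclusive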
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(i8max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError as err:
raise OutOfBoundsDatetime(msg) from err
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 < stride and side == "end"
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint - stride <= i64max < endpoint:
        # in generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
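    # recurse on two halves of `periods`: the midpoint is computed first, and
    # each half's periods * stride addend is small enough to be checked safely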
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
result = np.int64(endpoint) + addend
if result == iNaT:
# Putting this into a DatetimeArray/TimedeltaArray
# would incorrectly be interpreted as NaT
raise OverflowError
return int(result)
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
        # FloatingPointError; with reversed signs we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
uresult = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(i8max)
assert uresult > i64max
if uresult <= i64max + np.uint64(stride):
return int(uresult)
raise OutOfBoundsDatetime(
f"Cannot generate range with {side}={endpoint} and periods={periods}"
)

View File

@ -0,0 +1,63 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._libs import lib
from pandas.errors import LossySetitemError
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
npt,
)
def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
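    # Decide which numpy dtype and na_value to use when converting a masked
    # array to numpy: numeric dtypes without a caller-supplied dtype map to
    # their numpy counterpart (float64/object when NA values are present), and
    # na_value falls back to a dtype-appropriate missing sentinel.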
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
if hasna:
if arr.dtype.kind == "b":
dtype = np.dtype(np.object_)
else:
if arr.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
dtype = np.dtype(dtype)
dtype_given = True
else:
dtype_given = True
if na_value is lib.no_default:
if dtype is None or not hasna:
na_value = arr.dtype.na_value
elif dtype.kind == "f": # type: ignore[union-attr]
na_value = np.nan
elif dtype.kind == "M": # type: ignore[union-attr]
na_value = np.datetime64("nat")
elif dtype.kind == "m": # type: ignore[union-attr]
na_value = np.timedelta64("nat")
else:
na_value = arr.dtype.na_value
if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
except LossySetitemError:
dtype = np.dtype(np.object_)
return dtype, na_value

View File

@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]

View File

@ -0,0 +1,66 @@
from __future__ import annotations
import warnings
import numpy as np
import pyarrow
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
def fallback_performancewarning(version: str | None = None) -> None:
"""
Raise a PerformanceWarning for falling back to ExtensionArray's
non-pyarrow method
"""
msg = "Falling back on a non-pyarrow code path which may decrease performance."
if version is not None:
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
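    # the validity bitmap is bit-packed; when present, expand it to one numpy
    # bool per element by round-tripping through a pyarrow BooleanArray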
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask

View File

@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""
from __future__ import annotations
from abc import (
ABCMeta,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
cast,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.core.dtypes.common import is_list_like
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
from pandas.core.dtypes.dtypes import ArrowDtype
if TYPE_CHECKING:
from collections.abc import Iterator
from pandas import (
DataFrame,
Series,
)
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)
@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass
def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
@property
def _pa_array(self):
return self._data.array._pa_array
class ListAccessor(ArrowAccessor):
"""
Accessor object for list data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow list data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg="Can only use the '.list' accessor with "
"'list[pyarrow]' dtype, not {dtype}.",
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return (
pa.types.is_list(pyarrow_dtype)
or pa.types.is_fixed_size_list(pyarrow_dtype)
or pa.types.is_large_list(pyarrow_dtype)
)
def len(self) -> Series:
"""
Return the length of each list in the Series.
Returns
-------
pandas.Series
The length of each list.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.len()
0 3
1 1
dtype: int32[pyarrow]
"""
from pandas import Series
value_lengths = pc.list_value_length(self._pa_array)
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
def __getitem__(self, key: int | slice) -> Series:
"""
Index or slice lists in the Series.
Parameters
----------
key : int | slice
Index or slice of indices to access from each list.
Returns
-------
pandas.Series
The list at requested index.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list[0]
0 1
1 3
dtype: int64[pyarrow]
"""
from pandas import Series
if isinstance(key, int):
# TODO: Support negative key but pyarrow does not allow
# element index to be an array.
# if key < 0:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(element, dtype=ArrowDtype(element.type))
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)
# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
if start is None:
# TODO: When adding negative step support
                # this should be set to the last element of the array
# when step is negative.
start = 0
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
def __iter__(self) -> Iterator:
raise TypeError(f"'{type(self).__name__}' object is not iterable")
def flatten(self) -> Series:
"""
Flatten list values.
Returns
-------
pandas.Series
The data from all lists in the series flattened.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(
... pa.int64()
... ))
... )
>>> s.list.flatten()
0 1
1 2
2 3
3 3
dtype: int64[pyarrow]
"""
from pandas import Series
flattened = pc.list_flatten(self._pa_array)
return Series(flattened, dtype=ArrowDtype(flattened.type))
class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Parameters
----------
data : Series
Series containing Arrow struct data.
"""
def __init__(self, data=None) -> None:
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)
@property
def dtypes(self) -> Series:
"""
Return the dtype object of each child field of the struct.
Returns
-------
pandas.Series
The data type of each child field.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.dtypes
version int64[pyarrow]
project string[pyarrow]
dtype: object
"""
from pandas import (
Index,
Series,
)
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
def field(
self,
name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
) -> Series:
"""
Extract a child field of a struct as a Series.
Parameters
----------
name_or_index : str | bytes | int | expression | list
Name or index of the child field to extract.
For list-like inputs, this will index into a nested
struct.
Returns
-------
pandas.Series
The data corresponding to the selected child field.
See Also
--------
Series.struct.explode : Return all child fields as a DataFrame.
Notes
-----
The name of the resulting Series will be set using the following
rules:
- For string, bytes, or integer `name_or_index` (or a list of these, for
a nested selection), the Series name is set to the selected
field's name.
- For a :class:`pyarrow.compute.Expression`, this is set to
the string form of the expression.
- For list-like `name_or_index`, the name will be set to the
name of the final field selected.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
Extract by field name.
>>> s.struct.field("project")
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
Extract by field index.
>>> s.struct.field(0)
0 1
1 2
2 1
Name: version, dtype: int64[pyarrow]
Or an expression
>>> import pyarrow.compute as pc
>>> s.struct.field(pc.field("project"))
0 pandas
1 pandas
2 numpy
Name: project, dtype: string[pyarrow]
For nested struct types, you can pass a list of values to index
multiple levels:
>>> version_type = pa.struct([
... ("major", pa.int64()),
... ("minor", pa.int64()),
... ])
>>> s = pd.Series(
... [
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", version_type), ("project", pa.string())]
... ))
... )
>>> s.struct.field(["version", "minor"])
0 5
1 1
2 26
Name: minor, dtype: int64[pyarrow]
>>> s.struct.field([0, 0])
0 1
1 2
2 1
Name: major, dtype: int64[pyarrow]
"""
from pandas import Series
def get_name(
level_name_or_index: list[str]
| list[bytes]
| list[int]
| pc.Expression
| bytes
| str
| int,
data: pa.ChunkedArray,
):
if isinstance(level_name_or_index, int):
name = data.type.field(level_name_or_index).name
elif isinstance(level_name_or_index, (str, bytes)):
name = level_name_or_index
elif isinstance(level_name_or_index, pc.Expression):
name = str(level_name_or_index)
elif is_list_like(level_name_or_index):
# For nested input like [2, 1, 2]
# iteratively get the struct and field name. The last
# one is used for the name of the index.
level_name_or_index = list(reversed(level_name_or_index))
selected = data
while level_name_or_index:
# we need the cast, otherwise mypy complains about
# getting ints, bytes, or str here, which isn't possible.
level_name_or_index = cast(list, level_name_or_index)
name_or_index = level_name_or_index.pop()
name = get_name(name_or_index, selected)
selected = selected.type.field(selected.type.get_field_index(name))
name = selected.name
else:
raise ValueError(
"name_or_index must be an int, str, bytes, "
"pyarrow.compute.Expression, or list of those"
)
return name
pa_arr = self._data.array._pa_array
name = get_name(name_or_index, pa_arr)
field_arr = pc.struct_field(pa_arr, name_or_index)
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._data.index,
name=name,
)
def explode(self) -> DataFrame:
"""
Extract all child fields of a struct as a DataFrame.
Returns
-------
pandas.DataFrame
The data corresponding to all child fields.
See Also
--------
Series.struct.field : Return a single child field as a Series.
Examples
--------
>>> import pyarrow as pa
>>> s = pd.Series(
... [
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=pd.ArrowDtype(pa.struct(
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> s.struct.explode()
version project
0 1 pandas
1 2 pandas
2 1 numpy
"""
from pandas import concat
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)

File diff suppressed because it is too large

View File

@ -0,0 +1,174 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pyarrow
from pandas.compat import pa_version_under14p1
from pandas.core.dtypes.dtypes import (
IntervalDtype,
PeriodDtype,
)
from pandas.core.arrays.interval import VALID_CLOSED
if TYPE_CHECKING:
from pandas._typing import IntervalClosedType
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), self.freq))
def to_pandas_dtype(self) -> PeriodDtype:
return PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed: IntervalClosedType) -> None:
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed: IntervalClosedType = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self) -> IntervalClosedType:
return self._closed
def __arrow_ext_serialize__(self) -> bytes:
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __ne__(self, other) -> bool:
return not self == other
def __hash__(self) -> int:
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self) -> IntervalDtype:
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if not pa_version_under14p1:
return
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
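    # replace the built-in 'arrow.py_extension_type' with a type whose
    # deserialization always raises, so untrusted Parquet/Feather files cannot
    # trigger arbitrary code execution via pickled extension types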
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()

File diff suppressed because it is too large

View File

@ -0,0 +1,407 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
ClassVar,
cast,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna
from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
type_t,
)
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
"""
Extension dtype for boolean data.
.. warning::
BooleanDtype is considered experimental. The implementation and
parts of the API may change without warning.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.BooleanDtype()
BooleanDtype
"""
name: ClassVar[str] = "boolean"
# https://github.com/python/mypy/issues/4125
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
@property
def type(self) -> type: # type: ignore[override]
return np.bool_
@property
def kind(self) -> str:
return "b"
@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")
@classmethod
def construct_array_type(cls) -> type_t[BooleanArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return BooleanArray
def __repr__(self) -> str:
return "BooleanDtype"
@property
def _is_boolean(self) -> bool:
return True
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BooleanArray:
"""
Construct BooleanArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
if isinstance(array, pyarrow.Array):
chunks = [array]
length = len(array)
else:
# pyarrow.ChunkedArray
chunks = array.chunks
length = array.length()
if pyarrow.types.is_null(array.type):
mask = np.ones(length, dtype=bool)
# No need to init data, since all null
data = np.empty(length, dtype=bool)
return BooleanArray(data, mask)
results = []
for arr in chunks:
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
).to_numpy(zero_copy_only=False)
if arr.null_count != 0:
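                # expand the bit-packed validity bitmap to per-element booleans,
                # then invert it: pyarrow marks valid entries True, pandas masks
                # mark missing entries True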
mask = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
).to_numpy(zero_copy_only=False)
mask = ~mask
else:
mask = np.zeros(len(arr), dtype=bool)
bool_arr = BooleanArray(data, mask)
results.append(bool_arr)
if not results:
return BooleanArray(
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
)
else:
return BooleanArray._concat_same_type(results)
def coerce_to_array(
values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
if isinstance(values, BooleanArray):
if mask is not None:
raise ValueError("cannot pass mask for BooleanArray input")
values, mask = values._data, values._mask
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
mask_values = None
if isinstance(values, np.ndarray) and values.dtype == np.bool_:
if copy:
values = values.copy()
elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
mask_values = isna(values)
values_bool = np.zeros(len(values), dtype=bool)
values_bool[~mask_values] = values[~mask_values].astype(bool)
if not np.all(
values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
):
raise TypeError("Need to pass bool-like values")
values = values_bool
else:
values_object = np.asarray(values, dtype=object)
inferred_dtype = lib.infer_dtype(values_object, skipna=True)
integer_like = ("floating", "integer", "mixed-integer-float")
if inferred_dtype not in ("boolean", "empty") + integer_like:
raise TypeError("Need to pass bool-like values")
# mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
# within this branch, it assumes it can also be None
mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
values = np.zeros(len(values), dtype=bool)
values[~mask_values] = values_object[~mask_values].astype(bool)
        # if the values were integer-like, validate that they were actually 0/1's
if (inferred_dtype in integer_like) and not (
np.all(
values[~mask_values].astype(float)
== values_object[~mask_values].astype(float)
)
):
raise TypeError("Need to pass bool-like values")
if mask is None and mask_values is None:
mask = np.zeros(values.shape, dtype=bool)
elif mask is None:
mask = mask_values
else:
if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
if mask_values is not None:
mask = mask | mask_values
else:
if copy:
mask = mask.copy()
else:
mask = np.array(mask, dtype=bool)
if mask_values is not None:
mask = mask | mask_values
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")
return values, mask
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.
This is a pandas Extension array for boolean data, under the hood
represented by 2 numpy arrays: a boolean array with the data and
a boolean array with the mask (True indicating missing).
BooleanArray implements Kleene logic (sometimes called three-value
logic) for logical operations. See :ref:`boolean.kleene` for more.
    To construct a BooleanArray from generic array-like input, use
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
below).
.. warning::
BooleanArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : numpy.ndarray
A 1-d boolean-dtype array with the data.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values (True
indicates missing).
copy : bool, default False
Whether to copy the `values` and `mask` arrays.
Attributes
----------
None
Methods
-------
None
Returns
-------
BooleanArray
Examples
--------
    Create a BooleanArray with :func:`pandas.array`:
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
# Fill values used for any/all
# Incompatible types in assignment (expression has type "bool", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = True # type: ignore[assignment]
_falsey_value = False # type: ignore[assignment]
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
@classmethod
def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
result = super()._simple_new(values, mask)
result._dtype = BooleanDtype()
return result
def __init__(
self, values: np.ndarray, mask: np.ndarray, copy: bool = False
) -> None:
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
"values should be boolean numpy array. Use "
"the 'pd.array' function instead"
)
self._dtype = BooleanDtype()
super().__init__(values, mask, copy=copy)
@property
def dtype(self) -> BooleanDtype:
return self._dtype
@classmethod
def _from_sequence_of_strings(
cls,
strings: list[str],
*,
dtype: Dtype | None = None,
copy: bool = False,
true_values: list[str] | None = None,
false_values: list[str] | None = None,
) -> BooleanArray:
true_values_union = cls._TRUE_VALUES.union(true_values or [])
false_values_union = cls._FALSE_VALUES.union(false_values or [])
def map_string(s) -> bool:
if s in true_values_union:
return True
elif s in false_values_union:
return False
else:
raise ValueError(f"{s} cannot be cast to bool")
scalars = np.array(strings, dtype=object)
mask = isna(scalars)
scalars[~mask] = list(map(map_string, scalars[~mask]))
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
if dtype:
assert dtype == "boolean"
return coerce_to_array(value, copy=copy)
def _logical_method(self, other, op):
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
other_is_scalar = lib.is_scalar(other)
mask = None
if isinstance(other, BooleanArray):
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
other, mask = coerce_to_array(other, copy=False)
elif isinstance(other, np.bool_):
other = other.item()
if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
raise TypeError(
"'other' should be pandas.NA or a bool. "
f"Got {type(other).__name__} instead."
)
if not other_is_scalar and len(self) != len(other):
raise ValueError("Lengths must match")
if op.__name__ in {"or_", "ror_"}:
result, mask = ops.kleene_or(self._data, other, self._mask, mask)
elif op.__name__ in {"and_", "rand_"}:
result, mask = ops.kleene_and(self._data, other, self._mask, mask)
else:
# i.e. xor, rxor
result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
# i.e. BooleanArray
return self._maybe_mask_result(result, mask)
def _accumulate(
self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
data = self._data
mask = self._mask
if name in ("cummin", "cummax"):
op = getattr(masked_accumulations, name)
data, mask = op(data, mask, skipna=skipna, **kwargs)
return self._simple_new(data, mask)
else:
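            # cumsum/cumprod have no boolean-native definition; compute them on
            # an integer view and return the resulting IntegerArray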
from pandas.core.arrays import IntegerArray
return IntegerArray(data.astype(int), mask)._accumulate(
name, skipna=skipna, **kwargs
)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,173 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class FloatingDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size of floating dtype.
These specific implementations are subclasses of the non-public
    FloatingDtype. For example, we have Float32Dtype to represent float32.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.float64)
_checker = is_float_dtype
@classmethod
def construct_array_type(cls) -> type[FloatingArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return FloatingArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
return NUMPY_FLOAT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
        # This is really only here for compatibility with IntegerDtype
return values.astype(dtype, copy=copy)
class FloatingArray(NumericArray):
"""
Array of floating (optional missing) values.
.. warning::
FloatingArray is currently experimental, and its API or internal
implementation may change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.
We represent a FloatingArray with 2 numpy arrays:
- data: contains a numpy float array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
    To construct a FloatingArray from generic array-like input, use
:func:`pandas.array` with one of the float dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d float-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
FloatingArray
Examples
--------
    Create a FloatingArray with :func:`pandas.array`:
>>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([0.1, None, 0.3], dtype="Float32")
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
"""
_dtype_cls = FloatingDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = np.nan
# Fill values used for any/all
# Incompatible types in assignment (expression has type "float", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1.0 # type: ignore[assignment]
_falsey_value = 0.0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
This dtype uses ``pd.NA`` as missing value indicator.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Float32Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()
For Float64Dtype:
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Float32Dtype(FloatingDtype):
type = np.float32
name: ClassVar[str] = "Float32"
__doc__ = _dtype_docstring.format(dtype="float32")
@register_extension_dtype
class Float64Dtype(FloatingDtype):
type = np.float64
name: ClassVar[str] = "Float64"
__doc__ = _dtype_docstring.format(dtype="float64")
NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
np.dtype(np.float32): Float32Dtype(),
np.dtype(np.float64): Float64Dtype(),
}

View File

@ -0,0 +1,272 @@
from __future__ import annotations
from typing import ClassVar
import numpy as np
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
class IntegerDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
    IntegerDtype. For example, we have Int8Dtype to represent 8-bit signed integers.
The attributes name & type are set when these subclasses are created.
"""
_default_np_dtype = np.dtype(np.int64)
_checker = is_integer_dtype
@classmethod
def construct_array_type(cls) -> type[IntegerArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return IntegerArray
@classmethod
def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
return NUMPY_INT_TO_DTYPE
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless. e.g. if 'values'
has a floating dtype, each value must be an integer.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError(
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
) from err
class IntegerArray(NumericArray):
"""
Array of integer (optional missing) values.
Uses :attr:`pandas.NA` as the missing value.
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: UInt16
"""
_dtype_cls = IntegerDtype
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
# Incompatible types in assignment (expression has type "int", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1 # type: ignore[assignment]
_falsey_value = 0 # type: ignore[assignment]
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.
Attributes
----------
None
Methods
-------
None
Examples
--------
For Int8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()
For Int16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()
For Int32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()
For Int64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()
For UInt8Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()
For UInt16Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()
For UInt32Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()
For UInt64Dtype:
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""
# create the Dtype
@register_extension_dtype
class Int8Dtype(IntegerDtype):
type = np.int8
name: ClassVar[str] = "Int8"
__doc__ = _dtype_docstring.format(dtype="int8")
@register_extension_dtype
class Int16Dtype(IntegerDtype):
type = np.int16
name: ClassVar[str] = "Int16"
__doc__ = _dtype_docstring.format(dtype="int16")
@register_extension_dtype
class Int32Dtype(IntegerDtype):
type = np.int32
name: ClassVar[str] = "Int32"
__doc__ = _dtype_docstring.format(dtype="int32")
@register_extension_dtype
class Int64Dtype(IntegerDtype):
type = np.int64
name: ClassVar[str] = "Int64"
__doc__ = _dtype_docstring.format(dtype="int64")
@register_extension_dtype
class UInt8Dtype(IntegerDtype):
type = np.uint8
name: ClassVar[str] = "UInt8"
__doc__ = _dtype_docstring.format(dtype="uint8")
@register_extension_dtype
class UInt16Dtype(IntegerDtype):
type = np.uint16
name: ClassVar[str] = "UInt16"
__doc__ = _dtype_docstring.format(dtype="uint16")
@register_extension_dtype
class UInt32Dtype(IntegerDtype):
type = np.uint32
name: ClassVar[str] = "UInt32"
__doc__ = _dtype_docstring.format(dtype="uint32")
@register_extension_dtype
class UInt64Dtype(IntegerDtype):
type = np.uint64
name: ClassVar[str] = "UInt64"
__doc__ = _dtype_docstring.format(dtype="uint64")
NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
np.dtype(np.int8): Int8Dtype(),
np.dtype(np.int16): Int16Dtype(),
np.dtype(np.int32): Int32Dtype(),
np.dtype(np.int64): Int64Dtype(),
np.dtype(np.uint8): UInt8Dtype(),
np.dtype(np.uint16): UInt16Dtype(),
np.dtype(np.uint32): UInt32Dtype(),
np.dtype(np.uint64): UInt64Dtype(),
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,286 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
is_integer_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
from collections.abc import Mapping
import pyarrow
from pandas._typing import (
Dtype,
DtypeObj,
Self,
npt,
)
class NumericDtype(BaseMaskedDtype):
_default_np_dtype: np.dtype
_checker: Callable[[Any], bool] # is_foo_dtype
def __repr__(self) -> str:
return f"{self.name}Dtype()"
@cache_readonly
def is_signed_integer(self) -> bool:
return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self) -> bool:
return self.kind == "u"
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseMaskedArray:
"""
Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
from pandas.core.arrays.arrow._arrow_utils import (
pyarrow_array_to_numpy_and_mask,
)
array_class = self.construct_array_type()
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
array.type
):
# test_from_arrow_type_error raise for string, but allow
# through itemsize conversion GH#31896
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
if rt_dtype.kind not in "iuf":
# Could allow "c" or potentially disallow float<->int conversion,
# but at the moment we specifically test that uint<->int works
raise TypeError(
f"Expected array of {self} type, got {array.type} instead"
)
array = array.cast(pyarrow_type)
if isinstance(array, pyarrow.ChunkedArray):
# TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
# combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
if array.num_chunks == 0:
array = pyarrow.array([], type=array.type)
else:
array = array.combine_chunks()
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
return array_class(data.copy(), ~mask, copy=False)
@classmethod
def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
raise AbstractMethodError(cls)
@classmethod
def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
"""
Convert a string representation or a numpy dtype to NumericDtype.
"""
if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not isinstance(dtype, NumericDtype):
mapping = cls._get_dtype_mapping()
try:
dtype = mapping[np.dtype(dtype)]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
return dtype
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.
"safe" in this context means the casting is lossless.
"""
raise AbstractMethodError(cls)
def _coerce_to_data_and_mask(
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
):
checker = dtype_cls._checker
mask = None
inferred_type = None
if dtype is None and hasattr(values, "dtype"):
if checker(values.dtype):
dtype = values.dtype
if dtype is not None:
dtype = dtype_cls._standardize_dtype(dtype)
cls = dtype_cls.construct_array_type()
if isinstance(values, cls):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask, dtype, inferred_type
original = values
if not copy:
values = np.asarray(values)
else:
values = np.array(values, copy=copy)
inferred_type = None
if values.dtype == object or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "boolean" and dtype is None:
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
elif values.dtype.kind == "b" and checker(dtype):
if not copy:
values = np.asarray(values, dtype=default_dtype)
else:
values = np.array(values, dtype=default_dtype, copy=copy)
elif values.dtype.kind not in "iuf":
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")
if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask is None:
if values.dtype.kind in "iu":
# fastpath
mask = np.zeros(len(values), dtype=np.bool_)
else:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = default_dtype
else:
dtype = dtype.numpy_dtype
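    # guard against large integers that lost precision when the input was first
    # coerced to float: re-read such values from the original input before the
    # final cast so they are not silently truncated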
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
idx = np.nanargmax(values)
if int(values[idx]) != original[idx]:
# We have ints that lost precision during the cast.
inferred_type = lib.infer_dtype(original, skipna=True)
if (
inferred_type not in ["floating", "mixed-integer-float"]
and not mask.any()
):
values = np.asarray(original, dtype=dtype)
else:
values = np.asarray(original, dtype="object")
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = cls._internal_fill_value
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into a float
values = values.astype(dtype, copy=copy)
else:
values = dtype_cls._safe_cast(values, dtype, copy=False)
return values, mask, dtype, inferred_type
class NumericArray(BaseMaskedArray):
"""
Base class for IntegerArray and FloatingArray.
"""
_dtype_cls: type[NumericDtype]
def __init__(
self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
) -> None:
checker = self._dtype_cls._checker
if not (isinstance(values, np.ndarray) and checker(values.dtype)):
descr = (
"floating"
if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
else "integer"
)
raise TypeError(
f"values should be {descr} numpy array. Use "
"the 'pd.array' function instead"
)
if values.dtype == np.float16:
# If we don't raise here, then accessing self.dtype would raise
raise TypeError("FloatingArray does not support np.float16 dtype.")
super().__init__(values, mask, copy=copy)
@cache_readonly
def dtype(self) -> NumericDtype:
mapping = self._dtype_cls._get_dtype_mapping()
return mapping[self._data.dtype]
@classmethod
def _coerce_to_array(
cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
dtype_cls = cls._dtype_cls
default_dtype = dtype_cls._default_np_dtype
values, mask, _, _ = _coerce_to_data_and_mask(
value, dtype, copy, dtype_cls, default_dtype
)
return values, mask
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
from pandas.core.tools.numeric import to_numeric
scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number)

Some files were not shown because too many files have changed in this diff