Updated script so it can be controlled by a Node.js web app
24 binary files not shown.
lib/python3.13/site-packages/pandas/core/_numba/executor.py (new file, 239 lines)
@@ -0,0 +1,239 @@
from __future__ import annotations

import functools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

if TYPE_CHECKING:
    from pandas._typing import Scalar

import numpy as np

from pandas.compat._optional import import_optional_dependency


@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")
    nb_compat_func = numba.extending.register_jitable(func)

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def nb_looper(values, axis):
        # Operate on the first row/col in order to get
        # the output shape
        if axis == 0:
            first_elem = values[:, 0]
            dim0 = values.shape[1]
        else:
            first_elem = values[0]
            dim0 = values.shape[0]
        res0 = nb_compat_func(first_elem)
        # Use np.asarray to get shape for
        # https://github.com/numba/numba/issues/4202#issuecomment-1185981507
        buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
        if axis == 0:
            buf_shape = buf_shape[::-1]
        buff = np.empty(buf_shape)

        if axis == 1:
            buff[0] = res0
            for i in numba.prange(1, values.shape[0]):
                buff[i] = nb_compat_func(values[i])
        else:
            buff[:, 0] = res0
            for j in numba.prange(1, values.shape[1]):
                buff[:, j] = nb_compat_func(values[:, j])
        return buff

    return nb_looper


@functools.cache
def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel):
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    if is_grouped_kernel:

        @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
        def column_looper(
            values: np.ndarray,
            labels: np.ndarray,
            ngroups: int,
            min_periods: int,
            *args,
        ):
            result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
            na_positions = {}
            for i in numba.prange(values.shape[0]):
                output, na_pos = func(
                    values[i], result_dtype, labels, ngroups, min_periods, *args
                )
                result[i] = output
                if len(na_pos) > 0:
                    na_positions[i] = np.array(na_pos)
            return result, na_positions

    else:

        @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
        def column_looper(
            values: np.ndarray,
            start: np.ndarray,
            end: np.ndarray,
            min_periods: int,
            *args,
        ):
            result = np.empty((values.shape[0], len(start)), dtype=result_dtype)
            na_positions = {}
            for i in numba.prange(values.shape[0]):
                output, na_pos = func(
                    values[i], result_dtype, start, end, min_periods, *args
                )
                result[i] = output
                if len(na_pos) > 0:
                    na_positions[i] = np.array(na_pos)
            return result, na_positions

    return column_looper


default_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.int64,
    np.dtype("int16"): np.int64,
    np.dtype("int32"): np.int64,
    np.dtype("int64"): np.int64,
    np.dtype("uint8"): np.uint64,
    np.dtype("uint16"): np.uint64,
    np.dtype("uint32"): np.uint64,
    np.dtype("uint64"): np.uint64,
    np.dtype("float32"): np.float64,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.complex128,
    np.dtype("complex128"): np.complex128,
}


# TODO: Preserve complex dtypes

float_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.float64,
    np.dtype("int16"): np.float64,
    np.dtype("int32"): np.float64,
    np.dtype("int64"): np.float64,
    np.dtype("uint8"): np.float64,
    np.dtype("uint16"): np.float64,
    np.dtype("uint32"): np.float64,
    np.dtype("uint64"): np.float64,
    np.dtype("float32"): np.float64,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.float64,
    np.dtype("complex128"): np.float64,
}

identity_dtype_mapping: dict[np.dtype, Any] = {
    np.dtype("int8"): np.int8,
    np.dtype("int16"): np.int16,
    np.dtype("int32"): np.int32,
    np.dtype("int64"): np.int64,
    np.dtype("uint8"): np.uint8,
    np.dtype("uint16"): np.uint16,
    np.dtype("uint32"): np.uint32,
    np.dtype("uint64"): np.uint64,
    np.dtype("float32"): np.float32,
    np.dtype("float64"): np.float64,
    np.dtype("complex64"): np.complex64,
    np.dtype("complex128"): np.complex128,
}


def generate_shared_aggregator(
    func: Callable[..., Scalar],
    dtype_mapping: dict[np.dtype, np.dtype],
    is_grouped_kernel: bool,
    nopython: bool,
    nogil: bool,
    parallel: bool,
):
    """
    Generate a Numba function that loops over the columns 2D object and applies
    a 1D numba kernel over each column.

    Parameters
    ----------
    func : function
        aggregation function to be applied to each column
    dtype_mapping: dict or None
        If not None, maps a dtype to a result dtype.
        Otherwise, will fall back to default mapping.
    is_grouped_kernel: bool, default False
        Whether func operates using the group labels (True)
        or using starts/ends arrays

        If true, you also need to pass the number of groups to this function
    nopython : bool
        nopython to be passed into numba.jit
    nogil : bool
        nogil to be passed into numba.jit
    parallel : bool
        parallel to be passed into numba.jit

    Returns
    -------
    Numba function
    """

    # A wrapper around the looper function,
    # to dispatch based on dtype since numba is unable to do that in nopython mode

    # It also post-processes the values by inserting nans where number of observations
    # is less than min_periods
    # Cannot do this in numba nopython mode
    # (you'll run into type-unification error when you cast int -> float)
    def looper_wrapper(
        values,
        start=None,
        end=None,
        labels=None,
        ngroups=None,
        min_periods: int = 0,
        **kwargs,
    ):
        result_dtype = dtype_mapping[values.dtype]
        column_looper = make_looper(
            func, result_dtype, is_grouped_kernel, nopython, nogil, parallel
        )
        # Need to unpack kwargs since numba only supports *args
        if is_grouped_kernel:
            result, na_positions = column_looper(
                values, labels, ngroups, min_periods, *kwargs.values()
            )
        else:
            result, na_positions = column_looper(
                values, start, end, min_periods, *kwargs.values()
            )
        if result.dtype.kind == "i":
            # Look if na_positions is not empty
            # If so, convert the whole block
            # This is OK since int dtype cannot hold nan,
            # so if min_periods not satisfied for 1 col, it is not satisfied for
            # all columns at that index
            for na_pos in na_positions.values():
                if len(na_pos) > 0:
                    result = result.astype("float64")
                    break
        # TODO: Optimize this
        for i, na_pos in na_positions.items():
            if len(na_pos) > 0:
                result[i, na_pos] = np.nan
        return result

    return looper_wrapper
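For context, a minimal sketch of how a 1D kernel gets wired through generate_shared_aggregator (assuming numba is installed; the sample data and window bounds are illustrative, not part of this commit):

    import numpy as np
    from pandas.core._numba.executor import (
        float_dtype_mapping,
        generate_shared_aggregator,
    )
    from pandas.core._numba.kernels import sliding_mean

    # Build a rolling-mean aggregator; values is laid out as (n_columns, n_rows).
    agg = generate_shared_aggregator(
        sliding_mean, float_dtype_mapping,
        is_grouped_kernel=False, nopython=True, nogil=True, parallel=False,
    )
    values = np.array([[1.0, 2.0, np.nan, 4.0]])
    start = np.array([0, 0, 1, 2], dtype=np.int64)   # window starts (inclusive)
    end = np.array([1, 2, 3, 4], dtype=np.int64)     # window ends (exclusive)
    print(agg(values, start=start, end=end, min_periods=1))  # [[1.  1.5 2.  4. ]]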
lib/python3.13/site-packages/pandas/core/_numba/extensions.py (new file, 584 lines)
@@ -0,0 +1,584 @@
# Disable type checking for this module since numba's internals
# are not typed, and we use numba's internals via its extension API
# mypy: ignore-errors
"""
Utility classes/functions to let numba recognize
pandas Index/Series/DataFrame

Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py
"""

from __future__ import annotations

from contextlib import contextmanager
import operator

import numba
from numba import types
from numba.core import cgutils
from numba.core.datamodel import models
from numba.core.extending import (
    NativeValue,
    box,
    lower_builtin,
    make_attribute_wrapper,
    overload,
    overload_attribute,
    overload_method,
    register_model,
    type_callable,
    typeof_impl,
    unbox,
)
from numba.core.imputils import impl_ret_borrowed
import numpy as np

from pandas._libs import lib

from pandas.core.indexes.base import Index
from pandas.core.indexing import _iLocIndexer
from pandas.core.internals import SingleBlockManager
from pandas.core.series import Series


# Helper function to hack around fact that Index casts numpy string dtype to object
#
# Idea is to set an attribute on a Index called _numba_data
# that is the original data, or the object data casted to numpy string dtype,
# with a context manager that is unset afterwards
@contextmanager
def set_numba_data(index: Index):
    numba_data = index._data
    if numba_data.dtype == object:
        if not lib.is_string_array(numba_data):
            raise ValueError(
                "The numba engine only supports using string or numeric column names"
            )
        numba_data = numba_data.astype("U")
    try:
        index._numba_data = numba_data
        yield index
    finally:
        del index._numba_data


# TODO: Range index support
# (this currently lowers OK, but does not round-trip)
class IndexType(types.Type):
    """
    The type class for Index objects.
    """

    def __init__(self, dtype, layout, pyclass: any) -> None:
        self.pyclass = pyclass
        name = f"index({dtype}, {layout})"
        self.dtype = dtype
        self.layout = layout
        super().__init__(name)

    @property
    def key(self):
        return self.pyclass, self.dtype, self.layout

    @property
    def as_array(self):
        return types.Array(self.dtype, 1, self.layout)

    def copy(self, dtype=None, ndim: int = 1, layout=None):
        assert ndim == 1
        if dtype is None:
            dtype = self.dtype
        layout = layout or self.layout
        return type(self)(dtype, layout, self.pyclass)


class SeriesType(types.Type):
    """
    The type class for Series objects.
    """

    def __init__(self, dtype, index, namety) -> None:
        assert isinstance(index, IndexType)
        self.dtype = dtype
        self.index = index
        self.values = types.Array(self.dtype, 1, "C")
        self.namety = namety
        name = f"series({dtype}, {index}, {namety})"
        super().__init__(name)

    @property
    def key(self):
        return self.dtype, self.index, self.namety

    @property
    def as_array(self):
        return self.values

    def copy(self, dtype=None, ndim: int = 1, layout: str = "C"):
        assert ndim == 1
        assert layout == "C"
        if dtype is None:
            dtype = self.dtype
        return type(self)(dtype, self.index, self.namety)


@typeof_impl.register(Index)
def typeof_index(val, c):
    """
    This will assume that only strings are in object dtype
    index.
    (you should check this before this gets lowered down to numba)
    """
    # arrty = typeof_impl(val._data, c)
    arrty = typeof_impl(val._numba_data, c)
    assert arrty.ndim == 1
    return IndexType(arrty.dtype, arrty.layout, type(val))


@typeof_impl.register(Series)
def typeof_series(val, c):
    index = typeof_impl(val.index, c)
    arrty = typeof_impl(val.values, c)
    namety = typeof_impl(val.name, c)
    assert arrty.ndim == 1
    assert arrty.layout == "C"
    return SeriesType(arrty.dtype, index, namety)


@type_callable(Series)
def type_series_constructor(context):
    def typer(data, index, name=None):
        if isinstance(index, IndexType) and isinstance(data, types.Array):
            assert data.ndim == 1
            if name is None:
                name = types.intp
            return SeriesType(data.dtype, index, name)

    return typer


@type_callable(Index)
def type_index_constructor(context):
    def typer(data, hashmap=None):
        if isinstance(data, types.Array):
            assert data.layout == "C"
            assert data.ndim == 1
            assert hashmap is None or isinstance(hashmap, types.DictType)
            return IndexType(data.dtype, layout=data.layout, pyclass=Index)

    return typer


# Backend extensions for Index and Series and Frame
@register_model(IndexType)
class IndexModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        # We don't want the numpy string scalar type in our hashmap
        members = [
            ("data", fe_type.as_array),
            # This is an attempt to emulate our hashtable code with a numba
            # typed dict
            # It maps from values in the index to their integer positions in the array
            ("hashmap", types.DictType(fe_type.dtype, types.intp)),
            # Pointer to the Index object this was created from, or that it
            # boxes to
            # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1
            ("parent", types.pyobject),
        ]
        models.StructModel.__init__(self, dmm, fe_type, members)


@register_model(SeriesType)
class SeriesModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        members = [
            ("index", fe_type.index),
            ("values", fe_type.as_array),
            ("name", fe_type.namety),
        ]
        models.StructModel.__init__(self, dmm, fe_type, members)


make_attribute_wrapper(IndexType, "data", "_data")
make_attribute_wrapper(IndexType, "hashmap", "hashmap")

make_attribute_wrapper(SeriesType, "index", "index")
make_attribute_wrapper(SeriesType, "values", "values")
make_attribute_wrapper(SeriesType, "name", "name")


@lower_builtin(Series, types.Array, IndexType)
def pdseries_constructor(context, builder, sig, args):
    data, index = args
    series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    series.index = index
    series.values = data
    series.name = context.get_constant(types.intp, 0)
    return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Series, types.Array, IndexType, types.intp)
@lower_builtin(Series, types.Array, IndexType, types.float64)
@lower_builtin(Series, types.Array, IndexType, types.unicode_type)
def pdseries_constructor_with_name(context, builder, sig, args):
    data, index, name = args
    series = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    series.index = index
    series.values = data
    series.name = name
    return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue())


@lower_builtin(Index, types.Array, types.DictType, types.pyobject)
def index_constructor_2arg(context, builder, sig, args):
    (data, hashmap, parent) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    index.parent = parent
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())


@lower_builtin(Index, types.Array, types.DictType)
def index_constructor_2arg_parent(context, builder, sig, args):
    # Basically same as index_constructor_1arg, but also lets you specify the
    # parent object
    (data, hashmap) = args
    index = cgutils.create_struct_proxy(sig.return_type)(context, builder)

    index.data = data
    index.hashmap = hashmap
    return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue())


@lower_builtin(Index, types.Array)
def index_constructor_1arg(context, builder, sig, args):
    from numba.typed import Dict

    key_type = sig.return_type.dtype
    value_type = types.intp

    def index_impl(data):
        return Index(data, Dict.empty(key_type, value_type))

    return context.compile_internal(builder, index_impl, sig, args)


# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type
# (regular string)
def maybe_cast_str(x):
    # Dummy function that numba can overload
    pass


@overload(maybe_cast_str)
def maybe_cast_str_impl(x):
    """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string).
    Is a no-op for other types."""
    if isinstance(x, types.UnicodeCharSeq):
        return lambda x: str(x)
    else:
        return lambda x: x


@unbox(IndexType)
def unbox_index(typ, obj, c):
    """
    Convert a Index object to a native structure.

    Note: Object dtype is not allowed here
    """
    data_obj = c.pyapi.object_getattr_string(obj, "_numba_data")
    index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    # If we see an object array, assume its been validated as only containing strings
    # We still need to do the conversion though
    index.data = c.unbox(typ.as_array, data_obj).value
    typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict))
    # Create an empty typed dict in numba for the hashmap for indexing
    # equiv of numba.typed.Dict.empty(typ.dtype, types.intp)
    arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype))
    intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp))
    hashmap_obj = c.pyapi.call_method(
        typed_dict_obj, "empty", (arr_type_obj, intp_type_obj)
    )
    index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value
    # Set the parent for speedy boxing.
    index.parent = obj

    # Decrefs
    c.pyapi.decref(data_obj)
    c.pyapi.decref(arr_type_obj)
    c.pyapi.decref(intp_type_obj)
    c.pyapi.decref(typed_dict_obj)

    return NativeValue(index._getvalue())


@unbox(SeriesType)
def unbox_series(typ, obj, c):
    """
    Convert a Series object to a native structure.
    """
    index_obj = c.pyapi.object_getattr_string(obj, "index")
    values_obj = c.pyapi.object_getattr_string(obj, "values")
    name_obj = c.pyapi.object_getattr_string(obj, "name")

    series = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    series.index = c.unbox(typ.index, index_obj).value
    series.values = c.unbox(typ.values, values_obj).value
    series.name = c.unbox(typ.namety, name_obj).value

    # Decrefs
    c.pyapi.decref(index_obj)
    c.pyapi.decref(values_obj)
    c.pyapi.decref(name_obj)

    return NativeValue(series._getvalue())


@box(IndexType)
def box_index(typ, val, c):
    """
    Convert a native index structure to a Index object.

    If our native index is of a numpy string dtype, we'll cast it to
    object.
    """
    # First build a Numpy array object, then wrap it in a Index
    index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)

    res = cgutils.alloca_once_value(c.builder, index.parent)

    # Does parent exist?
    # (it means already boxed once, or Index same as original df.index or df.columns)
    # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17
    with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (
        has_parent,
        otherwise,
    ):
        with has_parent:
            c.pyapi.incref(index.parent)
        with otherwise:
            # TODO: preserve the original class for the index
            # Also need preserve the name of the Index
            # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass))
            class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index))
            array_obj = c.box(typ.as_array, index.data)
            if isinstance(typ.dtype, types.UnicodeCharSeq):
                # We converted to numpy string dtype, convert back
                # to object since _simple_new won't do that for uss
                object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object"))
                array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,))
                c.pyapi.decref(object_str_obj)
            # this is basically Index._simple_new(array_obj, name_obj) in python
            index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,))
            index.parent = index_obj
            c.builder.store(index_obj, res)

            # Decrefs
            c.pyapi.decref(class_obj)
            c.pyapi.decref(array_obj)
    return c.builder.load(res)


@box(SeriesType)
def box_series(typ, val, c):
    """
    Convert a native series structure to a Series object.
    """
    series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
    series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr))
    mgr_const_obj = c.pyapi.unserialize(
        c.pyapi.serialize_object(SingleBlockManager.from_array)
    )
    index_obj = c.box(typ.index, series.index)
    array_obj = c.box(typ.as_array, series.values)
    name_obj = c.box(typ.namety, series.name)
    # This is basically equivalent of
    # pd.Series(data=array_obj, index=index_obj)
    # To improve perf, we will construct the Series from a manager
    # object to avoid checks.
    # We'll also set the name attribute manually to avoid validation
    mgr_obj = c.pyapi.call_function_objargs(
        mgr_const_obj,
        (
            array_obj,
            index_obj,
        ),
    )
    mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes")
    # Series._constructor_from_mgr(mgr, axes)
    series_obj = c.pyapi.call_function_objargs(
        series_const_obj, (mgr_obj, mgr_axes_obj)
    )
    c.pyapi.object_setattr_string(series_obj, "_name", name_obj)

    # Decrefs
    c.pyapi.decref(series_const_obj)
    c.pyapi.decref(mgr_axes_obj)
    c.pyapi.decref(mgr_obj)
    c.pyapi.decref(mgr_const_obj)
    c.pyapi.decref(index_obj)
    c.pyapi.decref(array_obj)
    c.pyapi.decref(name_obj)

    return series_obj


# Add common series reductions (e.g. mean, sum),
# and also add common binops (e.g. add, sub, mul, div)
def generate_series_reduction(ser_reduction, ser_method):
    @overload_method(SeriesType, ser_reduction)
    def series_reduction(series):
        def series_reduction_impl(series):
            return ser_method(series.values)

        return series_reduction_impl

    return series_reduction


def generate_series_binop(binop):
    @overload(binop)
    def series_binop(series1, value):
        if isinstance(series1, SeriesType):
            if isinstance(value, SeriesType):

                def series_binop_impl(series1, series2):
                    # TODO: Check index matching?
                    return Series(
                        binop(series1.values, series2.values),
                        series1.index,
                        series1.name,
                    )

                return series_binop_impl
            else:

                def series_binop_impl(series1, value):
                    return Series(
                        binop(series1.values, value), series1.index, series1.name
                    )

                return series_binop_impl

    return series_binop


series_reductions = [
    ("sum", np.sum),
    ("mean", np.mean),
    # Disabled due to discrepancies between numba std. dev
    # and pandas std. dev (no way to specify dof)
    # ("std", np.std),
    # ("var", np.var),
    ("min", np.min),
    ("max", np.max),
]
for reduction, reduction_method in series_reductions:
    generate_series_reduction(reduction, reduction_method)

series_binops = [operator.add, operator.sub, operator.mul, operator.truediv]

for ser_binop in series_binops:
    generate_series_binop(ser_binop)


# get_loc on Index
@overload_method(IndexType, "get_loc")
def index_get_loc(index, item):
    def index_get_loc_impl(index, item):
        # Initialize the hash table if not initialized
        if len(index.hashmap) == 0:
            for i, val in enumerate(index._data):
                index.hashmap[val] = i
        return index.hashmap[item]

    return index_get_loc_impl


# Indexing for Series/Index
@overload(operator.getitem)
def series_indexing(series, item):
    if isinstance(series, SeriesType):

        def series_getitem(series, item):
            loc = series.index.get_loc(item)
            return series.iloc[loc]

        return series_getitem


@overload(operator.getitem)
def index_indexing(index, idx):
    if isinstance(index, IndexType):

        def index_getitem(index, idx):
            return index._data[idx]

        return index_getitem


class IlocType(types.Type):
    def __init__(self, obj_type) -> None:
        self.obj_type = obj_type
        name = f"iLocIndexer({obj_type})"
        super().__init__(name=name)

    @property
    def key(self):
        return self.obj_type


@typeof_impl.register(_iLocIndexer)
def typeof_iloc(val, c):
    objtype = typeof_impl(val.obj, c)
    return IlocType(objtype)


@type_callable(_iLocIndexer)
def type_iloc_constructor(context):
    def typer(obj):
        if isinstance(obj, SeriesType):
            return IlocType(obj)

    return typer


@lower_builtin(_iLocIndexer, SeriesType)
def iloc_constructor(context, builder, sig, args):
    (obj,) = args
    iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder)
    iloc_indexer.obj = obj
    return impl_ret_borrowed(
        context, builder, sig.return_type, iloc_indexer._getvalue()
    )


@register_model(IlocType)
class ILocModel(models.StructModel):
    def __init__(self, dmm, fe_type) -> None:
        members = [("obj", fe_type.obj_type)]
        models.StructModel.__init__(self, dmm, fe_type, members)


make_attribute_wrapper(IlocType, "obj", "obj")


@overload_attribute(SeriesType, "iloc")
def series_iloc(series):
    def get(series):
        return _iLocIndexer(series)

    return get


@overload(operator.getitem)
def iloc_getitem(iloc_indexer, i):
    if isinstance(iloc_indexer, IlocType):

        def getitem_impl(iloc_indexer, i):
            return iloc_indexer.obj.values[i]

        return getitem_impl
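As a rough illustration of what these registrations enable (a sketch, assuming numba is installed and a pandas build where DataFrame.apply accepts engine="numba"; the frame and function are made up for this example):

    import pandas as pd

    def row_mean(row):
        # row arrives in the jitted function as a Series handled by the hooks above
        return (row["a"] + row["b"]) / 2

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
    out = df.apply(row_mean, axis=1, engine="numba", raw=False)  # 2.5, 3.5, 4.5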
lib/python3.13/site-packages/pandas/core/_numba/kernels/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from pandas.core._numba.kernels.mean_ import (
    grouped_mean,
    sliding_mean,
)
from pandas.core._numba.kernels.min_max_ import (
    grouped_min_max,
    sliding_min_max,
)
from pandas.core._numba.kernels.sum_ import (
    grouped_sum,
    sliding_sum,
)
from pandas.core._numba.kernels.var_ import (
    grouped_var,
    sliding_var,
)

__all__ = [
    "sliding_mean",
    "grouped_mean",
    "sliding_sum",
    "grouped_sum",
    "sliding_var",
    "grouped_var",
    "sliding_min_max",
    "grouped_min_max",
]
lib/python3.13/site-packages/pandas/core/_numba/kernels/mean_.py (new file, 196 lines)
@@ -0,0 +1,196 @@
"""
Numba 1D mean kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

from pandas.core._numba.kernels.shared import is_monotonic_increasing
from pandas.core._numba.kernels.sum_ import grouped_kahan_sum

if TYPE_CHECKING:
    from pandas._typing import npt


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
    val: float,
    nobs: int,
    sum_x: float,
    neg_ct: int,
    compensation: float,
    num_consecutive_same_value: int,
    prev_value: float,
) -> tuple[int, float, int, float, int, float]:
    if not np.isnan(val):
        nobs += 1
        y = val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
        if val < 0:
            neg_ct += 1

        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

    return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
    val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
    if not np.isnan(val):
        nobs -= 1
        y = -val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
        if val < 0:
            neg_ct -= 1
    return nobs, sum_x, neg_ct, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    sum_x = 0.0
    neg_ct = 0
    compensation_add = 0.0
    compensation_remove = 0.0

    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_mean(
                    val,
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,  # pyright: ignore[reportGeneralTypeIssues]
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, sum_x, neg_ct, compensation_remove = remove_mean(
                    val, nobs, sum_x, neg_ct, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_mean(
                    val,
                    nobs,
                    sum_x,
                    neg_ct,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,  # pyright: ignore[reportGeneralTypeIssues]
                )

        if nobs >= min_periods and nobs > 0:
            result = sum_x / nobs
            if num_consecutive_same_value >= nobs:
                result = prev_value
            elif neg_ct == 0 and result < 0:
                result = 0
            elif neg_ct == nobs and result > 0:
                result = 0
        else:
            result = np.nan

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            sum_x = 0.0
            neg_ct = 0
            compensation_remove = 0.0

    # na_position is empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_mean(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
        values, result_dtype, labels, ngroups
    )

    # Post-processing, replace sums that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]
        sum_x = output[lab]
        if nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = np.nan
        result /= nobs
        output[lab] = result

    # na_position is empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos
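The start/end arrays these kernels take describe half-open [start[i], end[i]) windows over the values. A small sketch of how fixed-size rolling bounds can be built (illustrative only):

    import numpy as np

    n, window = 6, 3
    end = np.arange(1, n + 1, dtype=np.int64)
    start = np.clip(end - window, 0, None)
    # start -> [0 0 0 1 2 3], end -> [1 2 3 4 5 6]; positions with fewer than
    # min_periods observations in [start[i], end[i]) come back as NaN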
lib/python3.13/site-packages/pandas/core/_numba/kernels/min_max_.py (new file, 125 lines)
@@ -0,0 +1,125 @@
"""
Numba 1D min/max kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_min_max(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
    is_max: bool,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    output = np.empty(N, dtype=result_dtype)
    na_pos = []
    # Use deque once numba supports it
    # https://github.com/numba/numba/issues/7417
    Q: list = []
    W: list = []
    for i in range(N):
        curr_win_size = end[i] - start[i]
        if i == 0:
            st = start[i]
        else:
            st = end[i - 1]

        for k in range(st, end[i]):
            ai = values[k]
            if not np.isnan(ai):
                nobs += 1
            elif is_max:
                ai = -np.inf
            else:
                ai = np.inf
            # Discard previous entries if we find new min or max
            if is_max:
                while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
                    Q.pop()
            else:
                while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
                    Q.pop()
            Q.append(k)
            W.append(k)

        # Discard entries outside and left of current window
        while Q and Q[0] <= start[i] - 1:
            Q.pop(0)
        while W and W[0] <= start[i] - 1:
            if not np.isnan(values[W[0]]):
                nobs -= 1
            W.pop(0)

        # Save output based on index in input value array
        if Q and curr_win_size > 0 and nobs >= min_periods:
            output[i] = values[Q[0]]
        else:
            if values.dtype.kind != "i":
                output[i] = np.nan
            else:
                na_pos.append(i)

    return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
    is_max: bool,
) -> tuple[np.ndarray, list[int]]:
    N = len(labels)
    nobs = np.zeros(ngroups, dtype=np.int64)
    na_pos = []
    output = np.empty(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]
        if lab < 0:
            continue

        if values.dtype.kind == "i" or not np.isnan(val):
            nobs[lab] += 1
        else:
            # NaN value cannot be a min/max value
            continue

        if nobs[lab] == 1:
            # First element in group, set output equal to this
            output[lab] = val
            continue

        if is_max:
            if val > output[lab]:
                output[lab] = val
        else:
            if val < output[lab]:
                output[lab] = val

    # Set labels that don't satisfy min_periods as np.nan
    for lab, count in enumerate(nobs):
        if count < min_periods:
            na_pos.append(lab)

    return output, na_pos
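sliding_min_max keeps a monotonic queue of candidate indices (a plain list, since numba does not support deque yet). A pure-Python sketch of the same idea for a fixed window, included only to illustrate the data structure:

    from collections import deque

    def rolling_max(values, window):
        q = deque()  # indices whose values are kept in decreasing order
        out = []
        for i, v in enumerate(values):
            while q and values[q[-1]] <= v:
                q.pop()  # v dominates older, smaller candidates
            q.append(i)
            if q[0] <= i - window:
                q.popleft()  # drop candidates that slid out of the window
            out.append(values[q[0]] if i >= window - 1 else None)
        return out

    print(rolling_max([3, 1, 4, 1, 5, 9, 2], 3))  # [None, None, 4, 4, 5, 9, 9]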
lib/python3.13/site-packages/pandas/core/_numba/kernels/shared.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numba

if TYPE_CHECKING:
    import numpy as np


@numba.jit(
    # error: Any? not callable
    numba.boolean(numba.int64[:]),  # type: ignore[misc]
    nopython=True,
    nogil=True,
    parallel=False,
)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
    """Check if int64 values are monotonically increasing."""
    n = len(bounds)
    if n < 2:
        return True
    prev = bounds[0]
    for i in range(1, n):
        cur = bounds[i]
        if cur < prev:
            return False
        prev = cur
    return True
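A quick usage note (assuming numba is installed): the eager signature above means callers must pass an int64 array, which is how the window bounds are produced elsewhere in pandas.

    import numpy as np
    from pandas.core._numba.kernels.shared import is_monotonic_increasing

    print(is_monotonic_increasing(np.array([0, 2, 2, 5], dtype=np.int64)))  # True
    print(is_monotonic_increasing(np.array([0, 3, 1], dtype=np.int64)))     # False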
lib/python3.13/site-packages/pandas/core/_numba/kernels/sum_.py (new file, 244 lines)
@@ -0,0 +1,244 @@
"""
Numba 1D sum kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numba
from numba.extending import register_jitable
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt

from pandas.core._numba.kernels.shared import is_monotonic_increasing


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
    val: Any,
    nobs: int,
    sum_x: Any,
    compensation: Any,
    num_consecutive_same_value: int,
    prev_value: Any,
) -> tuple[int, Any, Any, int, Any]:
    if not np.isnan(val):
        nobs += 1
        y = val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t

        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

    return nobs, sum_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_sum(
    val: Any, nobs: int, sum_x: Any, compensation: Any
) -> tuple[int, Any, Any]:
    if not np.isnan(val):
        nobs -= 1
        y = -val - compensation
        t = sum_x + y
        compensation = t - sum_x - y
        sum_x = t
    return nobs, sum_x, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    dtype = values.dtype

    na_val: object = np.nan
    if dtype.kind == "i":
        na_val = 0

    N = len(start)
    nobs = 0
    sum_x = 0
    compensation_add = 0
    compensation_remove = 0
    na_pos = []

    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_sum(
                    val,
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, sum_x, compensation_remove = remove_sum(
                    val, nobs, sum_x, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_sum(
                    val,
                    nobs,
                    sum_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )

        if nobs == 0 == min_periods:
            result: object = 0
        elif nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = na_val
            if dtype.kind == "i":
                na_pos.append(i)

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            sum_x = 0
            compensation_remove = 0

    return output, na_pos


# Mypy/pyright don't like the fact that the decorator is untyped
@register_jitable  # type: ignore[misc]
def grouped_kahan_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
) -> tuple[
    np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
    N = len(labels)

    nobs_arr = np.zeros(ngroups, dtype=np.int64)
    comp_arr = np.zeros(ngroups, dtype=values.dtype)
    consecutive_counts = np.zeros(ngroups, dtype=np.int64)
    prev_vals = np.zeros(ngroups, dtype=values.dtype)
    output = np.zeros(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]

        if lab < 0:
            continue

        sum_x = output[lab]
        nobs = nobs_arr[lab]
        compensation_add = comp_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]

        (
            nobs,
            sum_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        ) = add_sum(
            val,
            nobs,
            sum_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        )

        output[lab] = sum_x
        consecutive_counts[lab] = num_consecutive_same_value
        prev_vals[lab] = prev_value
        comp_arr[lab] = compensation_add
        nobs_arr[lab] = nobs
    return output, nobs_arr, comp_arr, consecutive_counts, prev_vals


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_sum(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
) -> tuple[np.ndarray, list[int]]:
    na_pos = []

    output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
        values, result_dtype, labels, ngroups
    )

    # Post-processing, replace sums that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]
        sum_x = output[lab]
        if nobs >= min_periods:
            if num_consecutive_same_value >= nobs:
                result = prev_value * nobs
            else:
                result = sum_x
        else:
            result = sum_x  # Don't change val, will be replaced by nan later
            na_pos.append(lab)
        output[lab] = result

    return output, na_pos
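add_sum/remove_sum implement Kahan (compensated) summation: the compensation term carries the low-order bits a plain running sum would drop. A standalone sketch of the same update rule, with made-up data chosen to show the difference:

    def kahan_sum(values):
        total = 0.0
        compensation = 0.0
        for val in values:
            y = val - compensation           # re-inject the error from the last step
            t = total + y
            compensation = (t - total) - y   # low-order bits lost when forming t
            total = t
        return total

    vals = [1e16] + [1.0] * 1000 + [-1e16]
    print(sum(vals), kahan_sum(vals))  # 0.0 vs 1000.0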
lib/python3.13/site-packages/pandas/core/_numba/kernels/var_.py (new file, 245 lines)
@@ -0,0 +1,245 @@
"""
Numba 1D var kernels that can be shared by
* Dataframe / Series
* groupby
* rolling / expanding

Mirrors pandas/_libs/window/aggregation.pyx
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numba
import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt

from pandas.core._numba.kernels.shared import is_monotonic_increasing


@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
    val: float,
    nobs: int,
    mean_x: float,
    ssqdm_x: float,
    compensation: float,
    num_consecutive_same_value: int,
    prev_value: float,
) -> tuple[int, float, float, float, int, float]:
    if not np.isnan(val):
        if val == prev_value:
            num_consecutive_same_value += 1
        else:
            num_consecutive_same_value = 1
        prev_value = val

        nobs += 1
        prev_mean = mean_x - compensation
        y = val - compensation
        t = y - mean_x
        compensation = t + mean_x - y
        delta = t
        if nobs:
            mean_x += delta / nobs
        else:
            mean_x = 0
        ssqdm_x += (val - prev_mean) * (val - mean_x)
    return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_var(
    val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
    if not np.isnan(val):
        nobs -= 1
        if nobs:
            prev_mean = mean_x - compensation
            y = val - compensation
            t = y - mean_x
            compensation = t + mean_x - y
            delta = t
            mean_x -= delta / nobs
            ssqdm_x -= (val - prev_mean) * (val - mean_x)
        else:
            mean_x = 0
            ssqdm_x = 0
    return nobs, mean_x, ssqdm_x, compensation


@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_var(
    values: np.ndarray,
    result_dtype: np.dtype,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
    ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
    N = len(start)
    nobs = 0
    mean_x = 0.0
    ssqdm_x = 0.0
    compensation_add = 0.0
    compensation_remove = 0.0

    min_periods = max(min_periods, 1)
    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=result_dtype)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            prev_value = values[s]
            num_consecutive_same_value = 0

            for j in range(s, e):
                val = values[j]
                (
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_var(
                    val,
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )
        else:
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
                    val, nobs, mean_x, ssqdm_x, compensation_remove
                )

            for j in range(end[i - 1], e):
                val = values[j]
                (
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                ) = add_var(
                    val,
                    nobs,
                    mean_x,
                    ssqdm_x,
                    compensation_add,
                    num_consecutive_same_value,
                    prev_value,
                )

        if nobs >= min_periods and nobs > ddof:
            if nobs == 1 or num_consecutive_same_value >= nobs:
                result = 0.0
            else:
                result = ssqdm_x / (nobs - ddof)
        else:
            result = np.nan

        output[i] = result

        if not is_monotonic_increasing_bounds:
            nobs = 0
            mean_x = 0.0
            ssqdm_x = 0.0
            compensation_remove = 0.0

    # na_position is empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_var(
    values: np.ndarray,
    result_dtype: np.dtype,
    labels: npt.NDArray[np.intp],
    ngroups: int,
    min_periods: int,
    ddof: int = 1,
) -> tuple[np.ndarray, list[int]]:
    N = len(labels)

    nobs_arr = np.zeros(ngroups, dtype=np.int64)
    comp_arr = np.zeros(ngroups, dtype=values.dtype)
    consecutive_counts = np.zeros(ngroups, dtype=np.int64)
    prev_vals = np.zeros(ngroups, dtype=values.dtype)
    output = np.zeros(ngroups, dtype=result_dtype)
    means = np.zeros(ngroups, dtype=result_dtype)

    for i in range(N):
        lab = labels[i]
        val = values[i]

        if lab < 0:
            continue

        mean_x = means[lab]
        ssqdm_x = output[lab]
        nobs = nobs_arr[lab]
        compensation_add = comp_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        prev_value = prev_vals[lab]

        (
            nobs,
            mean_x,
            ssqdm_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        ) = add_var(
            val,
            nobs,
            mean_x,
            ssqdm_x,
            compensation_add,
            num_consecutive_same_value,
            prev_value,
        )

        output[lab] = ssqdm_x
        means[lab] = mean_x
        consecutive_counts[lab] = num_consecutive_same_value
        prev_vals[lab] = prev_value
        comp_arr[lab] = compensation_add
        nobs_arr[lab] = nobs

    # Post-processing, replace vars that don't satisfy min_periods
    for lab in range(ngroups):
        nobs = nobs_arr[lab]
        num_consecutive_same_value = consecutive_counts[lab]
        ssqdm_x = output[lab]
        if nobs >= min_periods and nobs > ddof:
            if nobs == 1 or num_consecutive_same_value >= nobs:
                result = 0.0
            else:
                result = ssqdm_x / (nobs - ddof)
        else:
            result = np.nan
        output[lab] = result

    # Second pass to get the std.dev
    # na_position is empty list since float64 can already hold nans
    # Do list comprehension, since numba cannot figure out that na_pos is
    # empty list of ints on its own
    na_pos = [0 for i in range(0)]
    return output, na_pos
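add_var/remove_var are a compensated Welford update: the running mean and the sum of squared differences from the mean (ssqdm_x) are maintained incrementally, and the variance is ssqdm_x / (nobs - ddof). A small sketch of the uncompensated form checked against NumPy (illustrative only):

    import numpy as np

    def welford_var(values, ddof=1):
        nobs, mean_x, ssqdm_x = 0, 0.0, 0.0
        for val in values:
            nobs += 1
            delta = val - mean_x
            mean_x += delta / nobs
            ssqdm_x += delta * (val - mean_x)  # uses the old and the new mean
        return ssqdm_x / (nobs - ddof)

    data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
    print(welford_var(data), np.var(data, ddof=1))  # both ~4.571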
lib/python3.13/site-packages/pandas/core/accessor.py (new file, 340 lines)
@ -0,0 +1,340 @@
|
||||
"""
|
||||
|
||||
accessor.py contains base classes for implementing accessor properties
|
||||
that can be mixed into or pinned onto other pandas classes.
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
Callable,
|
||||
final,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
|
||||
class DirNamesMixin:
|
||||
_accessors: set[str] = set()
|
||||
_hidden_attrs: frozenset[str] = frozenset()
|
||||
|
||||
@final
|
||||
def _dir_deletions(self) -> set[str]:
|
||||
"""
|
||||
Delete unwanted __dir__ for this object.
|
||||
"""
|
||||
return self._accessors | self._hidden_attrs
|
||||
|
||||
def _dir_additions(self) -> set[str]:
|
||||
"""
|
||||
Add additional __dir__ for this object.
|
||||
"""
|
||||
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
|
||||
|
||||
def __dir__(self) -> list[str]:
|
||||
"""
|
||||
Provide method name lookup and completion.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Only provide 'public' methods.
|
||||
"""
|
||||
rv = set(super().__dir__())
|
||||
rv = (rv - self._dir_deletions()) | self._dir_additions()
|
||||
return sorted(rv)
|
||||
|
||||
|
||||
class PandasDelegate:
|
||||
"""
|
||||
Abstract base class for delegating methods/properties.
|
||||
"""
|
||||
|
||||
def _delegate_property_get(self, name: str, *args, **kwargs):
|
||||
raise TypeError(f"You cannot access the property {name}")
|
||||
|
||||
def _delegate_property_set(self, name: str, value, *args, **kwargs):
|
||||
raise TypeError(f"The property {name} cannot be set")
|
||||
|
||||
def _delegate_method(self, name: str, *args, **kwargs):
|
||||
raise TypeError(f"You cannot call method {name}")
|
||||
|
||||
@classmethod
|
||||
def _add_delegate_accessors(
|
||||
cls,
|
||||
delegate,
|
||||
accessors: list[str],
|
||||
typ: str,
|
||||
overwrite: bool = False,
|
||||
accessor_mapping: Callable[[str], str] = lambda x: x,
|
||||
raise_on_missing: bool = True,
|
||||
) -> None:
|
||||
"""
|
||||
Add accessors to cls from the delegate class.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cls
|
||||
Class to add the methods/properties to.
|
||||
delegate
|
||||
Class to get methods/properties and doc-strings.
|
||||
accessors : list of str
|
||||
List of accessors to add.
|
||||
typ : {'property', 'method'}
|
||||
overwrite : bool, default False
|
||||
Overwrite the method/property in the target class if it exists.
|
||||
accessor_mapping: Callable, default lambda x: x
|
||||
Callable to map the delegate's function to the cls' function.
|
||||
raise_on_missing: bool, default True
|
||||
Raise if an accessor does not exist on delegate.
|
||||
False skips the missing accessor.
|
||||
"""
|
||||
|
||||
def _create_delegator_property(name: str):
|
||||
def _getter(self):
|
||||
return self._delegate_property_get(name)
|
||||
|
||||
def _setter(self, new_values):
|
||||
return self._delegate_property_set(name, new_values)
|
||||
|
||||
_getter.__name__ = name
|
||||
_setter.__name__ = name
|
||||
|
||||
return property(
|
||||
fget=_getter,
|
||||
fset=_setter,
|
||||
doc=getattr(delegate, accessor_mapping(name)).__doc__,
|
||||
)
|
||||
|
||||
def _create_delegator_method(name: str):
|
||||
def f(self, *args, **kwargs):
|
||||
return self._delegate_method(name, *args, **kwargs)
|
||||
|
||||
f.__name__ = name
|
||||
f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
|
||||
|
||||
return f
|
||||
|
||||
for name in accessors:
|
||||
if (
|
||||
not raise_on_missing
|
||||
and getattr(delegate, accessor_mapping(name), None) is None
|
||||
):
|
||||
continue
|
||||
|
||||
if typ == "property":
|
||||
f = _create_delegator_property(name)
|
||||
else:
|
||||
f = _create_delegator_method(name)
|
||||
|
||||
# don't overwrite existing methods/properties
|
||||
if overwrite or not hasattr(cls, name):
|
||||
setattr(cls, name, f)
|
||||
|
||||
|
||||
def delegate_names(
    delegate,
    accessors: list[str],
    typ: str,
    overwrite: bool = False,
    accessor_mapping: Callable[[str], str] = lambda x: x,
    raise_on_missing: bool = True,
):
    """
    Add delegated names to a class using a class decorator. This provides
    an alternative usage to directly calling `_add_delegate_accessors`
    below a class definition.

    Parameters
    ----------
    delegate : object
        The class to get methods/properties & doc-strings.
    accessors : Sequence[str]
        List of accessors to add.
    typ : {'property', 'method'}
    overwrite : bool, default False
        Overwrite the method/property in the target class if it exists.
    accessor_mapping: Callable, default lambda x: x
        Callable to map the delegate's function to the cls' function.
    raise_on_missing: bool, default True
        Raise if an accessor does not exist on delegate.
        False skips the missing accessor.

    Returns
    -------
    callable
        A class decorator.

    Examples
    --------
    @delegate_names(Categorical, ["categories", "ordered"], "property")
    class CategoricalAccessor(PandasDelegate):
        [...]
    """

    def add_delegate_accessors(cls):
        cls._add_delegate_accessors(
            delegate,
            accessors,
            typ,
            overwrite=overwrite,
            accessor_mapping=accessor_mapping,
            raise_on_missing=raise_on_missing,
        )
        return cls

    return add_delegate_accessors

# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning


class CachedAccessor:
    """
    Custom property-like object.

    A descriptor for caching accessors.

    Parameters
    ----------
    name : str
        Namespace that will be accessed under, e.g. ``df.foo``.
    accessor : cls
        Class with the extension methods.

    Notes
    -----
    For accessor, the class's __init__ method assumes that one of
    ``Series``, ``DataFrame`` or ``Index`` is passed as the
    single argument ``data``.
    """

    def __init__(self, name: str, accessor) -> None:
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        if obj is None:
            # we're accessing the attribute of the class, i.e., Dataset.geo
            return self._accessor
        accessor_obj = self._accessor(obj)
        # Replace the property with the accessor object. Inspired by:
        # https://www.pydanny.com/cached-property.html
        # We need to use object.__setattr__ because we overwrite __setattr__ on
        # NDFrame
        object.__setattr__(obj, self._name, accessor_obj)
        return accessor_obj

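A small sketch of the caching behaviour of this descriptor: the accessor instance is built on the first attribute access and then stored on the instance, so later lookups bypass `__get__` entirely. The `Container` and `Stats` classes are hypothetical, and the sketch assumes the `CachedAccessor` descriptor defined above.

# Hypothetical sketch; assumes the CachedAccessor descriptor defined above.
class Stats:
    def __init__(self, data) -> None:
        print("building Stats accessor")  # runs only once per Container instance
        self._data = data

    def total(self):
        return sum(self._data.values)


class Container:
    def __init__(self, values) -> None:
        self.values = values


Container.stats = CachedAccessor("stats", Stats)

c = Container([1, 2, 3])
print(c.stats.total())   # prints "building Stats accessor", then 6
print(c.stats.total())   # cached on the instance: no rebuild, just 6
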
@doc(klass="", others="")
def _register_accessor(name: str, cls):
    """
    Register a custom accessor on {klass} objects.

    Parameters
    ----------
    name : str
        Name under which the accessor should be registered. A warning is issued
        if this name conflicts with a preexisting attribute.

    Returns
    -------
    callable
        A class decorator.

    See Also
    --------
    register_dataframe_accessor : Register a custom accessor on DataFrame objects.
    register_series_accessor : Register a custom accessor on Series objects.
    register_index_accessor : Register a custom accessor on Index objects.

    Notes
    -----
    When accessed, your accessor will be initialized with the pandas object
    the user is interacting with. So the signature must be

    .. code-block:: python

        def __init__(self, pandas_object):  # noqa: E999
            ...

    For consistency with pandas methods, you should raise an ``AttributeError``
    if the data passed to your accessor has an incorrect dtype.

    >>> pd.Series(['a', 'b']).dt
    Traceback (most recent call last):
    ...
    AttributeError: Can only use .dt accessor with datetimelike values

    Examples
    --------
    In your library code::

        import pandas as pd

        @pd.api.extensions.register_dataframe_accessor("geo")
        class GeoAccessor:
            def __init__(self, pandas_obj):
                self._obj = pandas_obj

            @property
            def center(self):
                # return the geographic center point of this DataFrame
                lat = self._obj.latitude
                lon = self._obj.longitude
                return (float(lon.mean()), float(lat.mean()))

            def plot(self):
                # plot this array's data on a map, e.g., using Cartopy
                pass

    Back in an interactive IPython session:

    .. code-block:: ipython

        In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
           ...:                     "latitude": np.linspace(0, 20)}})
        In [2]: ds.geo.center
        Out[2]: (5.0, 10.0)
        In [3]: ds.geo.plot()  # plots data on a map
    """

    def decorator(accessor):
        if hasattr(cls, name):
            warnings.warn(
                f"registration of accessor {repr(accessor)} under name "
                f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
                f"attribute with the same name.",
                UserWarning,
                stacklevel=find_stack_level(),
            )
        setattr(cls, name, CachedAccessor(name, accessor))
        cls._accessors.add(name)
        return accessor

    return decorator


@doc(_register_accessor, klass="DataFrame")
def register_dataframe_accessor(name: str):
    from pandas import DataFrame

    return _register_accessor(name, DataFrame)


@doc(_register_accessor, klass="Series")
def register_series_accessor(name: str):
    from pandas import Series

    return _register_accessor(name, Series)


@doc(_register_accessor, klass="Index")
def register_index_accessor(name: str):
    from pandas import Index

    return _register_accessor(name, Index)
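For completeness, the same registration mechanism works on Series. A short example mirroring the DataFrame one in the docstring above; the accessor name "running" and its contents are made up:

import pandas as pd


@pd.api.extensions.register_series_accessor("running")
class RunningAccessor:
    def __init__(self, pandas_obj: pd.Series) -> None:
        self._obj = pandas_obj

    @property
    def total(self) -> float:
        # total of the wrapped Series
        return float(self._obj.sum())


s = pd.Series([1, 2, 3])
print(s.running.total)  # 6.0
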
1747
lib/python3.13/site-packages/pandas/core/algorithms.py
Normal file
File diff suppressed because it is too large
140
lib/python3.13/site-packages/pandas/core/api.py
Normal file
@ -0,0 +1,140 @@
|
||||
from pandas._libs import (
|
||||
NaT,
|
||||
Period,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
)
|
||||
from pandas._libs.missing import NA
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ArrowDtype,
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
IntervalDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import (
|
||||
isna,
|
||||
isnull,
|
||||
notna,
|
||||
notnull,
|
||||
)
|
||||
|
||||
from pandas.core.algorithms import (
|
||||
factorize,
|
||||
unique,
|
||||
value_counts,
|
||||
)
|
||||
from pandas.core.arrays import Categorical
|
||||
from pandas.core.arrays.boolean import BooleanDtype
|
||||
from pandas.core.arrays.floating import (
|
||||
Float32Dtype,
|
||||
Float64Dtype,
|
||||
)
|
||||
from pandas.core.arrays.integer import (
|
||||
Int8Dtype,
|
||||
Int16Dtype,
|
||||
Int32Dtype,
|
||||
Int64Dtype,
|
||||
UInt8Dtype,
|
||||
UInt16Dtype,
|
||||
UInt32Dtype,
|
||||
UInt64Dtype,
|
||||
)
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.core.construction import array
|
||||
from pandas.core.flags import Flags
|
||||
from pandas.core.groupby import (
|
||||
Grouper,
|
||||
NamedAgg,
|
||||
)
|
||||
from pandas.core.indexes.api import (
|
||||
CategoricalIndex,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
RangeIndex,
|
||||
TimedeltaIndex,
|
||||
)
|
||||
from pandas.core.indexes.datetimes import (
|
||||
bdate_range,
|
||||
date_range,
|
||||
)
|
||||
from pandas.core.indexes.interval import (
|
||||
Interval,
|
||||
interval_range,
|
||||
)
|
||||
from pandas.core.indexes.period import period_range
|
||||
from pandas.core.indexes.timedeltas import timedelta_range
|
||||
from pandas.core.indexing import IndexSlice
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.tools.datetimes import to_datetime
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
from pandas.core.tools.timedeltas import to_timedelta
|
||||
|
||||
from pandas.io.formats.format import set_eng_float_format
|
||||
from pandas.tseries.offsets import DateOffset
|
||||
|
||||
# DataFrame needs to be imported after NamedAgg to avoid a circular import
|
||||
from pandas.core.frame import DataFrame # isort:skip
|
||||
|
||||
__all__ = [
|
||||
"array",
|
||||
"ArrowDtype",
|
||||
"bdate_range",
|
||||
"BooleanDtype",
|
||||
"Categorical",
|
||||
"CategoricalDtype",
|
||||
"CategoricalIndex",
|
||||
"DataFrame",
|
||||
"DateOffset",
|
||||
"date_range",
|
||||
"DatetimeIndex",
|
||||
"DatetimeTZDtype",
|
||||
"factorize",
|
||||
"Flags",
|
||||
"Float32Dtype",
|
||||
"Float64Dtype",
|
||||
"Grouper",
|
||||
"Index",
|
||||
"IndexSlice",
|
||||
"Int16Dtype",
|
||||
"Int32Dtype",
|
||||
"Int64Dtype",
|
||||
"Int8Dtype",
|
||||
"Interval",
|
||||
"IntervalDtype",
|
||||
"IntervalIndex",
|
||||
"interval_range",
|
||||
"isna",
|
||||
"isnull",
|
||||
"MultiIndex",
|
||||
"NA",
|
||||
"NamedAgg",
|
||||
"NaT",
|
||||
"notna",
|
||||
"notnull",
|
||||
"Period",
|
||||
"PeriodDtype",
|
||||
"PeriodIndex",
|
||||
"period_range",
|
||||
"RangeIndex",
|
||||
"Series",
|
||||
"set_eng_float_format",
|
||||
"StringDtype",
|
||||
"Timedelta",
|
||||
"TimedeltaIndex",
|
||||
"timedelta_range",
|
||||
"Timestamp",
|
||||
"to_datetime",
|
||||
"to_numeric",
|
||||
"to_timedelta",
|
||||
"UInt16Dtype",
|
||||
"UInt32Dtype",
|
||||
"UInt64Dtype",
|
||||
"UInt8Dtype",
|
||||
"unique",
|
||||
"value_counts",
|
||||
]
|
2062
lib/python3.13/site-packages/pandas/core/apply.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,9 @@
"""
core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
These should:

- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
- Assume that any list arguments have already been cast to ndarray/EA.
- Not depend on Index, Series, or DataFrame, nor import any of these.
- May dispatch to ExtensionArray methods, but should not import from core.arrays.
"""
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,67 @@
"""
datetimelike_accumulations.py is for accumulations of datetimelike extension arrays.
"""

from __future__ import annotations

from typing import Callable

import numpy as np

from pandas._libs import iNaT

from pandas.core.dtypes.missing import isna


def _cum_func(
    func: Callable,
    values: np.ndarray,
    *,
    skipna: bool = True,
):
    """
    Accumulations for 1D datetimelike arrays.

    Parameters
    ----------
    func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
    values : np.ndarray
        Numpy array with the values (can be of any dtype that supports the
        operation). Values are modified in place.
    skipna : bool, default True
        Whether to skip NA.
    """
    try:
        fill_value = {
            np.maximum.accumulate: np.iinfo(np.int64).min,
            np.cumsum: 0,
            np.minimum.accumulate: np.iinfo(np.int64).max,
        }[func]
    except KeyError:
        raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")

    mask = isna(values)
    y = values.view("i8")
    y[mask] = fill_value

    if not skipna:
        mask = np.maximum.accumulate(mask)

    result = func(y)
    result[mask] = iNaT

    if values.dtype.kind in "mM":
        return result.view(values.dtype.base)
    return result


def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
    return _cum_func(np.cumsum, values, skipna=skipna)


def cummin(values: np.ndarray, *, skipna: bool = True):
    return _cum_func(np.minimum.accumulate, values, skipna=skipna)


def cummax(values: np.ndarray, *, skipna: bool = True):
    return _cum_func(np.maximum.accumulate, values, skipna=skipna)
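A rough sketch of what the helper above does for a datetime64 array with missing values: view as int64, fill the NaT slots with a neutral value, accumulate, and restore NaT afterwards. This is a standalone NumPy re-creation on made-up data, not a call into pandas internals.

import numpy as np

values = np.array(["2021-01-02", "NaT", "2021-01-01"], dtype="datetime64[ns]")
iNaT = np.iinfo(np.int64).min            # sentinel pandas uses for NaT

mask = np.isnat(values)
y = values.view("i8").copy()
y[mask] = np.iinfo(np.int64).min         # neutral fill for a running maximum

result = np.maximum.accumulate(y)        # skipna=True behaviour
result[mask] = iNaT
print(result.view("datetime64[ns]"))
# running maximum: [2021-01-02, NaT, 2021-01-02] (printed with ns precision)
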
@ -0,0 +1,90 @@
"""
masked_accumulations.py is for accumulation algorithms using a mask-based approach
for missing values.
"""

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Callable,
)

import numpy as np

if TYPE_CHECKING:
    from pandas._typing import npt


def _cum_func(
    func: Callable,
    values: np.ndarray,
    mask: npt.NDArray[np.bool_],
    *,
    skipna: bool = True,
):
    """
    Accumulations for 1D masked array.

    We will modify values in place to replace NAs with the appropriate fill value.

    Parameters
    ----------
    func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
    values : np.ndarray
        Numpy array with the values (can be of any dtype that supports the
        operation).
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.
    """
    dtype_info: np.iinfo | np.finfo
    if values.dtype.kind == "f":
        dtype_info = np.finfo(values.dtype.type)
    elif values.dtype.kind in "iu":
        dtype_info = np.iinfo(values.dtype.type)
    elif values.dtype.kind == "b":
        # Max value of bool is 1, but since we are setting into a boolean
        # array, 255 is fine as well. Min value has to be 0 when setting
        # into the boolean array.
        dtype_info = np.iinfo(np.uint8)
    else:
        raise NotImplementedError(
            f"No masked accumulation defined for dtype {values.dtype.type}"
        )
    try:
        fill_value = {
            np.cumprod: 1,
            np.maximum.accumulate: dtype_info.min,
            np.cumsum: 0,
            np.minimum.accumulate: dtype_info.max,
        }[func]
    except KeyError:
        raise NotImplementedError(
            f"No accumulation for {func} implemented on BaseMaskedArray"
        )

    values[mask] = fill_value

    if not skipna:
        mask = np.maximum.accumulate(mask)

    values = func(values)
    return values, mask


def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    return _cum_func(np.cumsum, values, mask, skipna=skipna)


def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    return _cum_func(np.cumprod, values, mask, skipna=skipna)


def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)


def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
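To make the mask-based contract concrete, here is a tiny standalone sketch mirroring the running-sum case of `_cum_func` above. Plain NumPy on made-up inputs, not the pandas-internal entry point.

import numpy as np

values = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, True, False, False])   # True marks a missing value

vals = values.copy()
vals[mask] = 0                     # neutral element for cumsum
out = np.cumsum(vals)

print(out)    # [1 1 4 8] -- the missing slot contributes nothing
print(mask)   # positions flagged missing stay masked downstream

# With skipna=False everything after the first missing value becomes masked:
mask_propagated = np.maximum.accumulate(mask)
print(mask_propagated)  # [False  True  True  True]
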
@ -0,0 +1,197 @@
|
||||
"""
|
||||
masked_reductions.py is for reduction algorithms using a mask-based approach
|
||||
for missing values.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import missing as libmissing
|
||||
|
||||
from pandas.core.nanops import check_below_min_count
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
AxisInt,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def _reductions(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Sum, mean or product for 1D masked array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.sum or np.prod
|
||||
values : np.ndarray
|
||||
Numpy array with the values (can be of any dtype that support the
|
||||
operation).
|
||||
mask : np.ndarray[bool]
|
||||
Boolean numpy array (True values indicate missing values).
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
min_count : int, default 0
|
||||
The required number of valid values to perform the operation. If fewer than
|
||||
``min_count`` non-NA values are present the result will be NA.
|
||||
axis : int, optional, default None
|
||||
"""
|
||||
if not skipna:
|
||||
if mask.any() or check_below_min_count(values.shape, None, min_count):
|
||||
return libmissing.NA
|
||||
else:
|
||||
return func(values, axis=axis, **kwargs)
|
||||
else:
|
||||
if check_below_min_count(values.shape, mask, min_count) and (
|
||||
axis is None or values.ndim == 1
|
||||
):
|
||||
return libmissing.NA
|
||||
|
||||
return func(values, where=~mask, axis=axis, **kwargs)
|
||||
|
||||
|
||||
def sum(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _reductions(
|
||||
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
|
||||
)
|
||||
|
||||
|
||||
def prod(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
min_count: int = 0,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _reductions(
|
||||
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
|
||||
)
|
||||
|
||||
|
||||
def _minmax(
|
||||
func: Callable,
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
"""
|
||||
Reduction for 1D masked array.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : np.min or np.max
|
||||
values : np.ndarray
|
||||
Numpy array with the values (can be of any dtype that support the
|
||||
operation).
|
||||
mask : np.ndarray[bool]
|
||||
Boolean numpy array (True values indicate missing values).
|
||||
skipna : bool, default True
|
||||
Whether to skip NA.
|
||||
axis : int, optional, default None
|
||||
"""
|
||||
if not skipna:
|
||||
if mask.any() or not values.size:
|
||||
# min/max with empty array raise in numpy, pandas returns NA
|
||||
return libmissing.NA
|
||||
else:
|
||||
return func(values, axis=axis)
|
||||
else:
|
||||
subset = values[~mask]
|
||||
if subset.size:
|
||||
return func(subset, axis=axis)
|
||||
else:
|
||||
# min/max with empty array raise in numpy, pandas returns NA
|
||||
return libmissing.NA
|
||||
|
||||
|
||||
def min(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def max(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def mean(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
|
||||
|
||||
|
||||
def var(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
ddof: int = 1,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", RuntimeWarning)
|
||||
return _reductions(
|
||||
np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
|
||||
)
|
||||
|
||||
|
||||
def std(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
*,
|
||||
skipna: bool = True,
|
||||
axis: AxisInt | None = None,
|
||||
ddof: int = 1,
|
||||
):
|
||||
if not values.size or mask.all():
|
||||
return libmissing.NA
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", RuntimeWarning)
|
||||
return _reductions(
|
||||
np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
|
||||
)
|
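A short sketch of the reduction contract above, on made-up inputs with plain NumPy: with skipna=True the masked entries are excluded via `where=~mask`, and a `min_count`-style guard turns an under-populated result into NA.

import numpy as np

values = np.array([1.0, 2.0, 4.0])
mask = np.array([False, True, False])    # True marks a missing value

# skipna=True: reduce only over the unmasked entries
total = np.sum(values, where=~mask)
print(total)                             # 5.0

# min_count-style guard: too few valid values -> treat the result as NA
valid = int((~mask).sum())
result = total if valid >= 3 else None   # None stands in for pd.NA here
print(result)                            # None
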
149
lib/python3.13/site-packages/pandas/core/array_algos/putmask.py
Normal file
@ -0,0 +1,149 @@
|
||||
"""
|
||||
EA-compatible analogue to np.putmask
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas.core.dtypes.cast import infer_dtype_from
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas import MultiIndex
|
||||
|
||||
|
||||
def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
|
||||
"""
|
||||
ExtensionArray-compatible implementation of np.putmask. The main
|
||||
difference is we do not handle repeating or truncating like numpy.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values: np.ndarray or ExtensionArray
|
||||
mask : np.ndarray[bool]
|
||||
We assume extract_bool_array has already been called.
|
||||
value : Any
|
||||
"""
|
||||
|
||||
if (
|
||||
not isinstance(values, np.ndarray)
|
||||
or (values.dtype == object and not lib.is_scalar(value))
|
||||
# GH#43424: np.putmask raises TypeError if we cannot cast between types with
|
||||
# rule = "safe", a stricter guarantee we may not have here
|
||||
or (
|
||||
isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
|
||||
)
|
||||
):
|
||||
# GH#19266 using np.putmask gives unexpected results with listlike value
|
||||
# along with object dtype
|
||||
if is_list_like(value) and len(value) == len(values):
|
||||
values[mask] = value[mask]
|
||||
else:
|
||||
values[mask] = value
|
||||
else:
|
||||
# GH#37833 np.putmask is more performant than __setitem__
|
||||
np.putmask(values, mask, value)
|
||||
|
||||
|
||||
def putmask_without_repeat(
|
||||
values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
|
||||
) -> None:
|
||||
"""
|
||||
np.putmask will truncate or repeat if `new` is a listlike with
|
||||
len(new) != len(values). We require an exact match.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
mask : np.ndarray[bool]
|
||||
new : Any
|
||||
"""
|
||||
if getattr(new, "ndim", 0) >= 1:
|
||||
new = new.astype(values.dtype, copy=False)
|
||||
|
||||
# TODO: this prob needs some better checking for 2D cases
|
||||
nlocs = mask.sum()
|
||||
if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
|
||||
shape = np.shape(new)
|
||||
# np.shape compat for if setitem_datetimelike_compat
|
||||
# changed arraylike to list e.g. test_where_dt64_2d
|
||||
if nlocs == shape[-1]:
|
||||
# GH#30567
|
||||
# If length of ``new`` is less than the length of ``values``,
|
||||
# `np.putmask` would first repeat the ``new`` array and then
|
||||
# assign the masked values hence produces incorrect result.
|
||||
# `np.place` on the other hand uses the ``new`` values as it is
|
||||
# to place in the masked locations of ``values``
|
||||
np.place(values, mask, new)
|
||||
# i.e. values[mask] = new
|
||||
elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
|
||||
np.putmask(values, mask, new)
|
||||
else:
|
||||
raise ValueError("cannot assign mismatch length to masked array")
|
||||
else:
|
||||
np.putmask(values, mask, new)
|
||||
|
||||
|
||||
def validate_putmask(
|
||||
values: ArrayLike | MultiIndex, mask: np.ndarray
|
||||
) -> tuple[npt.NDArray[np.bool_], bool]:
|
||||
"""
|
||||
Validate mask and check if this putmask operation is a no-op.
|
||||
"""
|
||||
mask = extract_bool_array(mask)
|
||||
if mask.shape != values.shape:
|
||||
raise ValueError("putmask: mask and data must be the same size")
|
||||
|
||||
noop = not mask.any()
|
||||
return mask, noop
|
||||
|
||||
|
||||
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
|
||||
"""
|
||||
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
|
||||
"""
|
||||
if isinstance(mask, ExtensionArray):
|
||||
# We could have BooleanArray, Sparse[bool], ...
|
||||
# Except for BooleanArray, this is equivalent to just
|
||||
# np.asarray(mask, dtype=bool)
|
||||
mask = mask.to_numpy(dtype=bool, na_value=False)
|
||||
|
||||
mask = np.asarray(mask, dtype=bool)
|
||||
return mask
|
||||
|
||||
|
||||
def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
num_set : int
|
||||
For putmask, this is mask.sum()
|
||||
other : Any
|
||||
"""
|
||||
if values.dtype == object:
|
||||
dtype, _ = infer_dtype_from(other)
|
||||
|
||||
if lib.is_np_dtype(dtype, "mM"):
|
||||
# https://github.com/numpy/numpy/issues/12550
|
||||
# timedelta64 will incorrectly cast to int
|
||||
if not is_list_like(other):
|
||||
other = [other] * num_set
|
||||
else:
|
||||
other = list(other)
|
||||
|
||||
return other
|
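The distinction the helpers above draw between `np.putmask` and `np.place` can be seen with a tiny standalone example on made-up arrays: `np.putmask` recycles a short replacement array, while `np.place` consumes it element by element, which is the behaviour wanted when `len(new) == mask.sum()`.

import numpy as np

values = np.array([0, 0, 0, 0])
mask = np.array([True, False, True, False])
new = np.array([10, 20])                 # exactly mask.sum() replacements

a = values.copy()
np.putmask(a, mask, new)                 # repeats `new` cyclically over the mask
print(a)                                 # [10  0 10  0]

b = values.copy()
np.place(b, mask, new)                   # uses `new` in order, no repetition
print(b)                                 # [10  0 20  0]
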
226
lib/python3.13/site-packages/pandas/core/array_algos/quantile.py
Normal file
@ -0,0 +1,226 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.missing import (
|
||||
isna,
|
||||
na_value_for_dtype,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Scalar,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def quantile_compat(
|
||||
values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Compute the quantiles of the given values for each quantile in `qs`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray or ExtensionArray
|
||||
qs : np.ndarray[float64]
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.ndarray or ExtensionArray
|
||||
"""
|
||||
if isinstance(values, np.ndarray):
|
||||
fill_value = na_value_for_dtype(values.dtype, compat=False)
|
||||
mask = isna(values)
|
||||
return quantile_with_mask(values, mask, fill_value, qs, interpolation)
|
||||
else:
|
||||
return values._quantile(qs, interpolation)
|
||||
|
||||
|
||||
def quantile_with_mask(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
fill_value,
|
||||
qs: npt.NDArray[np.float64],
|
||||
interpolation: str,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Compute the quantiles of the given values for each quantile in `qs`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray
|
||||
For ExtensionArray, this is _values_for_factorize()[0]
|
||||
mask : np.ndarray[bool]
|
||||
mask = isna(values)
|
||||
For ExtensionArray, this is computed before calling _value_for_factorize
|
||||
fill_value : Scalar
|
||||
The value to fill NA entries with
|
||||
For ExtensionArray, this is _values_for_factorize()[1]
|
||||
qs : np.ndarray[float64]
|
||||
interpolation : str
|
||||
Type of interpolation
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.ndarray
|
||||
|
||||
Notes
|
||||
-----
|
||||
Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
|
||||
has been called on _values_for_factorize()[0]
|
||||
|
||||
Quantile is computed along axis=1.
|
||||
"""
|
||||
assert values.shape == mask.shape
|
||||
if values.ndim == 1:
|
||||
# unsqueeze, operate, re-squeeze
|
||||
values = np.atleast_2d(values)
|
||||
mask = np.atleast_2d(mask)
|
||||
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
|
||||
return res_values[0]
|
||||
|
||||
assert values.ndim == 2
|
||||
|
||||
is_empty = values.shape[1] == 0
|
||||
|
||||
if is_empty:
|
||||
# create the array of na_values
|
||||
# 2d len(values) * len(qs)
|
||||
flat = np.array([fill_value] * len(qs))
|
||||
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
|
||||
else:
|
||||
result = _nanpercentile(
|
||||
values,
|
||||
qs * 100.0,
|
||||
na_value=fill_value,
|
||||
mask=mask,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
|
||||
result = np.asarray(result)
|
||||
result = result.T
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _nanpercentile_1d(
|
||||
values: np.ndarray,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
qs: npt.NDArray[np.float64],
|
||||
na_value: Scalar,
|
||||
interpolation: str,
|
||||
) -> Scalar | np.ndarray:
|
||||
"""
|
||||
Wrapper for np.percentile that skips missing values, specialized to
|
||||
1-dimensional case.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : array over which to find quantiles
|
||||
mask : ndarray[bool]
|
||||
locations in values that should be considered missing
|
||||
qs : np.ndarray[float64] of quantile indices to find
|
||||
na_value : scalar
|
||||
value to return for empty or all-null values
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
quantiles : scalar or array
|
||||
"""
|
||||
# mask is Union[ExtensionArray, ndarray]
|
||||
values = values[~mask]
|
||||
|
||||
if len(values) == 0:
|
||||
# Can't pass dtype=values.dtype here bc we might have na_value=np.nan
|
||||
# with values.dtype=int64 see test_quantile_empty
|
||||
# equiv: 'np.array([na_value] * len(qs))' but much faster
|
||||
return np.full(len(qs), na_value)
|
||||
|
||||
return np.percentile(
|
||||
values,
|
||||
qs,
|
||||
# error: No overload variant of "percentile" matches argument
|
||||
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
|
||||
# , "Dict[str, str]" [call-overload]
|
||||
method=interpolation, # type: ignore[call-overload]
|
||||
)
|
||||
|
||||
|
||||
def _nanpercentile(
|
||||
values: np.ndarray,
|
||||
qs: npt.NDArray[np.float64],
|
||||
*,
|
||||
na_value,
|
||||
mask: npt.NDArray[np.bool_],
|
||||
interpolation: str,
|
||||
):
|
||||
"""
|
||||
Wrapper for np.percentile that skips missing values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : np.ndarray[ndim=2] over which to find quantiles
|
||||
qs : np.ndarray[float64] of quantile indices to find
|
||||
na_value : scalar
|
||||
value to return for empty or all-null values
|
||||
mask : np.ndarray[bool]
|
||||
locations in values that should be considered missing
|
||||
interpolation : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
quantiles : scalar or array
|
||||
"""
|
||||
|
||||
if values.dtype.kind in "mM":
|
||||
# need to cast to integer to avoid rounding errors in numpy
|
||||
result = _nanpercentile(
|
||||
values.view("i8"),
|
||||
qs=qs,
|
||||
na_value=na_value.view("i8"),
|
||||
mask=mask,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
|
||||
# Note: we have to do `astype` and not view because in general we
|
||||
# have float result at this point, not i8
|
||||
return result.astype(values.dtype)
|
||||
|
||||
if mask.any():
|
||||
# Caller is responsible for ensuring mask shape match
|
||||
assert mask.shape == values.shape
|
||||
result = [
|
||||
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
|
||||
for (val, m) in zip(list(values), list(mask))
|
||||
]
|
||||
if values.dtype.kind == "f":
|
||||
# preserve itemsize
|
||||
result = np.asarray(result, dtype=values.dtype).T
|
||||
else:
|
||||
result = np.asarray(result).T
|
||||
if (
|
||||
result.dtype != values.dtype
|
||||
and not mask.all()
|
||||
and (result == result.astype(values.dtype, copy=False)).all()
|
||||
):
|
||||
# mask.all() will never get cast back to int
|
||||
# e.g. values is integer dtype and result is floating dtype,
|
||||
# only cast back to integer dtype if result values are all-integer.
|
||||
result = result.astype(values.dtype, copy=False)
|
||||
return result
|
||||
else:
|
||||
return np.percentile(
|
||||
values,
|
||||
qs,
|
||||
axis=1,
|
||||
# error: No overload variant of "percentile" matches argument types
|
||||
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
|
||||
# "int", "Dict[str, str]" [call-overload]
|
||||
method=interpolation, # type: ignore[call-overload]
|
||||
)
|
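A compact sketch of the masked-quantile idea used above, on made-up data with plain NumPy: drop the masked entries of each row, then take the percentiles of what is left.

import numpy as np

values = np.array([[1.0, 2.0, 3.0, 4.0]])
mask = np.array([[False, True, False, False]])   # True marks a missing value
qs = np.array([0.5, 1.0])

row = values[0][~mask[0]]                        # [1.0, 3.0, 4.0]
print(np.percentile(row, qs * 100.0, method="linear"))
# [3. 4.] -- median and maximum of the unmasked entries
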
152
lib/python3.13/site-packages/pandas/core/array_algos/replace.py
Normal file
@ -0,0 +1,152 @@
|
||||
"""
|
||||
Methods used by Block.replace and related methods.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
import re
|
||||
from re import Pattern
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool,
|
||||
is_re,
|
||||
is_re_compilable,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Scalar,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def should_use_regex(regex: bool, to_replace: Any) -> bool:
|
||||
"""
|
||||
Decide whether to treat `to_replace` as a regular expression.
|
||||
"""
|
||||
if is_re(to_replace):
|
||||
regex = True
|
||||
|
||||
regex = regex and is_re_compilable(to_replace)
|
||||
|
||||
# Don't use regex if the pattern is empty.
|
||||
regex = regex and re.compile(to_replace).pattern != ""
|
||||
return regex
|
||||
|
||||
|
||||
def compare_or_regex_search(
|
||||
a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Compare two array-like inputs of the same shape or two scalar values
|
||||
|
||||
Calls operator.eq or re.search, depending on regex argument. If regex is
|
||||
True, perform an element-wise regex matching.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : array-like
|
||||
b : scalar or regex pattern
|
||||
regex : bool
|
||||
mask : np.ndarray[bool]
|
||||
|
||||
Returns
|
||||
-------
|
||||
mask : array-like of bool
|
||||
"""
|
||||
if isna(b):
|
||||
return ~mask
|
||||
|
||||
def _check_comparison_types(
|
||||
result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
|
||||
):
|
||||
"""
|
||||
Raises an error if the two arrays (a,b) cannot be compared.
|
||||
Otherwise, returns the comparison result as expected.
|
||||
"""
|
||||
if is_bool(result) and isinstance(a, np.ndarray):
|
||||
type_names = [type(a).__name__, type(b).__name__]
|
||||
|
||||
type_names[0] = f"ndarray(dtype={a.dtype})"
|
||||
|
||||
raise TypeError(
|
||||
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
|
||||
)
|
||||
|
||||
if not regex or not should_use_regex(regex, b):
|
||||
# TODO: should use missing.mask_missing?
|
||||
op = lambda x: operator.eq(x, b)
|
||||
else:
|
||||
op = np.vectorize(
|
||||
lambda x: bool(re.search(b, x))
|
||||
if isinstance(x, str) and isinstance(b, (str, Pattern))
|
||||
else False
|
||||
)
|
||||
|
||||
# GH#32621 use mask to avoid comparing to NAs
|
||||
if isinstance(a, np.ndarray):
|
||||
a = a[mask]
|
||||
|
||||
result = op(a)
|
||||
|
||||
if isinstance(result, np.ndarray) and mask is not None:
|
||||
# The shape of the mask can differ to that of the result
|
||||
# since we may compare only a subset of a's or b's elements
|
||||
tmp = np.zeros(mask.shape, dtype=np.bool_)
|
||||
np.place(tmp, mask, result)
|
||||
result = tmp
|
||||
|
||||
_check_comparison_types(result, a, b)
|
||||
return result
|
||||
|
||||
|
||||
def replace_regex(
|
||||
values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
|
||||
) -> None:
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
values : ArrayLike
|
||||
Object dtype.
|
||||
rx : re.Pattern
|
||||
value : Any
|
||||
mask : np.ndarray[bool], optional
|
||||
|
||||
Notes
|
||||
-----
|
||||
Alters values in-place.
|
||||
"""
|
||||
|
||||
# deal with replacing values with objects (strings) that match but
|
||||
# whose replacement is not a string (numeric, nan, object)
|
||||
if isna(value) or not isinstance(value, str):
|
||||
|
||||
def re_replacer(s):
|
||||
if is_re(rx) and isinstance(s, str):
|
||||
return value if rx.search(s) is not None else s
|
||||
else:
|
||||
return s
|
||||
|
||||
else:
|
||||
# value is guaranteed to be a string here, s can be either a string
|
||||
# or null if it's null it gets returned
|
||||
def re_replacer(s):
|
||||
if is_re(rx) and isinstance(s, str):
|
||||
return rx.sub(value, s)
|
||||
else:
|
||||
return s
|
||||
|
||||
f = np.vectorize(re_replacer, otypes=[np.object_])
|
||||
|
||||
if mask is None:
|
||||
values[:] = f(values)
|
||||
else:
|
||||
values[mask] = f(values[mask])
|
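A quick standalone sketch of the element-wise regex replacement performed above, mirroring `re_replacer` for a string replacement value; the object array and pattern are made up.

import re

import numpy as np

values = np.array(["foo-1", "bar-2", None], dtype=object)
rx = re.compile(r"-\d+")

def re_replacer(s):
    # only strings are touched; None/NaN pass through unchanged
    if isinstance(s, str):
        return rx.sub("", s)
    return s

f = np.vectorize(re_replacer, otypes=[np.object_])
values[:] = f(values)       # in-place, as in replace_regex above
print(values)               # ['foo' 'bar' None]
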
594
lib/python3.13/site-packages/pandas/core/array_algos/take.py
Normal file
@ -0,0 +1,594 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
algos as libalgos,
|
||||
lib,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.cast import maybe_promote
|
||||
from pandas.core.dtypes.common import (
|
||||
ensure_platform_int,
|
||||
is_1d_only_ea_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import na_value_for_dtype
|
||||
|
||||
from pandas.core.construction import ensure_wrapped_if_datetimelike
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
AxisInt,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
|
||||
from pandas.core.arrays.base import ExtensionArray
|
||||
|
||||
|
||||
@overload
|
||||
def take_nd(
|
||||
arr: np.ndarray,
|
||||
indexer,
|
||||
axis: AxisInt = ...,
|
||||
fill_value=...,
|
||||
allow_fill: bool = ...,
|
||||
) -> np.ndarray:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def take_nd(
|
||||
arr: ExtensionArray,
|
||||
indexer,
|
||||
axis: AxisInt = ...,
|
||||
fill_value=...,
|
||||
allow_fill: bool = ...,
|
||||
) -> ArrayLike:
|
||||
...
|
||||
|
||||
|
||||
def take_nd(
|
||||
arr: ArrayLike,
|
||||
indexer,
|
||||
axis: AxisInt = 0,
|
||||
fill_value=lib.no_default,
|
||||
allow_fill: bool = True,
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Specialized Cython take which sets NaN values in one pass
|
||||
|
||||
This dispatches to ``take`` defined on ExtensionArrays.
|
||||
|
||||
Note: this function assumes that the indexer is a valid(ated) indexer with
|
||||
no out of bound indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : np.ndarray or ExtensionArray
|
||||
Input array.
|
||||
indexer : ndarray
|
||||
1-D array of indices to take, subarrays corresponding to -1 value
|
||||
indices are filled with fill_value
|
||||
axis : int, default 0
|
||||
Axis to take from
|
||||
fill_value : any, default np.nan
|
||||
Fill value to replace -1 values with
|
||||
allow_fill : bool, default True
|
||||
If False, indexer is assumed to contain no -1 values so no filling
|
||||
will be done. This short-circuits computation of a mask. Result is
|
||||
undefined if allow_fill == False and -1 is present in indexer.
|
||||
|
||||
Returns
|
||||
-------
|
||||
subarray : np.ndarray or ExtensionArray
|
||||
May be the same type as the input, or cast to an ndarray.
|
||||
"""
|
||||
if fill_value is lib.no_default:
|
||||
fill_value = na_value_for_dtype(arr.dtype, compat=False)
|
||||
elif lib.is_np_dtype(arr.dtype, "mM"):
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if arr.dtype != dtype:
|
||||
# EA.take is strict about returning a new object of the same type
|
||||
# so for that case cast upfront
|
||||
arr = arr.astype(dtype)
|
||||
|
||||
if not isinstance(arr, np.ndarray):
|
||||
# i.e. ExtensionArray,
|
||||
# includes for EA to catch DatetimeArray, TimedeltaArray
|
||||
if not is_1d_only_ea_dtype(arr.dtype):
|
||||
# i.e. DatetimeArray, TimedeltaArray
|
||||
arr = cast("NDArrayBackedExtensionArray", arr)
|
||||
return arr.take(
|
||||
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
|
||||
)
|
||||
|
||||
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
|
||||
|
||||
arr = np.asarray(arr)
|
||||
return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
|
||||
|
||||
|
||||
def _take_nd_ndarray(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp] | None,
|
||||
axis: AxisInt,
|
||||
fill_value,
|
||||
allow_fill: bool,
|
||||
) -> np.ndarray:
|
||||
if indexer is None:
|
||||
indexer = np.arange(arr.shape[axis], dtype=np.intp)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
else:
|
||||
indexer = ensure_platform_int(indexer)
|
||||
|
||||
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
|
||||
arr, indexer, fill_value, allow_fill
|
||||
)
|
||||
|
||||
flip_order = False
|
||||
if arr.ndim == 2 and arr.flags.f_contiguous:
|
||||
flip_order = True
|
||||
|
||||
if flip_order:
|
||||
arr = arr.T
|
||||
axis = arr.ndim - axis - 1
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out_shape_ = list(arr.shape)
|
||||
out_shape_[axis] = len(indexer)
|
||||
out_shape = tuple(out_shape_)
|
||||
if arr.flags.f_contiguous and axis == arr.ndim - 1:
|
||||
# minor tweak that can make an order-of-magnitude difference
|
||||
# for dataframes initialized directly from 2-d ndarrays
|
||||
# (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
|
||||
# f-contiguous transpose)
|
||||
out = np.empty(out_shape, dtype=dtype, order="F")
|
||||
else:
|
||||
out = np.empty(out_shape, dtype=dtype)
|
||||
|
||||
func = _get_take_nd_function(
|
||||
arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
|
||||
)
|
||||
func(arr, indexer, out, fill_value)
|
||||
|
||||
if flip_order:
|
||||
out = out.T
|
||||
return out
|
||||
|
||||
|
||||
def take_1d(
|
||||
arr: ArrayLike,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
fill_value=None,
|
||||
allow_fill: bool = True,
|
||||
mask: npt.NDArray[np.bool_] | None = None,
|
||||
) -> ArrayLike:
|
||||
"""
|
||||
Specialized version for 1D arrays. Differences compared to `take_nd`:
|
||||
|
||||
- Assumes input array has already been converted to numpy array / EA
|
||||
- Assumes indexer is already guaranteed to be intp dtype ndarray
|
||||
- Only works for 1D arrays
|
||||
|
||||
To ensure the lowest possible overhead.
|
||||
|
||||
Note: similarly to `take_nd`, this function assumes that the indexer is
|
||||
a valid(ated) indexer with no out of bound indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : np.ndarray or ExtensionArray
|
||||
Input array.
|
||||
indexer : ndarray
|
||||
1-D array of indices to take (validated indices, intp dtype).
|
||||
fill_value : any, default np.nan
|
||||
Fill value to replace -1 values with
|
||||
allow_fill : bool, default True
|
||||
If False, indexer is assumed to contain no -1 values so no filling
|
||||
will be done. This short-circuits computation of a mask. Result is
|
||||
undefined if allow_fill == False and -1 is present in indexer.
|
||||
mask : np.ndarray, optional, default None
|
||||
If `allow_fill` is True, and the mask (where indexer == -1) is already
|
||||
known, it can be passed to avoid recomputation.
|
||||
"""
|
||||
if not isinstance(arr, np.ndarray):
|
||||
# ExtensionArray -> dispatch to their method
|
||||
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
|
||||
|
||||
if not allow_fill:
|
||||
return arr.take(indexer)
|
||||
|
||||
dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
|
||||
arr, indexer, fill_value, True, mask
|
||||
)
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out = np.empty(indexer.shape, dtype=dtype)
|
||||
|
||||
func = _get_take_nd_function(
|
||||
arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
|
||||
)
|
||||
func(arr, indexer, out, fill_value)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def take_2d_multi(
|
||||
arr: np.ndarray,
|
||||
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
|
||||
fill_value=np.nan,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Specialized Cython take which sets NaN values in one pass.
|
||||
"""
|
||||
# This is only called from one place in DataFrame._reindex_multi,
|
||||
# so we know indexer is well-behaved.
|
||||
assert indexer is not None
|
||||
assert indexer[0] is not None
|
||||
assert indexer[1] is not None
|
||||
|
||||
row_idx, col_idx = indexer
|
||||
|
||||
row_idx = ensure_platform_int(row_idx)
|
||||
col_idx = ensure_platform_int(col_idx)
|
||||
indexer = row_idx, col_idx
|
||||
mask_info = None
|
||||
|
||||
# check for promotion based on types only (do this first because
|
||||
# it's faster than computing a mask)
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if dtype != arr.dtype:
|
||||
# check if promotion is actually required based on indexer
|
||||
row_mask = row_idx == -1
|
||||
col_mask = col_idx == -1
|
||||
row_needs = row_mask.any()
|
||||
col_needs = col_mask.any()
|
||||
mask_info = (row_mask, col_mask), (row_needs, col_needs)
|
||||
|
||||
if not (row_needs or col_needs):
|
||||
# if not, then depromote, set fill_value to dummy
|
||||
# (it won't be used but we don't want the cython code
|
||||
# to crash when trying to cast it to dtype)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
|
||||
# at this point, it's guaranteed that dtype can hold both the arr values
|
||||
# and the fill_value
|
||||
out_shape = len(row_idx), len(col_idx)
|
||||
out = np.empty(out_shape, dtype=dtype)
|
||||
|
||||
func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
|
||||
if func is None and arr.dtype != out.dtype:
|
||||
func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
|
||||
if func is not None:
|
||||
func = _convert_wrapper(func, out.dtype)
|
||||
|
||||
if func is not None:
|
||||
func(arr, indexer, out=out, fill_value=fill_value)
|
||||
else:
|
||||
# test_reindex_multi
|
||||
_take_2d_multi_object(
|
||||
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
|
||||
)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def _get_take_nd_function_cached(
|
||||
ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
|
||||
):
|
||||
"""
|
||||
Part of _get_take_nd_function below that doesn't need `mask_info` and thus
|
||||
can be cached (mask_info potentially contains a numpy ndarray which is not
|
||||
hashable and thus cannot be used as argument for cached function).
|
||||
"""
|
||||
tup = (arr_dtype.name, out_dtype.name)
|
||||
if ndim == 1:
|
||||
func = _take_1d_dict.get(tup, None)
|
||||
elif ndim == 2:
|
||||
if axis == 0:
|
||||
func = _take_2d_axis0_dict.get(tup, None)
|
||||
else:
|
||||
func = _take_2d_axis1_dict.get(tup, None)
|
||||
if func is not None:
|
||||
return func
|
||||
|
||||
# We get here with string, uint, float16, and complex dtypes that could
|
||||
# potentially be handled in algos_take_helper.
|
||||
# Also a couple with (M8[ns], object) and (m8[ns], object)
|
||||
tup = (out_dtype.name, out_dtype.name)
|
||||
if ndim == 1:
|
||||
func = _take_1d_dict.get(tup, None)
|
||||
elif ndim == 2:
|
||||
if axis == 0:
|
||||
func = _take_2d_axis0_dict.get(tup, None)
|
||||
else:
|
||||
func = _take_2d_axis1_dict.get(tup, None)
|
||||
if func is not None:
|
||||
func = _convert_wrapper(func, out_dtype)
|
||||
return func
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _get_take_nd_function(
|
||||
ndim: int,
|
||||
arr_dtype: np.dtype,
|
||||
out_dtype: np.dtype,
|
||||
axis: AxisInt = 0,
|
||||
mask_info=None,
|
||||
):
|
||||
"""
|
||||
Get the appropriate "take" implementation for the given dimension, axis
|
||||
and dtypes.
|
||||
"""
|
||||
func = None
|
||||
if ndim <= 2:
|
||||
# for this part we don't need `mask_info` -> use the cached algo lookup
|
||||
func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
|
||||
|
||||
if func is None:
|
||||
|
||||
def func(arr, indexer, out, fill_value=np.nan) -> None:
|
||||
indexer = ensure_platform_int(indexer)
|
||||
_take_nd_object(
|
||||
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
|
||||
)
|
||||
|
||||
return func
|
||||
|
||||
|
||||
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
|
||||
def wrapper(
|
||||
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
|
||||
) -> None:
|
||||
if arr_dtype is not None:
|
||||
arr = arr.view(arr_dtype)
|
||||
if out_dtype is not None:
|
||||
out = out.view(out_dtype)
|
||||
if fill_wrap is not None:
|
||||
# FIXME: if we get here with dt64/td64 we need to be sure we have
|
||||
# matching resos
|
||||
if fill_value.dtype.kind == "m":
|
||||
fill_value = fill_value.astype("m8[ns]")
|
||||
else:
|
||||
fill_value = fill_value.astype("M8[ns]")
|
||||
fill_value = fill_wrap(fill_value)
|
||||
|
||||
f(arr, indexer, out, fill_value=fill_value)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def _convert_wrapper(f, conv_dtype):
|
||||
def wrapper(
|
||||
arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
|
||||
) -> None:
|
||||
if conv_dtype == object:
|
||||
# GH#39755 avoid casting dt64/td64 to integers
|
||||
arr = ensure_wrapped_if_datetimelike(arr)
|
||||
arr = arr.astype(conv_dtype)
|
||||
f(arr, indexer, out, fill_value=fill_value)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
_take_1d_dict = {
|
||||
("int8", "int8"): libalgos.take_1d_int8_int8,
|
||||
("int8", "int32"): libalgos.take_1d_int8_int32,
|
||||
("int8", "int64"): libalgos.take_1d_int8_int64,
|
||||
("int8", "float64"): libalgos.take_1d_int8_float64,
|
||||
("int16", "int16"): libalgos.take_1d_int16_int16,
|
||||
("int16", "int32"): libalgos.take_1d_int16_int32,
|
||||
("int16", "int64"): libalgos.take_1d_int16_int64,
|
||||
("int16", "float64"): libalgos.take_1d_int16_float64,
|
||||
("int32", "int32"): libalgos.take_1d_int32_int32,
|
||||
("int32", "int64"): libalgos.take_1d_int32_int64,
|
||||
("int32", "float64"): libalgos.take_1d_int32_float64,
|
||||
("int64", "int64"): libalgos.take_1d_int64_int64,
|
||||
("int64", "float64"): libalgos.take_1d_int64_float64,
|
||||
("float32", "float32"): libalgos.take_1d_float32_float32,
|
||||
("float32", "float64"): libalgos.take_1d_float32_float64,
|
||||
("float64", "float64"): libalgos.take_1d_float64_float64,
|
||||
("object", "object"): libalgos.take_1d_object_object,
|
||||
("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
|
||||
("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_axis0_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_axis0_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_axis1_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_axis1_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
_take_2d_multi_dict = {
|
||||
("int8", "int8"): libalgos.take_2d_multi_int8_int8,
|
||||
("int8", "int32"): libalgos.take_2d_multi_int8_int32,
|
||||
("int8", "int64"): libalgos.take_2d_multi_int8_int64,
|
||||
("int8", "float64"): libalgos.take_2d_multi_int8_float64,
|
||||
("int16", "int16"): libalgos.take_2d_multi_int16_int16,
|
||||
("int16", "int32"): libalgos.take_2d_multi_int16_int32,
|
||||
("int16", "int64"): libalgos.take_2d_multi_int16_int64,
|
||||
("int16", "float64"): libalgos.take_2d_multi_int16_float64,
|
||||
("int32", "int32"): libalgos.take_2d_multi_int32_int32,
|
||||
("int32", "int64"): libalgos.take_2d_multi_int32_int64,
|
||||
("int32", "float64"): libalgos.take_2d_multi_int32_float64,
|
||||
("int64", "int64"): libalgos.take_2d_multi_int64_int64,
|
||||
("int64", "float64"): libalgos.take_2d_multi_int64_float64,
|
||||
("float32", "float32"): libalgos.take_2d_multi_float32_float32,
|
||||
("float32", "float64"): libalgos.take_2d_multi_float32_float64,
|
||||
("float64", "float64"): libalgos.take_2d_multi_float64_float64,
|
||||
("object", "object"): libalgos.take_2d_multi_object_object,
|
||||
("bool", "bool"): _view_wrapper(
|
||||
libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
|
||||
),
|
||||
("bool", "object"): _view_wrapper(
|
||||
libalgos.take_2d_multi_bool_object, np.uint8, None
|
||||
),
|
||||
("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
|
||||
libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _take_nd_object(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
out: np.ndarray,
|
||||
axis: AxisInt,
|
||||
fill_value,
|
||||
mask_info,
|
||||
) -> None:
|
||||
if mask_info is not None:
|
||||
mask, needs_masking = mask_info
|
||||
else:
|
||||
mask = indexer == -1
|
||||
needs_masking = mask.any()
|
||||
if arr.dtype != out.dtype:
|
||||
arr = arr.astype(out.dtype)
|
||||
if arr.shape[axis] > 0:
|
||||
arr.take(indexer, axis=axis, out=out)
|
||||
if needs_masking:
|
||||
outindexer = [slice(None)] * arr.ndim
|
||||
outindexer[axis] = mask
|
||||
out[tuple(outindexer)] = fill_value
|
||||
|
||||
|
||||
def _take_2d_multi_object(
|
||||
arr: np.ndarray,
|
||||
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
|
||||
out: np.ndarray,
|
||||
fill_value,
|
||||
mask_info,
|
||||
) -> None:
|
||||
# this is not ideal, performance-wise, but it's better than raising
|
||||
# an exception (best to optimize in Cython to avoid getting here)
|
||||
row_idx, col_idx = indexer # both np.intp
|
||||
if mask_info is not None:
|
||||
(row_mask, col_mask), (row_needs, col_needs) = mask_info
|
||||
else:
|
||||
row_mask = row_idx == -1
|
||||
col_mask = col_idx == -1
|
||||
row_needs = row_mask.any()
|
||||
col_needs = col_mask.any()
|
||||
if fill_value is not None:
|
||||
if row_needs:
|
||||
out[row_mask, :] = fill_value
|
||||
if col_needs:
|
||||
out[:, col_mask] = fill_value
|
||||
for i, u_ in enumerate(row_idx):
|
||||
if u_ != -1:
|
||||
for j, v in enumerate(col_idx):
|
||||
if v != -1:
|
||||
out[i, j] = arr[u_, v]
|
||||
|
||||
|
||||
def _take_preprocess_indexer_and_fill_value(
|
||||
arr: np.ndarray,
|
||||
indexer: npt.NDArray[np.intp],
|
||||
fill_value,
|
||||
allow_fill: bool,
|
||||
mask: npt.NDArray[np.bool_] | None = None,
|
||||
):
|
||||
mask_info: tuple[np.ndarray | None, bool] | None = None
|
||||
|
||||
if not allow_fill:
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
mask_info = None, False
|
||||
else:
|
||||
# check for promotion based on types only (do this first because
|
||||
# it's faster than computing a mask)
|
||||
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
|
||||
if dtype != arr.dtype:
|
||||
# check if promotion is actually required based on indexer
|
||||
if mask is not None:
|
||||
needs_masking = True
|
||||
else:
|
||||
mask = indexer == -1
|
||||
needs_masking = bool(mask.any())
|
||||
mask_info = mask, needs_masking
|
||||
if not needs_masking:
|
||||
# if not, then depromote, set fill_value to dummy
|
||||
# (it won't be used but we don't want the cython code
|
||||
# to crash when trying to cast it to dtype)
|
||||
dtype, fill_value = arr.dtype, arr.dtype.type()
|
||||
|
||||
return dtype, fill_value, mask_info
|
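# NOTE (editorial sketch, not part of the pandas source): the preprocessing
# above only keeps the promoted dtype when the indexer actually contains -1.
# For an int64 array taken with a missing position and a NaN fill value:
#
#   arr = np.array([10, 20, 30], dtype=np.int64)
#   indexer = np.array([0, -1, 2], dtype=np.intp)
#   dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
#       arr, indexer, np.nan, allow_fill=True
#   )
#   # dtype == float64, fill_value is nan, mask_info == (array([False, True, False]), True)
#
# With an indexer containing no -1, the dtype is "depromoted" back to int64.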
@ -0,0 +1,50 @@
"""
transforms.py is for shape-preserving functions.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from pandas._typing import (
        AxisInt,
        Scalar,
    )


def shift(
    values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar
) -> np.ndarray:
    new_values = values

    if periods == 0 or values.size == 0:
        return new_values.copy()

    # make sure array sent to np.roll is c_contiguous
    f_ordered = values.flags.f_contiguous
    if f_ordered:
        new_values = new_values.T
        axis = new_values.ndim - axis - 1

    if new_values.size:
        new_values = np.roll(
            new_values,
            np.intp(periods),
            axis=axis,
        )

    axis_indexer = [slice(None)] * values.ndim
    if periods > 0:
        axis_indexer[axis] = slice(None, periods)
    else:
        axis_indexer[axis] = slice(periods, None)
    new_values[tuple(axis_indexer)] = fill_value

    # restore original order
    if f_ordered:
        new_values = new_values.T

    return new_values
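# NOTE (editorial sketch, not part of the pandas source): for a C-ordered 1-D
# array, shift() above amounts to np.roll plus overwriting the wrapped slots:
#
#   values = np.array([1.0, 2.0, 3.0, 4.0])
#   shift(values, periods=2, axis=0, fill_value=np.nan)
#   # -> array([nan, nan, 1., 2.]); np.roll gives [3., 4., 1., 2.] and the
#   #    first `periods` positions are then set to fill_value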
530
lib/python3.13/site-packages/pandas/core/arraylike.py
Normal file
@ -0,0 +1,530 @@
|
||||
"""
|
||||
Methods that can be shared by many array-like classes or subclasses:
|
||||
Series
|
||||
Index
|
||||
ExtensionArray
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
|
||||
|
||||
from pandas.core.dtypes.generic import ABCNDFrame
|
||||
|
||||
from pandas.core import roperator
|
||||
from pandas.core.construction import extract_array
|
||||
from pandas.core.ops.common import unpack_zerodim_and_defer
|
||||
|
||||
REDUCTION_ALIASES = {
|
||||
"maximum": "max",
|
||||
"minimum": "min",
|
||||
"add": "sum",
|
||||
"multiply": "prod",
|
||||
}
|
||||
|
||||
|
||||
class OpsMixin:
|
||||
# -------------------------------------------------------------
|
||||
# Comparisons
|
||||
|
||||
def _cmp_method(self, other, op):
|
||||
return NotImplemented
|
||||
|
||||
@unpack_zerodim_and_defer("__eq__")
|
||||
def __eq__(self, other):
|
||||
return self._cmp_method(other, operator.eq)
|
||||
|
||||
@unpack_zerodim_and_defer("__ne__")
|
||||
def __ne__(self, other):
|
||||
return self._cmp_method(other, operator.ne)
|
||||
|
||||
@unpack_zerodim_and_defer("__lt__")
|
||||
def __lt__(self, other):
|
||||
return self._cmp_method(other, operator.lt)
|
||||
|
||||
@unpack_zerodim_and_defer("__le__")
|
||||
def __le__(self, other):
|
||||
return self._cmp_method(other, operator.le)
|
||||
|
||||
@unpack_zerodim_and_defer("__gt__")
|
||||
def __gt__(self, other):
|
||||
return self._cmp_method(other, operator.gt)
|
||||
|
||||
@unpack_zerodim_and_defer("__ge__")
|
||||
def __ge__(self, other):
|
||||
return self._cmp_method(other, operator.ge)
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# Logical Methods
|
||||
|
||||
def _logical_method(self, other, op):
|
||||
return NotImplemented
|
||||
|
||||
@unpack_zerodim_and_defer("__and__")
|
||||
def __and__(self, other):
|
||||
return self._logical_method(other, operator.and_)
|
||||
|
||||
@unpack_zerodim_and_defer("__rand__")
|
||||
def __rand__(self, other):
|
||||
return self._logical_method(other, roperator.rand_)
|
||||
|
||||
@unpack_zerodim_and_defer("__or__")
|
||||
def __or__(self, other):
|
||||
return self._logical_method(other, operator.or_)
|
||||
|
||||
@unpack_zerodim_and_defer("__ror__")
|
||||
def __ror__(self, other):
|
||||
return self._logical_method(other, roperator.ror_)
|
||||
|
||||
@unpack_zerodim_and_defer("__xor__")
|
||||
def __xor__(self, other):
|
||||
return self._logical_method(other, operator.xor)
|
||||
|
||||
@unpack_zerodim_and_defer("__rxor__")
|
||||
def __rxor__(self, other):
|
||||
return self._logical_method(other, roperator.rxor)
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# Arithmetic Methods
|
||||
|
||||
def _arith_method(self, other, op):
|
||||
return NotImplemented
|
||||
|
||||
@unpack_zerodim_and_defer("__add__")
|
||||
def __add__(self, other):
|
||||
"""
|
||||
Get Addition of DataFrame and other, column-wise.
|
||||
|
||||
Equivalent to ``DataFrame.add(other)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : scalar, sequence, Series, dict or DataFrame
|
||||
Object to be added to the DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The result of adding ``other`` to DataFrame.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.add : Add a DataFrame and another object, with option for index-
|
||||
or column-oriented addition.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
|
||||
... index=['elk', 'moose'])
|
||||
>>> df
|
||||
height weight
|
||||
elk 1.5 500
|
||||
moose 2.6 800
|
||||
|
||||
Adding a scalar affects all rows and columns.
|
||||
|
||||
>>> df[['height', 'weight']] + 1.5
|
||||
height weight
|
||||
elk 3.0 501.5
|
||||
moose 4.1 801.5
|
||||
|
||||
Each element of a list is added to a column of the DataFrame, in order.
|
||||
|
||||
>>> df[['height', 'weight']] + [0.5, 1.5]
|
||||
height weight
|
||||
elk 2.0 501.5
|
||||
moose 3.1 801.5
|
||||
|
||||
Keys of a dictionary are aligned to the DataFrame, based on column names;
|
||||
each value in the dictionary is added to the corresponding column.
|
||||
|
||||
>>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
|
||||
height weight
|
||||
elk 2.0 501.5
|
||||
moose 3.1 801.5
|
||||
|
||||
When `other` is a :class:`Series`, the index of `other` is aligned with the
|
||||
columns of the DataFrame.
|
||||
|
||||
>>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
|
||||
>>> df[['height', 'weight']] + s1
|
||||
height weight
|
||||
elk 3.0 500.5
|
||||
moose 4.1 800.5
|
||||
|
||||
Even when the index of `other` is the same as the index of the DataFrame,
|
||||
the :class:`Series` will not be reoriented. If index-wise alignment is desired,
|
||||
:meth:`DataFrame.add` should be used with `axis='index'`.
|
||||
|
||||
>>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
|
||||
>>> df[['height', 'weight']] + s2
|
||||
elk height moose weight
|
||||
elk NaN NaN NaN NaN
|
||||
moose NaN NaN NaN NaN
|
||||
|
||||
>>> df[['height', 'weight']].add(s2, axis='index')
|
||||
height weight
|
||||
elk 2.0 500.5
|
||||
moose 4.1 801.5
|
||||
|
||||
When `other` is a :class:`DataFrame`, both columns names and the
|
||||
index are aligned.
|
||||
|
||||
>>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
|
||||
... index=['elk', 'moose', 'deer'])
|
||||
>>> df[['height', 'weight']] + other
|
||||
height weight
|
||||
deer NaN NaN
|
||||
elk 1.7 NaN
|
||||
moose 3.0 NaN
|
||||
"""
|
||||
return self._arith_method(other, operator.add)
|
||||
|
||||
@unpack_zerodim_and_defer("__radd__")
|
||||
def __radd__(self, other):
|
||||
return self._arith_method(other, roperator.radd)
|
||||
|
||||
@unpack_zerodim_and_defer("__sub__")
|
||||
def __sub__(self, other):
|
||||
return self._arith_method(other, operator.sub)
|
||||
|
||||
@unpack_zerodim_and_defer("__rsub__")
|
||||
def __rsub__(self, other):
|
||||
return self._arith_method(other, roperator.rsub)
|
||||
|
||||
@unpack_zerodim_and_defer("__mul__")
|
||||
def __mul__(self, other):
|
||||
return self._arith_method(other, operator.mul)
|
||||
|
||||
@unpack_zerodim_and_defer("__rmul__")
|
||||
def __rmul__(self, other):
|
||||
return self._arith_method(other, roperator.rmul)
|
||||
|
||||
@unpack_zerodim_and_defer("__truediv__")
|
||||
def __truediv__(self, other):
|
||||
return self._arith_method(other, operator.truediv)
|
||||
|
||||
@unpack_zerodim_and_defer("__rtruediv__")
|
||||
def __rtruediv__(self, other):
|
||||
return self._arith_method(other, roperator.rtruediv)
|
||||
|
||||
@unpack_zerodim_and_defer("__floordiv__")
|
||||
def __floordiv__(self, other):
|
||||
return self._arith_method(other, operator.floordiv)
|
||||
|
||||
@unpack_zerodim_and_defer("__rfloordiv")
|
||||
def __rfloordiv__(self, other):
|
||||
return self._arith_method(other, roperator.rfloordiv)
|
||||
|
||||
@unpack_zerodim_and_defer("__mod__")
|
||||
def __mod__(self, other):
|
||||
return self._arith_method(other, operator.mod)
|
||||
|
||||
@unpack_zerodim_and_defer("__rmod__")
|
||||
def __rmod__(self, other):
|
||||
return self._arith_method(other, roperator.rmod)
|
||||
|
||||
@unpack_zerodim_and_defer("__divmod__")
|
||||
def __divmod__(self, other):
|
||||
return self._arith_method(other, divmod)
|
||||
|
||||
@unpack_zerodim_and_defer("__rdivmod__")
|
||||
def __rdivmod__(self, other):
|
||||
return self._arith_method(other, roperator.rdivmod)
|
||||
|
||||
@unpack_zerodim_and_defer("__pow__")
|
||||
def __pow__(self, other):
|
||||
return self._arith_method(other, operator.pow)
|
||||
|
||||
@unpack_zerodim_and_defer("__rpow__")
|
||||
def __rpow__(self, other):
|
||||
return self._arith_method(other, roperator.rpow)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helpers to implement __array_ufunc__
|
||||
|
||||
|
||||
def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
|
||||
"""
|
||||
Compatibility with numpy ufuncs.
|
||||
|
||||
See also
|
||||
--------
|
||||
numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
|
||||
"""
|
||||
from pandas.core.frame import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
from pandas.core.generic import NDFrame
|
||||
from pandas.core.internals import (
|
||||
ArrayManager,
|
||||
BlockManager,
|
||||
)
|
||||
|
||||
cls = type(self)
|
||||
|
||||
kwargs = _standardize_out_kwarg(**kwargs)
|
||||
|
||||
# for binary ops, use our custom dunder methods
|
||||
result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
|
||||
if result is not NotImplemented:
|
||||
return result
|
||||
|
||||
# Determine if we should defer.
|
||||
no_defer = (
|
||||
np.ndarray.__array_ufunc__,
|
||||
cls.__array_ufunc__,
|
||||
)
|
||||
|
||||
for item in inputs:
|
||||
higher_priority = (
|
||||
hasattr(item, "__array_priority__")
|
||||
and item.__array_priority__ > self.__array_priority__
|
||||
)
|
||||
has_array_ufunc = (
|
||||
hasattr(item, "__array_ufunc__")
|
||||
and type(item).__array_ufunc__ not in no_defer
|
||||
and not isinstance(item, self._HANDLED_TYPES)
|
||||
)
|
||||
if higher_priority or has_array_ufunc:
|
||||
return NotImplemented
|
||||
|
||||
# align all the inputs.
|
||||
types = tuple(type(x) for x in inputs)
|
||||
alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
|
||||
|
||||
if len(alignable) > 1:
|
||||
# This triggers alignment.
|
||||
# At the moment, there aren't any ufuncs with more than two inputs
|
||||
# so this ends up just being x1.index | x2.index, but we write
|
||||
# it to handle *args.
|
||||
set_types = set(types)
|
||||
if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
|
||||
# We currently don't handle ufunc(DataFrame, Series)
|
||||
# well. Previously this raised an internal ValueError. We might
|
||||
# support it someday, so raise a NotImplementedError.
|
||||
raise NotImplementedError(
|
||||
f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
|
||||
)
|
||||
axes = self.axes
|
||||
for obj in alignable[1:]:
|
||||
# this relies on the fact that we aren't handling mixed
|
||||
# series / frame ufuncs.
|
||||
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
|
||||
axes[i] = ax1.union(ax2)
|
||||
|
||||
reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
|
||||
inputs = tuple(
|
||||
x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
|
||||
for x, t in zip(inputs, types)
|
||||
)
|
||||
else:
|
||||
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
|
||||
|
||||
if self.ndim == 1:
|
||||
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
|
||||
name = names[0] if len(set(names)) == 1 else None
|
||||
reconstruct_kwargs = {"name": name}
|
||||
else:
|
||||
reconstruct_kwargs = {}
|
||||
|
||||
def reconstruct(result):
|
||||
if ufunc.nout > 1:
|
||||
# np.modf, np.frexp, np.divmod
|
||||
return tuple(_reconstruct(x) for x in result)
|
||||
|
||||
return _reconstruct(result)
|
||||
|
||||
def _reconstruct(result):
|
||||
if lib.is_scalar(result):
|
||||
return result
|
||||
|
||||
if result.ndim != self.ndim:
|
||||
if method == "outer":
|
||||
raise NotImplementedError
|
||||
return result
|
||||
if isinstance(result, (BlockManager, ArrayManager)):
|
||||
# we went through BlockManager.apply e.g. np.sqrt
|
||||
result = self._constructor_from_mgr(result, axes=result.axes)
|
||||
else:
|
||||
# we converted an array, lost our axes
|
||||
result = self._constructor(
|
||||
result, **reconstruct_axes, **reconstruct_kwargs, copy=False
|
||||
)
|
||||
# TODO: When we support multiple values in __finalize__, this
|
||||
# should pass alignable to `__finalize__` instead of self.
|
||||
# Then `np.add(a, b)` would consider attrs from both a and b
|
||||
# when a and b are NDFrames.
|
||||
if len(alignable) == 1:
|
||||
result = result.__finalize__(self)
|
||||
return result
|
||||
|
||||
if "out" in kwargs:
|
||||
# e.g. test_multiindex_get_loc
|
||||
result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
|
||||
return reconstruct(result)
|
||||
|
||||
if method == "reduce":
|
||||
# e.g. test.series.test_ufunc.test_reduce
|
||||
result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
|
||||
if result is not NotImplemented:
|
||||
return result
|
||||
|
||||
# We still get here with kwargs `axis` for e.g. np.maximum.accumulate
|
||||
# and `dtype` and `keepdims` for np.ptp
|
||||
|
||||
if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
|
||||
# Just give up on preserving types in the complex case.
|
||||
# In theory we could preserve them in these cases:
|
||||
# * nout>1 is doable if BlockManager.apply took nout and
|
||||
# returned a Tuple[BlockManager].
|
||||
# * len(inputs) > 1 is doable when we know that we have
|
||||
# aligned blocks / dtypes.
|
||||
|
||||
# e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
|
||||
inputs = tuple(np.asarray(x) for x in inputs)
|
||||
# Note: we can't use default_array_ufunc here bc reindexing means
|
||||
# that `self` may not be among `inputs`
|
||||
result = getattr(ufunc, method)(*inputs, **kwargs)
|
||||
elif self.ndim == 1:
|
||||
# ufunc(series, ...)
|
||||
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
|
||||
result = getattr(ufunc, method)(*inputs, **kwargs)
|
||||
else:
|
||||
# ufunc(dataframe)
|
||||
if method == "__call__" and not kwargs:
|
||||
# for np.<ufunc>(..) calls
|
||||
# kwargs cannot necessarily be handled block-by-block, so only
|
||||
# take this path if there are no kwargs
|
||||
mgr = inputs[0]._mgr
|
||||
result = mgr.apply(getattr(ufunc, method))
|
||||
else:
|
||||
# otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
|
||||
# Those can have an axis keyword and thus can't be called block-by-block
|
||||
result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
|
||||
# e.g. np.negative (only one reached), with "where" and "out" in kwargs
|
||||
|
||||
result = reconstruct(result)
|
||||
return result
|
||||
|
||||
|
||||
def _standardize_out_kwarg(**kwargs) -> dict:
|
||||
"""
|
||||
If kwargs contain "out1" and "out2", replace that with a tuple "out"
|
||||
|
||||
np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
|
||||
`out1=out1, out2=out2`
|
||||
"""
|
||||
if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
|
||||
out1 = kwargs.pop("out1")
|
||||
out2 = kwargs.pop("out2")
|
||||
out = (out1, out2)
|
||||
kwargs["out"] = out
|
||||
return kwargs
|
||||
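# NOTE (editorial sketch, not part of the pandas source): per the docstring
# above, multi-output ufuncs may pass their buffers either way, e.g.
#
#   kwargs = _standardize_out_kwarg(out1=buf1, out2=buf2)
#   # kwargs == {"out": (buf1, buf2)}
#
# which is the shape dispatch_ufunc_with_out (below) expects.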
|
||||
|
||||
def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
|
||||
"""
|
||||
If we have an `out` keyword, then call the ufunc without `out` and then
|
||||
set the result into the given `out`.
|
||||
"""
|
||||
|
||||
# Note: we assume _standardize_out_kwarg has already been called.
|
||||
out = kwargs.pop("out")
|
||||
where = kwargs.pop("where", None)
|
||||
|
||||
result = getattr(ufunc, method)(*inputs, **kwargs)
|
||||
|
||||
if result is NotImplemented:
|
||||
return NotImplemented
|
||||
|
||||
if isinstance(result, tuple):
|
||||
# i.e. np.divmod, np.modf, np.frexp
|
||||
if not isinstance(out, tuple) or len(out) != len(result):
|
||||
raise NotImplementedError
|
||||
|
||||
for arr, res in zip(out, result):
|
||||
_assign_where(arr, res, where)
|
||||
|
||||
return out
|
||||
|
||||
if isinstance(out, tuple):
|
||||
if len(out) == 1:
|
||||
out = out[0]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
_assign_where(out, result, where)
|
||||
return out
|
||||
|
||||
|
||||
def _assign_where(out, result, where) -> None:
|
||||
"""
|
||||
Set a ufunc result into 'out', masking with a 'where' argument if necessary.
|
||||
"""
|
||||
if where is None:
|
||||
# no 'where' arg passed to ufunc
|
||||
out[:] = result
|
||||
else:
|
||||
np.putmask(out, where, result)
|
||||
|
||||
|
||||
def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
|
||||
"""
|
||||
Fallback to the behavior we would get if we did not define __array_ufunc__.
|
||||
|
||||
Notes
|
||||
-----
|
||||
We are assuming that `self` is among `inputs`.
|
||||
"""
|
||||
if not any(x is self for x in inputs):
|
||||
raise NotImplementedError
|
||||
|
||||
new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
|
||||
|
||||
return getattr(ufunc, method)(*new_inputs, **kwargs)
|
||||
|
||||
|
||||
def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
|
||||
"""
|
||||
Dispatch ufunc reductions to self's reduction methods.
|
||||
"""
|
||||
assert method == "reduce"
|
||||
|
||||
if len(inputs) != 1 or inputs[0] is not self:
|
||||
return NotImplemented
|
||||
|
||||
if ufunc.__name__ not in REDUCTION_ALIASES:
|
||||
return NotImplemented
|
||||
|
||||
method_name = REDUCTION_ALIASES[ufunc.__name__]
|
||||
|
||||
# NB: we are assuming that min/max represent minimum/maximum methods,
|
||||
# which would not be accurate for e.g. Timestamp.min
|
||||
if not hasattr(self, method_name):
|
||||
return NotImplemented
|
||||
|
||||
if self.ndim > 1:
|
||||
if isinstance(self, ABCNDFrame):
|
||||
# TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
|
||||
kwargs["numeric_only"] = False
|
||||
|
||||
if "axis" not in kwargs:
|
||||
# For DataFrame reductions we don't want the default axis=0
|
||||
# Note: np.min is not a ufunc, but uses array_function_dispatch,
|
||||
# so calls DataFrame.min (without ever getting here) with the np.min
|
||||
# default of axis=None, which DataFrame.min catches and changes to axis=0.
|
||||
# np.minimum.reduce(df) gets here bc axis is not in kwargs,
|
||||
# so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
|
||||
kwargs["axis"] = 0
|
||||
|
||||
# By default, numpy's reductions do not skip NaNs, so we have to
|
||||
# pass skipna=False
|
||||
return getattr(self, method_name)(skipna=False, **kwargs)
|
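# NOTE (editorial sketch, not part of the pandas source): with REDUCTION_ALIASES
# above, a numpy reduction on a pandas object is rerouted to the matching pandas
# reduction, e.g.
#
#   ser = pd.Series([1, 5, 3])
#   np.maximum.reduce(ser)               # -> 5, via ser.max(skipna=False)
#   np.add.reduce(pd.DataFrame({"a": [1, 2]}))
#   # -> dispatched to DataFrame.sum(axis=0, numeric_only=False, skipna=False)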
43
lib/python3.13/site-packages/pandas/core/arrays/__init__.py
Normal file
@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionOpsMixin,
    ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
    PeriodArray,
    period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
    "ArrowExtensionArray",
    "ExtensionArray",
    "ExtensionOpsMixin",
    "ExtensionScalarOpsMixin",
    "ArrowStringArray",
    "BaseMaskedArray",
    "BooleanArray",
    "Categorical",
    "DatetimeArray",
    "FloatingArray",
    "IntegerArray",
    "IntervalArray",
    "NumpyExtensionArray",
    "PeriodArray",
    "period_array",
    "SparseArray",
    "StringArray",
    "TimedeltaArray",
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,84 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat import pa_version_under10p1
|
||||
|
||||
if not pa_version_under10p1:
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
|
||||
|
||||
class ArrowStringArrayMixin:
|
||||
_pa_array = None
|
||||
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def _str_pad(
|
||||
self,
|
||||
width: int,
|
||||
side: Literal["left", "right", "both"] = "left",
|
||||
fillchar: str = " ",
|
||||
):
|
||||
if side == "left":
|
||||
pa_pad = pc.utf8_lpad
|
||||
elif side == "right":
|
||||
pa_pad = pc.utf8_rpad
|
||||
elif side == "both":
|
||||
pa_pad = pc.utf8_center
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
|
||||
)
|
||||
return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
|
||||
|
||||
def _str_get(self, i: int):
|
||||
lengths = pc.utf8_length(self._pa_array)
|
||||
if i >= 0:
|
||||
out_of_bounds = pc.greater_equal(i, lengths)
|
||||
start = i
|
||||
stop = i + 1
|
||||
step = 1
|
||||
else:
|
||||
out_of_bounds = pc.greater(-i, lengths)
|
||||
start = i
|
||||
stop = i - 1
|
||||
step = -1
|
||||
not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
|
||||
selected = pc.utf8_slice_codeunits(
|
||||
self._pa_array, start=start, stop=stop, step=step
|
||||
)
|
||||
null_value = pa.scalar(
|
||||
None, type=self._pa_array.type # type: ignore[attr-defined]
|
||||
)
|
||||
result = pc.if_else(not_out_of_bounds, selected, null_value)
|
||||
return type(self)(result)
|
||||
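# NOTE (editorial sketch, not part of the pandas source): for a negative index
# the slice above runs backwards over a single code unit, e.g. i=-1 becomes
# utf8_slice_codeunits(start=-1, stop=-2, step=-1), i.e. the last character;
# strings shorter than abs(i) are flagged out of bounds and come back as null.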
|
||||
def _str_slice_replace(
|
||||
self, start: int | None = None, stop: int | None = None, repl: str | None = None
|
||||
):
|
||||
if repl is None:
|
||||
repl = ""
|
||||
if start is None:
|
||||
start = 0
|
||||
if stop is None:
|
||||
stop = np.iinfo(np.int64).max
|
||||
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
|
||||
|
||||
def _str_capitalize(self):
|
||||
return type(self)(pc.utf8_capitalize(self._pa_array))
|
||||
|
||||
def _str_title(self):
|
||||
return type(self)(pc.utf8_title(self._pa_array))
|
||||
|
||||
def _str_swapcase(self):
|
||||
return type(self)(pc.utf8_swapcase(self._pa_array))
|
||||
|
||||
def _str_removesuffix(self, suffix: str):
|
||||
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
|
||||
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
|
||||
result = pc.if_else(ends_with, removed, self._pa_array)
|
||||
return type(self)(result)
|
547
lib/python3.13/site-packages/pandas/core/arrays/_mixins.py
Normal file
@ -0,0 +1,547 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import wraps
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas._libs.arrays import NDArrayBacked
|
||||
from pandas._libs.tslibs import is_supported_dtype
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
AxisInt,
|
||||
Dtype,
|
||||
F,
|
||||
FillnaOptions,
|
||||
PositionalIndexer2D,
|
||||
PositionalIndexerTuple,
|
||||
ScalarIndexer,
|
||||
Self,
|
||||
SequenceIndexer,
|
||||
Shape,
|
||||
TakeIndexer,
|
||||
npt,
|
||||
)
|
||||
from pandas.errors import AbstractMethodError
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._validators import (
|
||||
validate_bool_kwarg,
|
||||
validate_fillna_kwargs,
|
||||
validate_insert_loc,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
DatetimeTZDtype,
|
||||
ExtensionDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import array_equivalent
|
||||
|
||||
from pandas.core import missing
|
||||
from pandas.core.algorithms import (
|
||||
take,
|
||||
unique,
|
||||
value_counts_internal as value_counts,
|
||||
)
|
||||
from pandas.core.array_algos.quantile import quantile_with_mask
|
||||
from pandas.core.array_algos.transforms import shift
|
||||
from pandas.core.arrays.base import ExtensionArray
|
||||
from pandas.core.construction import extract_array
|
||||
from pandas.core.indexers import check_array_indexer
|
||||
from pandas.core.sorting import nargminmax
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from pandas._typing import (
|
||||
NumpySorter,
|
||||
NumpyValueArrayLike,
|
||||
)
|
||||
|
||||
from pandas import Series
|
||||
|
||||
|
||||
def ravel_compat(meth: F) -> F:
|
||||
"""
|
||||
Decorator to ravel a 2D array before passing it to a cython operation,
|
||||
then reshape the result to our own shape.
|
||||
"""
|
||||
|
||||
@wraps(meth)
|
||||
def method(self, *args, **kwargs):
|
||||
if self.ndim == 1:
|
||||
return meth(self, *args, **kwargs)
|
||||
|
||||
flags = self._ndarray.flags
|
||||
flat = self.ravel("K")
|
||||
result = meth(flat, *args, **kwargs)
|
||||
order = "F" if flags.f_contiguous else "C"
|
||||
return result.reshape(self.shape, order=order)
|
||||
|
||||
return cast(F, method)
|
||||
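# NOTE (editorial sketch, not part of the pandas source): for a 2-D backing
# array the wrapper above flattens with ravel("K"), runs the 1-D cython kernel
# on the flat values, and reshapes the result back to the original shape,
# preserving C vs Fortran order via the saved contiguity flags.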
|
||||
|
||||
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
|
||||
"""
|
||||
ExtensionArray that is backed by a single NumPy ndarray.
|
||||
"""
|
||||
|
||||
_ndarray: np.ndarray
|
||||
|
||||
# scalar used to denote NA value inside our self._ndarray, e.g. -1
|
||||
# for Categorical, iNaT for Period. Outside of object dtype,
|
||||
# self.isna() should be exactly locations in self._ndarray with
|
||||
# _internal_fill_value.
|
||||
_internal_fill_value: Any
|
||||
|
||||
def _box_func(self, x):
|
||||
"""
|
||||
Wrap numpy type in our dtype.type if necessary.
|
||||
"""
|
||||
return x
|
||||
|
||||
def _validate_scalar(self, value):
|
||||
# used by NDArrayBackedExtensionIndex.insert
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
def view(self, dtype: Dtype | None = None) -> ArrayLike:
|
||||
# We handle datetime64, datetime64tz, timedelta64, and period
|
||||
# dtypes here. Everything else we pass through to the underlying
|
||||
# ndarray.
|
||||
if dtype is None or dtype is self.dtype:
|
||||
return self._from_backing_data(self._ndarray)
|
||||
|
||||
if isinstance(dtype, type):
|
||||
# we sometimes pass non-dtype objects, e.g np.ndarray;
|
||||
# pass those through to the underlying ndarray
|
||||
return self._ndarray.view(dtype)
|
||||
|
||||
dtype = pandas_dtype(dtype)
|
||||
arr = self._ndarray
|
||||
|
||||
if isinstance(dtype, PeriodDtype):
|
||||
cls = dtype.construct_array_type()
|
||||
return cls(arr.view("i8"), dtype=dtype)
|
||||
elif isinstance(dtype, DatetimeTZDtype):
|
||||
dt_cls = dtype.construct_array_type()
|
||||
dt64_values = arr.view(f"M8[{dtype.unit}]")
|
||||
return dt_cls._simple_new(dt64_values, dtype=dtype)
|
||||
elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
|
||||
from pandas.core.arrays import DatetimeArray
|
||||
|
||||
dt64_values = arr.view(dtype)
|
||||
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
|
||||
|
||||
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
|
||||
from pandas.core.arrays import TimedeltaArray
|
||||
|
||||
td64_values = arr.view(dtype)
|
||||
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
|
||||
|
||||
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
|
||||
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
|
||||
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
|
||||
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
|
||||
return arr.view(dtype=dtype) # type: ignore[arg-type]
|
||||
|
||||
def take(
|
||||
self,
|
||||
indices: TakeIndexer,
|
||||
*,
|
||||
allow_fill: bool = False,
|
||||
fill_value: Any = None,
|
||||
axis: AxisInt = 0,
|
||||
) -> Self:
|
||||
if allow_fill:
|
||||
fill_value = self._validate_scalar(fill_value)
|
||||
|
||||
new_data = take(
|
||||
self._ndarray,
|
||||
indices,
|
||||
allow_fill=allow_fill,
|
||||
fill_value=fill_value,
|
||||
axis=axis,
|
||||
)
|
||||
return self._from_backing_data(new_data)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
def equals(self, other) -> bool:
|
||||
if type(self) is not type(other):
|
||||
return False
|
||||
if self.dtype != other.dtype:
|
||||
return False
|
||||
return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))
|
||||
|
||||
@classmethod
|
||||
def _from_factorized(cls, values, original):
|
||||
assert values.dtype == original._ndarray.dtype
|
||||
return original._from_backing_data(values)
|
||||
|
||||
def _values_for_argsort(self) -> np.ndarray:
|
||||
return self._ndarray
|
||||
|
||||
def _values_for_factorize(self):
|
||||
return self._ndarray, self._internal_fill_value
|
||||
|
||||
def _hash_pandas_object(
|
||||
self, *, encoding: str, hash_key: str, categorize: bool
|
||||
) -> npt.NDArray[np.uint64]:
|
||||
from pandas.core.util.hashing import hash_array
|
||||
|
||||
values = self._ndarray
|
||||
return hash_array(
|
||||
values, encoding=encoding, hash_key=hash_key, categorize=categorize
|
||||
)
|
||||
|
||||
# Signature of "argmin" incompatible with supertype "ExtensionArray"
|
||||
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
|
||||
# override base class by adding axis keyword
|
||||
validate_bool_kwarg(skipna, "skipna")
|
||||
if not skipna and self._hasna:
|
||||
raise NotImplementedError
|
||||
return nargminmax(self, "argmin", axis=axis)
|
||||
|
||||
# Signature of "argmax" incompatible with supertype "ExtensionArray"
|
||||
def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
|
||||
# override base class by adding axis keyword
|
||||
validate_bool_kwarg(skipna, "skipna")
|
||||
if not skipna and self._hasna:
|
||||
raise NotImplementedError
|
||||
return nargminmax(self, "argmax", axis=axis)
|
||||
|
||||
def unique(self) -> Self:
|
||||
new_data = unique(self._ndarray)
|
||||
return self._from_backing_data(new_data)
|
||||
|
||||
@classmethod
|
||||
@doc(ExtensionArray._concat_same_type)
|
||||
def _concat_same_type(
|
||||
cls,
|
||||
to_concat: Sequence[Self],
|
||||
axis: AxisInt = 0,
|
||||
) -> Self:
|
||||
if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
|
||||
dtypes = {str(x.dtype) for x in to_concat}
|
||||
raise ValueError("to_concat must have the same dtype", dtypes)
|
||||
|
||||
return super()._concat_same_type(to_concat, axis=axis)
|
||||
|
||||
@doc(ExtensionArray.searchsorted)
|
||||
def searchsorted(
|
||||
self,
|
||||
value: NumpyValueArrayLike | ExtensionArray,
|
||||
side: Literal["left", "right"] = "left",
|
||||
sorter: NumpySorter | None = None,
|
||||
) -> npt.NDArray[np.intp] | np.intp:
|
||||
npvalue = self._validate_setitem_value(value)
|
||||
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
|
||||
|
||||
@doc(ExtensionArray.shift)
|
||||
def shift(self, periods: int = 1, fill_value=None):
|
||||
# NB: shift is always along axis=0
|
||||
axis = 0
|
||||
fill_value = self._validate_scalar(fill_value)
|
||||
new_values = shift(self._ndarray, periods, axis, fill_value)
|
||||
|
||||
return self._from_backing_data(new_values)
|
||||
|
||||
def __setitem__(self, key, value) -> None:
|
||||
key = check_array_indexer(self, key)
|
||||
value = self._validate_setitem_value(value)
|
||||
self._ndarray[key] = value
|
||||
|
||||
def _validate_setitem_value(self, value):
|
||||
return value
|
||||
|
||||
@overload
|
||||
def __getitem__(self, key: ScalarIndexer) -> Any:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __getitem__(
|
||||
self,
|
||||
key: SequenceIndexer | PositionalIndexerTuple,
|
||||
) -> Self:
|
||||
...
|
||||
|
||||
def __getitem__(
|
||||
self,
|
||||
key: PositionalIndexer2D,
|
||||
) -> Self | Any:
|
||||
if lib.is_integer(key):
|
||||
# fast-path
|
||||
result = self._ndarray[key]
|
||||
if self.ndim == 1:
|
||||
return self._box_func(result)
|
||||
return self._from_backing_data(result)
|
||||
|
||||
# error: Incompatible types in assignment (expression has type "ExtensionArray",
|
||||
# variable has type "Union[int, slice, ndarray]")
|
||||
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
|
||||
key = check_array_indexer(self, key)
|
||||
result = self._ndarray[key]
|
||||
if lib.is_scalar(result):
|
||||
return self._box_func(result)
|
||||
|
||||
result = self._from_backing_data(result)
|
||||
return result
|
||||
|
||||
def _fill_mask_inplace(
|
||||
self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
|
||||
) -> None:
|
||||
# (for now) when self.ndim == 2, we assume axis=0
|
||||
func = missing.get_fill_func(method, ndim=self.ndim)
|
||||
func(self._ndarray.T, limit=limit, mask=mask.T)
|
||||
|
||||
def _pad_or_backfill(
|
||||
self,
|
||||
*,
|
||||
method: FillnaOptions,
|
||||
limit: int | None = None,
|
||||
limit_area: Literal["inside", "outside"] | None = None,
|
||||
copy: bool = True,
|
||||
) -> Self:
|
||||
mask = self.isna()
|
||||
if mask.any():
|
||||
# (for now) when self.ndim == 2, we assume axis=0
|
||||
func = missing.get_fill_func(method, ndim=self.ndim)
|
||||
|
||||
npvalues = self._ndarray.T
|
||||
if copy:
|
||||
npvalues = npvalues.copy()
|
||||
func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
|
||||
npvalues = npvalues.T
|
||||
|
||||
if copy:
|
||||
new_values = self._from_backing_data(npvalues)
|
||||
else:
|
||||
new_values = self
|
||||
|
||||
else:
|
||||
if copy:
|
||||
new_values = self.copy()
|
||||
else:
|
||||
new_values = self
|
||||
return new_values
|
||||
|
||||
@doc(ExtensionArray.fillna)
|
||||
def fillna(
|
||||
self, value=None, method=None, limit: int | None = None, copy: bool = True
|
||||
) -> Self:
|
||||
value, method = validate_fillna_kwargs(
|
||||
value, method, validate_scalar_dict_value=False
|
||||
)
|
||||
|
||||
mask = self.isna()
|
||||
# error: Argument 2 to "check_value_size" has incompatible type
|
||||
# "ExtensionArray"; expected "ndarray"
|
||||
value = missing.check_value_size(
|
||||
value, mask, len(self) # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
if mask.any():
|
||||
if method is not None:
|
||||
# (for now) when self.ndim == 2, we assume axis=0
|
||||
func = missing.get_fill_func(method, ndim=self.ndim)
|
||||
npvalues = self._ndarray.T
|
||||
if copy:
|
||||
npvalues = npvalues.copy()
|
||||
func(npvalues, limit=limit, mask=mask.T)
|
||||
npvalues = npvalues.T
|
||||
|
||||
# TODO: NumpyExtensionArray didn't use to copy, need tests
|
||||
# for this
|
||||
new_values = self._from_backing_data(npvalues)
|
||||
else:
|
||||
# fill with value
|
||||
if copy:
|
||||
new_values = self.copy()
|
||||
else:
|
||||
new_values = self[:]
|
||||
new_values[mask] = value
|
||||
else:
|
||||
# We validate the fill_value even if there is nothing to fill
|
||||
if value is not None:
|
||||
self._validate_setitem_value(value)
|
||||
|
||||
if not copy:
|
||||
new_values = self[:]
|
||||
else:
|
||||
new_values = self.copy()
|
||||
return new_values
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Reductions
|
||||
|
||||
def _wrap_reduction_result(self, axis: AxisInt | None, result):
|
||||
if axis is None or self.ndim == 1:
|
||||
return self._box_func(result)
|
||||
return self._from_backing_data(result)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# __array_function__ methods
|
||||
|
||||
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
|
||||
"""
|
||||
Analogue to np.putmask(self, mask, value)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mask : np.ndarray[bool]
|
||||
value : scalar or listlike
|
||||
|
||||
Raises
|
||||
------
|
||||
TypeError
|
||||
If value cannot be cast to self.dtype.
|
||||
"""
|
||||
value = self._validate_setitem_value(value)
|
||||
|
||||
np.putmask(self._ndarray, mask, value)
|
||||
|
||||
def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
|
||||
"""
|
||||
Analogue to np.where(mask, self, value)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mask : np.ndarray[bool]
|
||||
value : scalar or listlike
|
||||
|
||||
Raises
|
||||
------
|
||||
TypeError
|
||||
If value cannot be cast to self.dtype.
|
||||
"""
|
||||
value = self._validate_setitem_value(value)
|
||||
|
||||
res_values = np.where(mask, self._ndarray, value)
|
||||
if res_values.dtype != self._ndarray.dtype:
|
||||
raise AssertionError(
|
||||
# GH#56410
|
||||
"Something has gone wrong, please report a bug at "
|
||||
"github.com/pandas-dev/pandas/"
|
||||
)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Index compat methods
|
||||
|
||||
def insert(self, loc: int, item) -> Self:
|
||||
"""
|
||||
Make new ExtensionArray inserting new item at location. Follows
|
||||
Python list.append semantics for negative values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
loc : int
|
||||
item : object
|
||||
|
||||
Returns
|
||||
-------
|
||||
type(self)
|
||||
"""
|
||||
loc = validate_insert_loc(loc, len(self))
|
||||
|
||||
code = self._validate_scalar(item)
|
||||
|
||||
new_vals = np.concatenate(
|
||||
(
|
||||
self._ndarray[:loc],
|
||||
np.asarray([code], dtype=self._ndarray.dtype),
|
||||
self._ndarray[loc:],
|
||||
)
|
||||
)
|
||||
return self._from_backing_data(new_vals)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Additional array methods
|
||||
# These are not part of the EA API, but we implement them because
|
||||
# pandas assumes they're there.
|
||||
|
||||
def value_counts(self, dropna: bool = True) -> Series:
|
||||
"""
|
||||
Return a Series containing counts of unique values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dropna : bool, default True
|
||||
Don't include counts of NA values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Series
|
||||
"""
|
||||
if self.ndim != 1:
|
||||
raise NotImplementedError
|
||||
|
||||
from pandas import (
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
|
||||
if dropna:
|
||||
# error: Unsupported operand type for ~ ("ExtensionArray")
|
||||
values = self[~self.isna()]._ndarray # type: ignore[operator]
|
||||
else:
|
||||
values = self._ndarray
|
||||
|
||||
result = value_counts(values, sort=False, dropna=dropna)
|
||||
|
||||
index_arr = self._from_backing_data(np.asarray(result.index._data))
|
||||
index = Index(index_arr, name=result.index.name)
|
||||
return Series(result._values, index=index, name=result.name, copy=False)
|
||||
|
||||
def _quantile(
|
||||
self,
|
||||
qs: npt.NDArray[np.float64],
|
||||
interpolation: str,
|
||||
) -> Self:
|
||||
# TODO: disable for Categorical if not ordered?
|
||||
|
||||
mask = np.asarray(self.isna())
|
||||
arr = self._ndarray
|
||||
fill_value = self._internal_fill_value
|
||||
|
||||
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
|
||||
|
||||
res_values = self._cast_quantile_result(res_values)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
# TODO: see if we can share this with other dispatch-wrapping methods
|
||||
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Cast the result of quantile_with_mask to an appropriate dtype
|
||||
to pass to _from_backing_data in _quantile.
|
||||
"""
|
||||
return res_values
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# numpy-like methods
|
||||
|
||||
@classmethod
|
||||
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
|
||||
"""
|
||||
Analogous to np.empty(shape, dtype=dtype)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape : tuple[int]
|
||||
dtype : ExtensionDtype
|
||||
"""
|
||||
# The base implementation uses a naive approach to find the dtype
|
||||
# for the backing ndarray
|
||||
arr = cls._from_sequence([], dtype=dtype)
|
||||
backing = np.empty(shape, dtype=arr._ndarray.dtype)
|
||||
return arr._from_backing_data(backing)
|
207
lib/python3.13/site-packages/pandas/core/arrays/_ranges.py
Normal file
@ -0,0 +1,207 @@
|
||||
"""
|
||||
Helper functions to generate range-like data for DatetimeArray
|
||||
(and possibly TimedeltaArray/PeriodArray)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.lib import i8max
|
||||
from pandas._libs.tslibs import (
|
||||
BaseOffset,
|
||||
OutOfBoundsDatetime,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
iNaT,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import npt
|
||||
|
||||
|
||||
def generate_regular_range(
|
||||
start: Timestamp | Timedelta | None,
|
||||
end: Timestamp | Timedelta | None,
|
||||
periods: int | None,
|
||||
freq: BaseOffset,
|
||||
unit: str = "ns",
|
||||
) -> npt.NDArray[np.intp]:
|
||||
"""
|
||||
Generate a range of dates or timestamps with the spans between dates
|
||||
described by the given `freq` DateOffset.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
start : Timedelta, Timestamp or None
|
||||
First point of produced date range.
|
||||
end : Timedelta, Timestamp or None
|
||||
Last point of produced date range.
|
||||
periods : int or None
|
||||
Number of periods in produced date range.
|
||||
freq : Tick
|
||||
Describes space between dates in produced date range.
|
||||
unit : str, default "ns"
|
||||
The resolution the output is meant to represent.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ndarray[np.int64]
|
||||
Representing the given resolution.
|
||||
"""
|
||||
istart = start._value if start is not None else None
|
||||
iend = end._value if end is not None else None
|
||||
freq.nanos # raises if non-fixed frequency
|
||||
td = Timedelta(freq)
|
||||
b: int
|
||||
e: int
|
||||
try:
|
||||
td = td.as_unit(unit, round_ok=False)
|
||||
except ValueError as err:
|
||||
raise ValueError(
|
||||
f"freq={freq} is incompatible with unit={unit}. "
|
||||
"Use a lower freq or a higher unit instead."
|
||||
) from err
|
||||
stride = int(td._value)
|
||||
|
||||
if periods is None and istart is not None and iend is not None:
|
||||
b = istart
|
||||
# cannot just use e = Timestamp(end) + 1 because arange breaks when
|
||||
# stride is too large, see GH10887
|
||||
e = b + (iend - b) // stride * stride + stride // 2 + 1
|
||||
elif istart is not None and periods is not None:
|
||||
b = istart
|
||||
e = _generate_range_overflow_safe(b, periods, stride, side="start")
|
||||
elif iend is not None and periods is not None:
|
||||
e = iend + stride
|
||||
b = _generate_range_overflow_safe(e, periods, stride, side="end")
|
||||
else:
|
||||
raise ValueError(
|
||||
"at least 'start' or 'end' should be specified if a 'period' is given."
|
||||
)
|
||||
|
||||
with np.errstate(over="raise"):
|
||||
# If the range is sufficiently large, np.arange may overflow
|
||||
# and incorrectly return an empty array if not caught.
|
||||
try:
|
||||
values = np.arange(b, e, stride, dtype=np.int64)
|
||||
except FloatingPointError:
|
||||
xdr = [b]
|
||||
while xdr[-1] != e:
|
||||
xdr.append(xdr[-1] + stride)
|
||||
values = np.array(xdr[:-1], dtype=np.int64)
|
||||
return values
|
||||
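# NOTE (editorial worked example, not part of the pandas source): with
# istart=0, iend=10, stride=3 and periods=None, the first branch above gives
#   e = 0 + (10 - 0) // 3 * 3 + 3 // 2 + 1 = 9 + 1 + 1 = 11
# so np.arange(0, 11, 3) -> [0, 3, 6, 9]; the half-stride cushion keeps `end`
# included when it falls exactly on the stride grid, without overshooting.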
|
||||
|
||||
def _generate_range_overflow_safe(
|
||||
endpoint: int, periods: int, stride: int, side: str = "start"
|
||||
) -> int:
|
||||
"""
|
||||
Calculate the second endpoint for passing to np.arange, checking
|
||||
to avoid an integer overflow. Catch OverflowError and re-raise
|
||||
as OutOfBoundsDatetime.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endpoint : int
|
||||
nanosecond timestamp of the known endpoint of the desired range
|
||||
periods : int
|
||||
number of periods in the desired range
|
||||
stride : int
|
||||
nanoseconds between periods in the desired range
|
||||
side : {'start', 'end'}
|
||||
which end of the range `endpoint` refers to
|
||||
|
||||
Returns
|
||||
-------
|
||||
other_end : int
|
||||
|
||||
Raises
|
||||
------
|
||||
OutOfBoundsDatetime
|
||||
"""
|
||||
# GH#14187 raise instead of incorrectly wrapping around
|
||||
assert side in ["start", "end"]
|
||||
|
||||
i64max = np.uint64(i8max)
|
||||
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
|
||||
|
||||
with np.errstate(over="raise"):
|
||||
# if periods * strides cannot be multiplied within the *uint64* bounds,
|
||||
# we cannot salvage the operation by recursing, so raise
|
||||
try:
|
||||
addend = np.uint64(periods) * np.uint64(np.abs(stride))
|
||||
except FloatingPointError as err:
|
||||
raise OutOfBoundsDatetime(msg) from err
|
||||
|
||||
if np.abs(addend) <= i64max:
|
||||
# relatively easy case without casting concerns
|
||||
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
|
||||
|
||||
elif (endpoint > 0 and side == "start" and stride > 0) or (
|
||||
endpoint < 0 < stride and side == "end"
|
||||
):
|
||||
# no chance of not-overflowing
|
||||
raise OutOfBoundsDatetime(msg)
|
||||
|
||||
elif side == "end" and endpoint - stride <= i64max < endpoint:
|
||||
# in _generate_regular_range we added `stride` thereby overflowing
|
||||
# the bounds. Adjust to fix this.
|
||||
return _generate_range_overflow_safe(
|
||||
endpoint - stride, periods - 1, stride, side
|
||||
)
|
||||
|
||||
# split into smaller pieces
|
||||
mid_periods = periods // 2
|
||||
remaining = periods - mid_periods
|
||||
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
|
||||
|
||||
midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
|
||||
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
|
||||
|
||||
|
||||
def _generate_range_overflow_safe_signed(
|
||||
endpoint: int, periods: int, stride: int, side: str
|
||||
) -> int:
|
||||
"""
|
||||
A special case for _generate_range_overflow_safe where `periods * stride`
|
||||
can be calculated without overflowing int64 bounds.
|
||||
"""
|
||||
assert side in ["start", "end"]
|
||||
if side == "end":
|
||||
stride *= -1
|
||||
|
||||
with np.errstate(over="raise"):
|
||||
addend = np.int64(periods) * np.int64(stride)
|
||||
try:
|
||||
# easy case with no overflows
|
||||
result = np.int64(endpoint) + addend
|
||||
if result == iNaT:
|
||||
# Putting this into a DatetimeArray/TimedeltaArray
|
||||
# would incorrectly be interpreted as NaT
|
||||
raise OverflowError
|
||||
return int(result)
|
||||
except (FloatingPointError, OverflowError):
|
||||
# with endpoint negative and addend positive we risk
|
||||
# FloatingPointError; with reversed signed we risk OverflowError
|
||||
pass
|
||||
|
||||
# if stride and endpoint had opposite signs, then endpoint + addend
|
||||
# should never overflow. so they must have the same signs
|
||||
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
|
||||
|
||||
if stride > 0:
|
||||
# watch out for very special case in which we just slightly
|
||||
# exceed implementation bounds, but when passing the result to
|
||||
# np.arange will get a result slightly within the bounds
|
||||
|
||||
uresult = np.uint64(endpoint) + np.uint64(addend)
|
||||
i64max = np.uint64(i8max)
|
||||
assert uresult > i64max
|
||||
if uresult <= i64max + np.uint64(stride):
|
||||
return int(uresult)
|
||||
|
||||
raise OutOfBoundsDatetime(
|
||||
f"Cannot generate range with {side}={endpoint} and periods={periods}"
|
||||
)
|
63
lib/python3.13/site-packages/pandas/core/arrays/_utils.py
Normal file
@ -0,0 +1,63 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.errors import LossySetitemError
|
||||
|
||||
from pandas.core.dtypes.cast import np_can_hold_element
|
||||
from pandas.core.dtypes.common import is_numeric_dtype
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
def to_numpy_dtype_inference(
|
||||
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
|
||||
) -> tuple[npt.DTypeLike, Any]:
|
||||
if dtype is None and is_numeric_dtype(arr.dtype):
|
||||
dtype_given = False
|
||||
if hasna:
|
||||
if arr.dtype.kind == "b":
|
||||
dtype = np.dtype(np.object_)
|
||||
else:
|
||||
if arr.dtype.kind in "iu":
|
||||
dtype = np.dtype(np.float64)
|
||||
else:
|
||||
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
|
||||
if na_value is lib.no_default:
|
||||
na_value = np.nan
|
||||
else:
|
||||
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
|
||||
elif dtype is not None:
|
||||
dtype = np.dtype(dtype)
|
||||
dtype_given = True
|
||||
else:
|
||||
dtype_given = True
|
||||
|
||||
if na_value is lib.no_default:
|
||||
if dtype is None or not hasna:
|
||||
na_value = arr.dtype.na_value
|
||||
elif dtype.kind == "f": # type: ignore[union-attr]
|
||||
na_value = np.nan
|
||||
elif dtype.kind == "M": # type: ignore[union-attr]
|
||||
na_value = np.datetime64("nat")
|
||||
elif dtype.kind == "m": # type: ignore[union-attr]
|
||||
na_value = np.timedelta64("nat")
|
||||
else:
|
||||
na_value = arr.dtype.na_value
|
||||
|
||||
if not dtype_given and hasna:
|
||||
try:
|
||||
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
|
||||
except LossySetitemError:
|
||||
dtype = np.dtype(np.object_)
|
||||
return dtype, na_value
|
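# NOTE (editorial sketch, not part of the pandas source): for a masked numeric
# array with missing values and no explicit dtype, the inference above falls
# back to float64 with na_value=np.nan for int/uint kinds, and to object for
# bool kind; without missing values the array's own numpy_dtype is kept.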
@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
    ListAccessor,
    StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray

__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,66 @@
from __future__ import annotations

import warnings

import numpy as np
import pyarrow

from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level


def fallback_performancewarning(version: str | None = None) -> None:
    """
    Raise a PerformanceWarning for falling back to ExtensionArray's
    non-pyarrow method
    """
    msg = "Falling back on a non-pyarrow code path which may decrease performance."
    if version is not None:
        msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
    warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())


def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
    on the buffers of the Array.

    At the moment pyarrow.BooleanArray is not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays with the raw data (with specified dtype) and
        a boolean mask (validity mask, so False means missing)
    """
    dtype = np.dtype(dtype)

    if pyarrow.types.is_null(arr.type):
        # No initialization of data is needed since everything is null
        data = np.empty(len(arr), dtype=dtype)
        mask = np.zeros(len(arr), dtype=bool)
        return data, mask
    buflist = arr.buffers()
    # Since Arrow buffers might contain padding and the data might be offset,
    # the buffer gets sliced here before handing it to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    offset = arr.offset * dtype.itemsize
    length = len(arr) * dtype.itemsize
    data_buf = buflist[1][offset : offset + length]
    data = np.frombuffer(data_buf, dtype=dtype)
    bitmask = buflist[0]
    if bitmask is not None:
        mask = pyarrow.BooleanArray.from_buffers(
            pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
        )
        mask = np.asarray(mask)
    else:
        mask = np.ones(len(arr), dtype=bool)
    return data, mask
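A short, hedged usage sketch of pyarrow_array_to_numpy_and_mask above; the import path mirrors the file shown here and assumes pyarrow and numpy are installed.

import numpy as np
import pyarrow as pa

from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype("int64"))
# data is an int64 view of the Arrow data buffer; the slot behind the null
# holds unspecified filler bytes.
# mask is the validity mask, so False marks the missing position.
assert mask.tolist() == [True, False, True]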
@ -0,0 +1,473 @@
|
||||
"""Accessors for arrow-backed data."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import (
|
||||
ABCMeta,
|
||||
abstractmethod,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pandas.compat import (
|
||||
pa_version_under10p1,
|
||||
pa_version_under11p0,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
if not pa_version_under10p1:
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
|
||||
from pandas.core.dtypes.dtypes import ArrowDtype
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
|
||||
class ArrowAccessor(metaclass=ABCMeta):
|
||||
@abstractmethod
|
||||
def __init__(self, data, validation_msg: str) -> None:
|
||||
self._data = data
|
||||
self._validation_msg = validation_msg
|
||||
self._validate(data)
|
||||
|
||||
@abstractmethod
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
pass
|
||||
|
||||
def _validate(self, data):
|
||||
dtype = data.dtype
|
||||
if not isinstance(dtype, ArrowDtype):
|
||||
# Raise AttributeError so that inspect can handle non-struct Series.
|
||||
raise AttributeError(self._validation_msg.format(dtype=dtype))
|
||||
|
||||
if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
|
||||
# Raise AttributeError so that inspect can handle invalid Series.
|
||||
raise AttributeError(self._validation_msg.format(dtype=dtype))
|
||||
|
||||
@property
|
||||
def _pa_array(self):
|
||||
return self._data.array._pa_array
|
||||
|
||||
|
||||
class ListAccessor(ArrowAccessor):
|
||||
"""
|
||||
Accessor object for list data properties of the Series values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series
|
||||
Series containing Arrow list data.
|
||||
"""
|
||||
|
||||
def __init__(self, data=None) -> None:
|
||||
super().__init__(
|
||||
data,
|
||||
validation_msg="Can only use the '.list' accessor with "
|
||||
"'list[pyarrow]' dtype, not {dtype}.",
|
||||
)
|
||||
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
return (
|
||||
pa.types.is_list(pyarrow_dtype)
|
||||
or pa.types.is_fixed_size_list(pyarrow_dtype)
|
||||
or pa.types.is_large_list(pyarrow_dtype)
|
||||
)
|
||||
|
||||
def len(self) -> Series:
|
||||
"""
|
||||
Return the length of each list in the Series.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The length of each list.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list.len()
|
||||
0 3
|
||||
1 1
|
||||
dtype: int32[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
value_lengths = pc.list_value_length(self._pa_array)
|
||||
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
|
||||
|
||||
def __getitem__(self, key: int | slice) -> Series:
|
||||
"""
|
||||
Index or slice lists in the Series.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
key : int | slice
|
||||
Index or slice of indices to access from each list.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The list at requested index.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list[0]
|
||||
0 1
|
||||
1 3
|
||||
dtype: int64[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
if isinstance(key, int):
|
||||
# TODO: Support negative key but pyarrow does not allow
|
||||
# element index to be an array.
|
||||
# if key < 0:
|
||||
# key = pc.add(key, pc.list_value_length(self._pa_array))
|
||||
element = pc.list_element(self._pa_array, key)
|
||||
return Series(element, dtype=ArrowDtype(element.type))
|
||||
elif isinstance(key, slice):
|
||||
if pa_version_under11p0:
|
||||
raise NotImplementedError(
|
||||
f"List slice not supported by pyarrow {pa.__version__}."
|
||||
)
|
||||
|
||||
# TODO: Support negative start/stop/step, ideally this would be added
|
||||
# upstream in pyarrow.
|
||||
start, stop, step = key.start, key.stop, key.step
|
||||
if start is None:
|
||||
# TODO: When adding negative step support
|
||||
# this should be set to last element of array
|
||||
# when step is negative.
|
||||
start = 0
|
||||
if step is None:
|
||||
step = 1
|
||||
sliced = pc.list_slice(self._pa_array, start, stop, step)
|
||||
return Series(sliced, dtype=ArrowDtype(sliced.type))
|
||||
else:
|
||||
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
|
||||
|
||||
def __iter__(self) -> Iterator:
|
||||
raise TypeError(f"'{type(self).__name__}' object is not iterable")
|
||||
|
||||
def flatten(self) -> Series:
|
||||
"""
|
||||
Flatten list values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data from all lists in the series flattened.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... [1, 2, 3],
|
||||
... [3],
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.list_(
|
||||
... pa.int64()
|
||||
... ))
|
||||
... )
|
||||
>>> s.list.flatten()
|
||||
0 1
|
||||
1 2
|
||||
2 3
|
||||
3 3
|
||||
dtype: int64[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
flattened = pc.list_flatten(self._pa_array)
|
||||
return Series(flattened, dtype=ArrowDtype(flattened.type))
|
||||
|
||||
|
||||
class StructAccessor(ArrowAccessor):
|
||||
"""
|
||||
Accessor object for structured data properties of the Series values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series
|
||||
Series containing Arrow struct data.
|
||||
"""
|
||||
|
||||
def __init__(self, data=None) -> None:
|
||||
super().__init__(
|
||||
data,
|
||||
validation_msg=(
|
||||
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
|
||||
"dtype, not {dtype}."
|
||||
),
|
||||
)
|
||||
|
||||
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
|
||||
return pa.types.is_struct(pyarrow_dtype)
|
||||
|
||||
@property
|
||||
def dtypes(self) -> Series:
|
||||
"""
|
||||
Return the dtype object of each child field of the struct.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data type of each child field.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": 1, "project": "pandas"},
|
||||
... {"version": 2, "project": "pandas"},
|
||||
... {"version": 1, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", pa.int64()), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
>>> s.struct.dtypes
|
||||
version int64[pyarrow]
|
||||
project string[pyarrow]
|
||||
dtype: object
|
||||
"""
|
||||
from pandas import (
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
|
||||
pa_type = self._data.dtype.pyarrow_dtype
|
||||
types = [ArrowDtype(struct.type) for struct in pa_type]
|
||||
names = [struct.name for struct in pa_type]
|
||||
return Series(types, index=Index(names))
|
||||
|
||||
def field(
|
||||
self,
|
||||
name_or_index: list[str]
|
||||
| list[bytes]
|
||||
| list[int]
|
||||
| pc.Expression
|
||||
| bytes
|
||||
| str
|
||||
| int,
|
||||
) -> Series:
|
||||
"""
|
||||
Extract a child field of a struct as a Series.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name_or_index : str | bytes | int | expression | list
|
||||
Name or index of the child field to extract.
|
||||
|
||||
For list-like inputs, this will index into a nested
|
||||
struct.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
The data corresponding to the selected child field.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.struct.explode : Return all child fields as a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The name of the resulting Series will be set using the following
|
||||
rules:
|
||||
|
||||
- For string, bytes, or integer `name_or_index` (or a list of these, for
|
||||
a nested selection), the Series name is set to the selected
|
||||
field's name.
|
||||
- For a :class:`pyarrow.compute.Expression`, this is set to
|
||||
the string form of the expression.
|
||||
- For list-like `name_or_index`, the name will be set to the
|
||||
name of the final field selected.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": 1, "project": "pandas"},
|
||||
... {"version": 2, "project": "pandas"},
|
||||
... {"version": 1, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", pa.int64()), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
|
||||
Extract by field name.
|
||||
|
||||
>>> s.struct.field("project")
|
||||
0 pandas
|
||||
1 pandas
|
||||
2 numpy
|
||||
Name: project, dtype: string[pyarrow]
|
||||
|
||||
Extract by field index.
|
||||
|
||||
>>> s.struct.field(0)
|
||||
0 1
|
||||
1 2
|
||||
2 1
|
||||
Name: version, dtype: int64[pyarrow]
|
||||
|
||||
Or an expression
|
||||
|
||||
>>> import pyarrow.compute as pc
|
||||
>>> s.struct.field(pc.field("project"))
|
||||
0 pandas
|
||||
1 pandas
|
||||
2 numpy
|
||||
Name: project, dtype: string[pyarrow]
|
||||
|
||||
For nested struct types, you can pass a list of values to index
|
||||
multiple levels:
|
||||
|
||||
>>> version_type = pa.struct([
|
||||
... ("major", pa.int64()),
|
||||
... ("minor", pa.int64()),
|
||||
... ])
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
|
||||
... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
|
||||
... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", version_type), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
>>> s.struct.field(["version", "minor"])
|
||||
0 5
|
||||
1 1
|
||||
2 26
|
||||
Name: minor, dtype: int64[pyarrow]
|
||||
>>> s.struct.field([0, 0])
|
||||
0 1
|
||||
1 2
|
||||
2 1
|
||||
Name: major, dtype: int64[pyarrow]
|
||||
"""
|
||||
from pandas import Series
|
||||
|
||||
def get_name(
|
||||
level_name_or_index: list[str]
|
||||
| list[bytes]
|
||||
| list[int]
|
||||
| pc.Expression
|
||||
| bytes
|
||||
| str
|
||||
| int,
|
||||
data: pa.ChunkedArray,
|
||||
):
|
||||
if isinstance(level_name_or_index, int):
|
||||
name = data.type.field(level_name_or_index).name
|
||||
elif isinstance(level_name_or_index, (str, bytes)):
|
||||
name = level_name_or_index
|
||||
elif isinstance(level_name_or_index, pc.Expression):
|
||||
name = str(level_name_or_index)
|
||||
elif is_list_like(level_name_or_index):
|
||||
# For nested input like [2, 1, 2]
|
||||
# iteratively get the struct and field name. The last
|
||||
# one is used for the name of the index.
|
||||
level_name_or_index = list(reversed(level_name_or_index))
|
||||
selected = data
|
||||
while level_name_or_index:
|
||||
# we need the cast, otherwise mypy complains about
|
||||
# getting ints, bytes, or str here, which isn't possible.
|
||||
level_name_or_index = cast(list, level_name_or_index)
|
||||
name_or_index = level_name_or_index.pop()
|
||||
name = get_name(name_or_index, selected)
|
||||
selected = selected.type.field(selected.type.get_field_index(name))
|
||||
name = selected.name
|
||||
else:
|
||||
raise ValueError(
|
||||
"name_or_index must be an int, str, bytes, "
|
||||
"pyarrow.compute.Expression, or list of those"
|
||||
)
|
||||
return name
|
||||
|
||||
pa_arr = self._data.array._pa_array
|
||||
name = get_name(name_or_index, pa_arr)
|
||||
field_arr = pc.struct_field(pa_arr, name_or_index)
|
||||
|
||||
return Series(
|
||||
field_arr,
|
||||
dtype=ArrowDtype(field_arr.type),
|
||||
index=self._data.index,
|
||||
name=name,
|
||||
)
|
||||
|
||||
def explode(self) -> DataFrame:
|
||||
"""
|
||||
Extract all child fields of a struct as a DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.DataFrame
|
||||
The data corresponding to all child fields.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.struct.field : Return a single child field as a Series.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> s = pd.Series(
|
||||
... [
|
||||
... {"version": 1, "project": "pandas"},
|
||||
... {"version": 2, "project": "pandas"},
|
||||
... {"version": 1, "project": "numpy"},
|
||||
... ],
|
||||
... dtype=pd.ArrowDtype(pa.struct(
|
||||
... [("version", pa.int64()), ("project", pa.string())]
|
||||
... ))
|
||||
... )
|
||||
|
||||
>>> s.struct.explode()
|
||||
version project
|
||||
0 1 pandas
|
||||
1 2 pandas
|
||||
2 1 numpy
|
||||
"""
|
||||
from pandas import concat
|
||||
|
||||
pa_type = self._pa_array.type
|
||||
return concat(
|
||||
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
|
||||
)
|
2942
lib/python3.13/site-packages/pandas/core/arrays/arrow/array.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,174 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pyarrow
|
||||
|
||||
from pandas.compat import pa_version_under14p1
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
IntervalDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
|
||||
from pandas.core.arrays.interval import VALID_CLOSED
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import IntervalClosedType
|
||||
|
||||
|
||||
class ArrowPeriodType(pyarrow.ExtensionType):
|
||||
def __init__(self, freq) -> None:
|
||||
# attributes need to be set first before calling
|
||||
# super init (as that calls serialize)
|
||||
self._freq = freq
|
||||
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
|
||||
|
||||
@property
|
||||
def freq(self):
|
||||
return self._freq
|
||||
|
||||
def __arrow_ext_serialize__(self) -> bytes:
|
||||
metadata = {"freq": self.freq}
|
||||
return json.dumps(metadata).encode()
|
||||
|
||||
@classmethod
|
||||
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
|
||||
metadata = json.loads(serialized.decode())
|
||||
return ArrowPeriodType(metadata["freq"])
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, pyarrow.BaseExtensionType):
|
||||
return type(self) == type(other) and self.freq == other.freq
|
||||
else:
|
||||
return NotImplemented
|
||||
|
||||
def __ne__(self, other) -> bool:
|
||||
return not self == other
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash((str(self), self.freq))
|
||||
|
||||
def to_pandas_dtype(self) -> PeriodDtype:
|
||||
return PeriodDtype(freq=self.freq)
|
||||
|
||||
|
||||
# register the type with a dummy instance
|
||||
_period_type = ArrowPeriodType("D")
|
||||
pyarrow.register_extension_type(_period_type)
|
||||
|
||||
|
||||
class ArrowIntervalType(pyarrow.ExtensionType):
|
||||
def __init__(self, subtype, closed: IntervalClosedType) -> None:
|
||||
# attributes need to be set first before calling
|
||||
# super init (as that calls serialize)
|
||||
assert closed in VALID_CLOSED
|
||||
self._closed: IntervalClosedType = closed
|
||||
if not isinstance(subtype, pyarrow.DataType):
|
||||
subtype = pyarrow.type_for_alias(str(subtype))
|
||||
self._subtype = subtype
|
||||
|
||||
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
|
||||
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
|
||||
|
||||
@property
|
||||
def subtype(self):
|
||||
return self._subtype
|
||||
|
||||
@property
|
||||
def closed(self) -> IntervalClosedType:
|
||||
return self._closed
|
||||
|
||||
def __arrow_ext_serialize__(self) -> bytes:
|
||||
metadata = {"subtype": str(self.subtype), "closed": self.closed}
|
||||
return json.dumps(metadata).encode()
|
||||
|
||||
@classmethod
|
||||
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
|
||||
metadata = json.loads(serialized.decode())
|
||||
subtype = pyarrow.type_for_alias(metadata["subtype"])
|
||||
closed = metadata["closed"]
|
||||
return ArrowIntervalType(subtype, closed)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, pyarrow.BaseExtensionType):
|
||||
return (
|
||||
type(self) == type(other)
|
||||
and self.subtype == other.subtype
|
||||
and self.closed == other.closed
|
||||
)
|
||||
else:
|
||||
return NotImplemented
|
||||
|
||||
def __ne__(self, other) -> bool:
|
||||
return not self == other
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash((str(self), str(self.subtype), self.closed))
|
||||
|
||||
def to_pandas_dtype(self) -> IntervalDtype:
|
||||
return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
|
||||
|
||||
|
||||
# register the type with a dummy instance
|
||||
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
|
||||
pyarrow.register_extension_type(_interval_type)
|
||||
|
||||
|
||||
_ERROR_MSG = """\
|
||||
Disallowed deserialization of 'arrow.py_extension_type':
|
||||
storage_type = {storage_type}
|
||||
serialized = {serialized}
|
||||
pickle disassembly:\n{pickle_disassembly}
|
||||
|
||||
Reading of untrusted Parquet or Feather files with a PyExtensionType column
|
||||
allows arbitrary code execution.
|
||||
If you trust this file, you can enable reading the extension type by one of:
|
||||
|
||||
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
|
||||
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
|
||||
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
|
||||
|
||||
We strongly recommend updating your Parquet/Feather files to use extension types
|
||||
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
|
||||
"""
|
||||
|
||||
|
||||
def patch_pyarrow():
|
||||
# starting from pyarrow 14.0.1, it has its own mechanism
|
||||
if not pa_version_under14p1:
|
||||
return
|
||||
|
||||
# if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
|
||||
if getattr(pyarrow, "_hotfix_installed", False):
|
||||
return
|
||||
|
||||
class ForbiddenExtensionType(pyarrow.ExtensionType):
|
||||
def __arrow_ext_serialize__(self):
|
||||
return b""
|
||||
|
||||
@classmethod
|
||||
def __arrow_ext_deserialize__(cls, storage_type, serialized):
|
||||
import io
|
||||
import pickletools
|
||||
|
||||
out = io.StringIO()
|
||||
pickletools.dis(serialized, out)
|
||||
raise RuntimeError(
|
||||
_ERROR_MSG.format(
|
||||
storage_type=storage_type,
|
||||
serialized=serialized,
|
||||
pickle_disassembly=out.getvalue(),
|
||||
)
|
||||
)
|
||||
|
||||
pyarrow.unregister_extension_type("arrow.py_extension_type")
|
||||
pyarrow.register_extension_type(
|
||||
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
|
||||
)
|
||||
|
||||
pyarrow._hotfix_installed = True
|
||||
|
||||
|
||||
patch_pyarrow()
|
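A hedged sketch of the serialize/deserialize round trip for the ArrowIntervalType defined above; the import path mirrors this file, and the printed strings are illustrative.

import pyarrow as pa

from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

typ = ArrowIntervalType(pa.int64(), "left")
payload = typ.__arrow_ext_serialize__()
print(payload)  # b'{"subtype": "int64", "closed": "left"}'

rebuilt = ArrowIntervalType.__arrow_ext_deserialize__(typ.storage_type, payload)
assert rebuilt == typ
print(rebuilt.to_pandas_dtype())  # interval[int64, left]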
2588
lib/python3.13/site-packages/pandas/core/arrays/base.py
Normal file
File diff suppressed because it is too large
407
lib/python3.13/site-packages/pandas/core/arrays/boolean.py
Normal file
@ -0,0 +1,407 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numbers
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
ClassVar,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
missing as libmissing,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.dtypes.dtypes import register_extension_dtype
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.core import ops
|
||||
from pandas.core.array_algos import masked_accumulations
|
||||
from pandas.core.arrays.masked import (
|
||||
BaseMaskedArray,
|
||||
BaseMaskedDtype,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow
|
||||
|
||||
from pandas._typing import (
|
||||
Dtype,
|
||||
DtypeObj,
|
||||
Self,
|
||||
npt,
|
||||
type_t,
|
||||
)
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class BooleanDtype(BaseMaskedDtype):
|
||||
"""
|
||||
Extension dtype for boolean data.
|
||||
|
||||
.. warning::
|
||||
|
||||
BooleanDtype is considered experimental. The implementation and
|
||||
parts of the API may change without warning.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> pd.BooleanDtype()
|
||||
BooleanDtype
|
||||
"""
|
||||
|
||||
name: ClassVar[str] = "boolean"
|
||||
|
||||
# https://github.com/python/mypy/issues/4125
|
||||
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
|
||||
@property
|
||||
def type(self) -> type: # type: ignore[override]
|
||||
return np.bool_
|
||||
|
||||
@property
|
||||
def kind(self) -> str:
|
||||
return "b"
|
||||
|
||||
@property
|
||||
def numpy_dtype(self) -> np.dtype:
|
||||
return np.dtype("bool")
|
||||
|
||||
@classmethod
|
||||
def construct_array_type(cls) -> type_t[BooleanArray]:
|
||||
"""
|
||||
Return the array type associated with this dtype.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
"""
|
||||
return BooleanArray
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "BooleanDtype"
|
||||
|
||||
@property
|
||||
def _is_boolean(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def _is_numeric(self) -> bool:
|
||||
return True
|
||||
|
||||
def __from_arrow__(
|
||||
self, array: pyarrow.Array | pyarrow.ChunkedArray
|
||||
) -> BooleanArray:
|
||||
"""
|
||||
Construct BooleanArray from pyarrow Array/ChunkedArray.
|
||||
"""
|
||||
import pyarrow
|
||||
|
||||
if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
|
||||
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
|
||||
|
||||
if isinstance(array, pyarrow.Array):
|
||||
chunks = [array]
|
||||
length = len(array)
|
||||
else:
|
||||
# pyarrow.ChunkedArray
|
||||
chunks = array.chunks
|
||||
length = array.length()
|
||||
|
||||
if pyarrow.types.is_null(array.type):
|
||||
mask = np.ones(length, dtype=bool)
|
||||
# No need to init data, since all null
|
||||
data = np.empty(length, dtype=bool)
|
||||
return BooleanArray(data, mask)
|
||||
|
||||
results = []
|
||||
for arr in chunks:
|
||||
buflist = arr.buffers()
|
||||
data = pyarrow.BooleanArray.from_buffers(
|
||||
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
|
||||
).to_numpy(zero_copy_only=False)
|
||||
if arr.null_count != 0:
|
||||
mask = pyarrow.BooleanArray.from_buffers(
|
||||
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
|
||||
).to_numpy(zero_copy_only=False)
|
||||
mask = ~mask
|
||||
else:
|
||||
mask = np.zeros(len(arr), dtype=bool)
|
||||
|
||||
bool_arr = BooleanArray(data, mask)
|
||||
results.append(bool_arr)
|
||||
|
||||
if not results:
|
||||
return BooleanArray(
|
||||
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
|
||||
)
|
||||
else:
|
||||
return BooleanArray._concat_same_type(results)
|
||||
|
||||
|
||||
def coerce_to_array(
|
||||
values, mask=None, copy: bool = False
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Coerce the input values array to numpy arrays with a mask.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : 1D list-like
|
||||
mask : bool 1D array, optional
|
||||
copy : bool, default False
|
||||
if True, copy the input
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple of (values, mask)
|
||||
"""
|
||||
if isinstance(values, BooleanArray):
|
||||
if mask is not None:
|
||||
raise ValueError("cannot pass mask for BooleanArray input")
|
||||
values, mask = values._data, values._mask
|
||||
if copy:
|
||||
values = values.copy()
|
||||
mask = mask.copy()
|
||||
return values, mask
|
||||
|
||||
mask_values = None
|
||||
if isinstance(values, np.ndarray) and values.dtype == np.bool_:
|
||||
if copy:
|
||||
values = values.copy()
|
||||
elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
|
||||
mask_values = isna(values)
|
||||
|
||||
values_bool = np.zeros(len(values), dtype=bool)
|
||||
values_bool[~mask_values] = values[~mask_values].astype(bool)
|
||||
|
||||
if not np.all(
|
||||
values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
|
||||
):
|
||||
raise TypeError("Need to pass bool-like values")
|
||||
|
||||
values = values_bool
|
||||
else:
|
||||
values_object = np.asarray(values, dtype=object)
|
||||
|
||||
inferred_dtype = lib.infer_dtype(values_object, skipna=True)
|
||||
integer_like = ("floating", "integer", "mixed-integer-float")
|
||||
if inferred_dtype not in ("boolean", "empty") + integer_like:
|
||||
raise TypeError("Need to pass bool-like values")
|
||||
|
||||
# mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
|
||||
# within this branch, it assumes it can also be None
|
||||
mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
|
||||
values = np.zeros(len(values), dtype=bool)
|
||||
values[~mask_values] = values_object[~mask_values].astype(bool)
|
||||
|
||||
# if the values were integer-like, validate they were actually 0/1's
|
||||
if (inferred_dtype in integer_like) and not (
|
||||
np.all(
|
||||
values[~mask_values].astype(float)
|
||||
== values_object[~mask_values].astype(float)
|
||||
)
|
||||
):
|
||||
raise TypeError("Need to pass bool-like values")
|
||||
|
||||
if mask is None and mask_values is None:
|
||||
mask = np.zeros(values.shape, dtype=bool)
|
||||
elif mask is None:
|
||||
mask = mask_values
|
||||
else:
|
||||
if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
|
||||
if mask_values is not None:
|
||||
mask = mask | mask_values
|
||||
else:
|
||||
if copy:
|
||||
mask = mask.copy()
|
||||
else:
|
||||
mask = np.array(mask, dtype=bool)
|
||||
if mask_values is not None:
|
||||
mask = mask | mask_values
|
||||
|
||||
if values.shape != mask.shape:
|
||||
raise ValueError("values.shape and mask.shape must match")
|
||||
|
||||
return values, mask
|
||||
|
||||
|
||||
class BooleanArray(BaseMaskedArray):
|
||||
"""
|
||||
Array of boolean (True/False) data with missing values.
|
||||
|
||||
This is a pandas Extension array for boolean data, under the hood
|
||||
represented by 2 numpy arrays: a boolean array with the data and
|
||||
a boolean array with the mask (True indicating missing).
|
||||
|
||||
BooleanArray implements Kleene logic (sometimes called three-value
|
||||
logic) for logical operations. See :ref:`boolean.kleene` for more.
|
||||
|
||||
To construct a BooleanArray from generic array-like input, use
|
||||
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
|
||||
below).
|
||||
|
||||
.. warning::
|
||||
|
||||
BooleanArray is considered experimental. The implementation and
|
||||
parts of the API may change without warning.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : numpy.ndarray
|
||||
A 1-d boolean-dtype array with the data.
|
||||
mask : numpy.ndarray
|
||||
A 1-d boolean-dtype array indicating missing values (True
|
||||
indicates missing).
|
||||
copy : bool, default False
|
||||
Whether to copy the `values` and `mask` arrays.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Returns
|
||||
-------
|
||||
BooleanArray
|
||||
|
||||
Examples
|
||||
--------
|
||||
Create a BooleanArray with :func:`pandas.array`:
|
||||
|
||||
>>> pd.array([True, False, None], dtype="boolean")
|
||||
<BooleanArray>
|
||||
[True, False, <NA>]
|
||||
Length: 3, dtype: boolean
|
||||
"""
|
||||
|
||||
# The value used to fill '_data' to avoid upcasting
|
||||
_internal_fill_value = False
|
||||
# Fill values used for any/all
|
||||
# Incompatible types in assignment (expression has type "bool", base class
|
||||
# "BaseMaskedArray" defined the type as "<typing special form>")
|
||||
_truthy_value = True # type: ignore[assignment]
|
||||
_falsey_value = False # type: ignore[assignment]
|
||||
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
|
||||
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
|
||||
|
||||
@classmethod
|
||||
def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
|
||||
result = super()._simple_new(values, mask)
|
||||
result._dtype = BooleanDtype()
|
||||
return result
|
||||
|
||||
def __init__(
|
||||
self, values: np.ndarray, mask: np.ndarray, copy: bool = False
|
||||
) -> None:
|
||||
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
|
||||
raise TypeError(
|
||||
"values should be boolean numpy array. Use "
|
||||
"the 'pd.array' function instead"
|
||||
)
|
||||
self._dtype = BooleanDtype()
|
||||
super().__init__(values, mask, copy=copy)
|
||||
|
||||
@property
|
||||
def dtype(self) -> BooleanDtype:
|
||||
return self._dtype
|
||||
|
||||
@classmethod
|
||||
def _from_sequence_of_strings(
|
||||
cls,
|
||||
strings: list[str],
|
||||
*,
|
||||
dtype: Dtype | None = None,
|
||||
copy: bool = False,
|
||||
true_values: list[str] | None = None,
|
||||
false_values: list[str] | None = None,
|
||||
) -> BooleanArray:
|
||||
true_values_union = cls._TRUE_VALUES.union(true_values or [])
|
||||
false_values_union = cls._FALSE_VALUES.union(false_values or [])
|
||||
|
||||
def map_string(s) -> bool:
|
||||
if s in true_values_union:
|
||||
return True
|
||||
elif s in false_values_union:
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f"{s} cannot be cast to bool")
|
||||
|
||||
scalars = np.array(strings, dtype=object)
|
||||
mask = isna(scalars)
|
||||
scalars[~mask] = list(map(map_string, scalars[~mask]))
|
||||
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
|
||||
|
||||
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
|
||||
|
||||
@classmethod
|
||||
def _coerce_to_array(
|
||||
cls, value, *, dtype: DtypeObj, copy: bool = False
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
if dtype:
|
||||
assert dtype == "boolean"
|
||||
return coerce_to_array(value, copy=copy)
|
||||
|
||||
def _logical_method(self, other, op):
|
||||
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
|
||||
other_is_scalar = lib.is_scalar(other)
|
||||
mask = None
|
||||
|
||||
if isinstance(other, BooleanArray):
|
||||
other, mask = other._data, other._mask
|
||||
elif is_list_like(other):
|
||||
other = np.asarray(other, dtype="bool")
|
||||
if other.ndim > 1:
|
||||
raise NotImplementedError("can only perform ops with 1-d structures")
|
||||
other, mask = coerce_to_array(other, copy=False)
|
||||
elif isinstance(other, np.bool_):
|
||||
other = other.item()
|
||||
|
||||
if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
|
||||
raise TypeError(
|
||||
"'other' should be pandas.NA or a bool. "
|
||||
f"Got {type(other).__name__} instead."
|
||||
)
|
||||
|
||||
if not other_is_scalar and len(self) != len(other):
|
||||
raise ValueError("Lengths must match")
|
||||
|
||||
if op.__name__ in {"or_", "ror_"}:
|
||||
result, mask = ops.kleene_or(self._data, other, self._mask, mask)
|
||||
elif op.__name__ in {"and_", "rand_"}:
|
||||
result, mask = ops.kleene_and(self._data, other, self._mask, mask)
|
||||
else:
|
||||
# i.e. xor, rxor
|
||||
result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
|
||||
|
||||
# i.e. BooleanArray
|
||||
return self._maybe_mask_result(result, mask)
|
||||
|
||||
def _accumulate(
|
||||
self, name: str, *, skipna: bool = True, **kwargs
|
||||
) -> BaseMaskedArray:
|
||||
data = self._data
|
||||
mask = self._mask
|
||||
if name in ("cummin", "cummax"):
|
||||
op = getattr(masked_accumulations, name)
|
||||
data, mask = op(data, mask, skipna=skipna, **kwargs)
|
||||
return self._simple_new(data, mask)
|
||||
else:
|
||||
from pandas.core.arrays import IntegerArray
|
||||
|
||||
return IntegerArray(data.astype(int), mask)._accumulate(
|
||||
name, skipna=skipna, **kwargs
|
||||
)
|
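To round off boolean.py, here is a brief, hedged sketch of how coerce_to_array defined in this file treats bool-like input with a missing value; the import path mirrors the file and the expected arrays are noted in comments.

import numpy as np

from pandas.core.arrays.boolean import coerce_to_array

values, mask = coerce_to_array([True, False, None])
# values -> array([ True, False, False])  (filler value in the masked slot)
# mask   -> array([False, False,  True])  (True marks the missing position)
assert values.dtype == np.bool_
assert mask.tolist() == [False, False, True]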
3070
lib/python3.13/site-packages/pandas/core/arrays/categorical.py
Normal file
File diff suppressed because it is too large
2556
lib/python3.13/site-packages/pandas/core/arrays/datetimelike.py
Normal file
File diff suppressed because it is too large
2820
lib/python3.13/site-packages/pandas/core/arrays/datetimes.py
Normal file
File diff suppressed because it is too large
173
lib/python3.13/site-packages/pandas/core/arrays/floating.py
Normal file
@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import ClassVar
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.base import register_extension_dtype
|
||||
from pandas.core.dtypes.common import is_float_dtype
|
||||
|
||||
from pandas.core.arrays.numeric import (
|
||||
NumericArray,
|
||||
NumericDtype,
|
||||
)
|
||||
|
||||
|
||||
class FloatingDtype(NumericDtype):
|
||||
"""
|
||||
An ExtensionDtype to hold a single size of floating dtype.
|
||||
|
||||
These specific implementations are subclasses of the non-public
|
||||
FloatingDtype. For example we have Float32Dtype to represent float32.
|
||||
|
||||
The attributes name & type are set when these subclasses are created.
|
||||
"""
|
||||
|
||||
_default_np_dtype = np.dtype(np.float64)
|
||||
_checker = is_float_dtype
|
||||
|
||||
@classmethod
|
||||
def construct_array_type(cls) -> type[FloatingArray]:
|
||||
"""
|
||||
Return the array type associated with this dtype.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
"""
|
||||
return FloatingArray
|
||||
|
||||
@classmethod
|
||||
def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
|
||||
return NUMPY_FLOAT_TO_DTYPE
|
||||
|
||||
@classmethod
|
||||
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
|
||||
"""
|
||||
Safely cast the values to the given dtype.
|
||||
|
||||
"safe" in this context means the casting is lossless.
|
||||
"""
|
||||
# This is really only here for compatibility with IntegerDtype
|
||||
return values.astype(dtype, copy=copy)
|
||||
|
||||
|
||||
class FloatingArray(NumericArray):
|
||||
"""
|
||||
Array of floating (optional missing) values.
|
||||
|
||||
.. warning::
|
||||
|
||||
FloatingArray is currently experimental, and its API or internal
|
||||
implementation may change without warning. Especially the behaviour
|
||||
regarding NaN (distinct from NA missing values) is subject to change.
|
||||
|
||||
We represent a FloatingArray with 2 numpy arrays:
|
||||
|
||||
- data: contains a numpy float array of the appropriate dtype
|
||||
- mask: a boolean array holding a mask on the data, True is missing
|
||||
|
||||
To construct a FloatingArray from generic array-like input, use
|
||||
:func:`pandas.array` with one of the float dtypes (see examples).
|
||||
|
||||
See :ref:`integer_na` for more.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : numpy.ndarray
|
||||
A 1-d float-dtype array.
|
||||
mask : numpy.ndarray
|
||||
A 1-d boolean-dtype array indicating missing values.
|
||||
copy : bool, default False
|
||||
Whether to copy the `values` and `mask`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Returns
|
||||
-------
|
||||
FloatingArray
|
||||
|
||||
Examples
|
||||
--------
|
||||
Create a FloatingArray with :func:`pandas.array`:
|
||||
|
||||
>>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
|
||||
<FloatingArray>
|
||||
[0.1, <NA>, 0.3]
|
||||
Length: 3, dtype: Float32
|
||||
|
||||
String aliases for the dtypes are also available. They are capitalized.
|
||||
|
||||
>>> pd.array([0.1, None, 0.3], dtype="Float32")
|
||||
<FloatingArray>
|
||||
[0.1, <NA>, 0.3]
|
||||
Length: 3, dtype: Float32
|
||||
"""
|
||||
|
||||
_dtype_cls = FloatingDtype
|
||||
|
||||
# The value used to fill '_data' to avoid upcasting
|
||||
_internal_fill_value = np.nan
|
||||
# Fill values used for any/all
|
||||
# Incompatible types in assignment (expression has type "float", base class
|
||||
# "BaseMaskedArray" defined the type as "<typing special form>")
|
||||
_truthy_value = 1.0 # type: ignore[assignment]
|
||||
_falsey_value = 0.0 # type: ignore[assignment]
|
||||
|
||||
|
||||
_dtype_docstring = """
|
||||
An ExtensionDtype for {dtype} data.
|
||||
|
||||
This dtype uses ``pd.NA`` as missing value indicator.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Examples
|
||||
--------
|
||||
For Float32Dtype:
|
||||
|
||||
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
|
||||
>>> ser.dtype
|
||||
Float32Dtype()
|
||||
|
||||
For Float64Dtype:
|
||||
|
||||
>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
|
||||
>>> ser.dtype
|
||||
Float64Dtype()
|
||||
"""
|
||||
|
||||
# create the Dtype
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Float32Dtype(FloatingDtype):
|
||||
type = np.float32
|
||||
name: ClassVar[str] = "Float32"
|
||||
__doc__ = _dtype_docstring.format(dtype="float32")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Float64Dtype(FloatingDtype):
|
||||
type = np.float64
|
||||
name: ClassVar[str] = "Float64"
|
||||
__doc__ = _dtype_docstring.format(dtype="float64")
|
||||
|
||||
|
||||
NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
|
||||
np.dtype(np.float32): Float32Dtype(),
|
||||
np.dtype(np.float64): Float64Dtype(),
|
||||
}
|
272
lib/python3.13/site-packages/pandas/core/arrays/integer.py
Normal file
@ -0,0 +1,272 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import ClassVar
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.base import register_extension_dtype
|
||||
from pandas.core.dtypes.common import is_integer_dtype
|
||||
|
||||
from pandas.core.arrays.numeric import (
|
||||
NumericArray,
|
||||
NumericDtype,
|
||||
)
|
||||
|
||||
|
||||
class IntegerDtype(NumericDtype):
|
||||
"""
|
||||
An ExtensionDtype to hold a single size & kind of integer dtype.
|
||||
|
||||
These specific implementations are subclasses of the non-public
|
||||
IntegerDtype. For example, we have Int8Dtype to represent signed int 8s.
|
||||
|
||||
The attributes name & type are set when these subclasses are created.
|
||||
"""
|
||||
|
||||
_default_np_dtype = np.dtype(np.int64)
|
||||
_checker = is_integer_dtype
|
||||
|
||||
@classmethod
|
||||
def construct_array_type(cls) -> type[IntegerArray]:
|
||||
"""
|
||||
Return the array type associated with this dtype.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
"""
|
||||
return IntegerArray
|
||||
|
||||
@classmethod
|
||||
def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
|
||||
return NUMPY_INT_TO_DTYPE
|
||||
|
||||
@classmethod
|
||||
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
|
||||
"""
|
||||
Safely cast the values to the given dtype.
|
||||
|
||||
"safe" in this context means the casting is lossless. e.g. if 'values'
|
||||
has a floating dtype, each value must be an integer.
|
||||
"""
|
||||
try:
|
||||
return values.astype(dtype, casting="safe", copy=copy)
|
||||
except TypeError as err:
|
||||
casted = values.astype(dtype, copy=copy)
|
||||
if (casted == values).all():
|
||||
return casted
|
||||
|
||||
raise TypeError(
|
||||
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
|
||||
) from err
|
||||
|
||||
|
||||
class IntegerArray(NumericArray):
|
||||
"""
|
||||
Array of integer (optional missing) values.
|
||||
|
||||
Uses :attr:`pandas.NA` as the missing value.
|
||||
|
||||
.. warning::
|
||||
|
||||
IntegerArray is currently experimental, and its API or internal
|
||||
implementation may change without warning.
|
||||
|
||||
We represent an IntegerArray with 2 numpy arrays:
|
||||
|
||||
- data: contains a numpy integer array of the appropriate dtype
|
||||
- mask: a boolean array holding a mask on the data, True is missing
|
||||
|
||||
To construct an IntegerArray from generic array-like input, use
|
||||
:func:`pandas.array` with one of the integer dtypes (see examples).
|
||||
|
||||
See :ref:`integer_na` for more.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : numpy.ndarray
|
||||
A 1-d integer-dtype array.
|
||||
mask : numpy.ndarray
|
||||
A 1-d boolean-dtype array indicating missing values.
|
||||
copy : bool, default False
|
||||
Whether to copy the `values` and `mask`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Returns
|
||||
-------
|
||||
IntegerArray
|
||||
|
||||
Examples
|
||||
--------
|
||||
Create an IntegerArray with :func:`pandas.array`.
|
||||
|
||||
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
|
||||
>>> int_array
|
||||
<IntegerArray>
|
||||
[1, <NA>, 3]
|
||||
Length: 3, dtype: Int32
|
||||
|
||||
String aliases for the dtypes are also available. They are capitalized.
|
||||
|
||||
>>> pd.array([1, None, 3], dtype='Int32')
|
||||
<IntegerArray>
|
||||
[1, <NA>, 3]
|
||||
Length: 3, dtype: Int32
|
||||
|
||||
>>> pd.array([1, None, 3], dtype='UInt16')
|
||||
<IntegerArray>
|
||||
[1, <NA>, 3]
|
||||
Length: 3, dtype: UInt16
|
||||
"""
|
||||
|
||||
_dtype_cls = IntegerDtype
|
||||
|
||||
# The value used to fill '_data' to avoid upcasting
|
||||
_internal_fill_value = 1
|
||||
# Fill values used for any/all
|
||||
# Incompatible types in assignment (expression has type "int", base class
|
||||
# "BaseMaskedArray" defined the type as "<typing special form>")
|
||||
_truthy_value = 1 # type: ignore[assignment]
|
||||
_falsey_value = 0 # type: ignore[assignment]
|
||||
|
||||
|
||||
_dtype_docstring = """
|
||||
An ExtensionDtype for {dtype} integer data.
|
||||
|
||||
Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Examples
|
||||
--------
|
||||
For Int8Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
|
||||
>>> ser.dtype
|
||||
Int8Dtype()
|
||||
|
||||
For Int16Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
|
||||
>>> ser.dtype
|
||||
Int16Dtype()
|
||||
|
||||
For Int32Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
|
||||
>>> ser.dtype
|
||||
Int32Dtype()
|
||||
|
||||
For Int64Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
|
||||
>>> ser.dtype
|
||||
Int64Dtype()
|
||||
|
||||
For UInt8Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
|
||||
>>> ser.dtype
|
||||
UInt8Dtype()
|
||||
|
||||
For UInt16Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
|
||||
>>> ser.dtype
|
||||
UInt16Dtype()
|
||||
|
||||
For UInt32Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
|
||||
>>> ser.dtype
|
||||
UInt32Dtype()
|
||||
|
||||
For UInt64Dtype:
|
||||
|
||||
>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
|
||||
>>> ser.dtype
|
||||
UInt64Dtype()
|
||||
"""
|
||||
|
||||
# create the Dtype
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Int8Dtype(IntegerDtype):
|
||||
type = np.int8
|
||||
name: ClassVar[str] = "Int8"
|
||||
__doc__ = _dtype_docstring.format(dtype="int8")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Int16Dtype(IntegerDtype):
|
||||
type = np.int16
|
||||
name: ClassVar[str] = "Int16"
|
||||
__doc__ = _dtype_docstring.format(dtype="int16")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Int32Dtype(IntegerDtype):
|
||||
type = np.int32
|
||||
name: ClassVar[str] = "Int32"
|
||||
__doc__ = _dtype_docstring.format(dtype="int32")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class Int64Dtype(IntegerDtype):
|
||||
type = np.int64
|
||||
name: ClassVar[str] = "Int64"
|
||||
__doc__ = _dtype_docstring.format(dtype="int64")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class UInt8Dtype(IntegerDtype):
|
||||
type = np.uint8
|
||||
name: ClassVar[str] = "UInt8"
|
||||
__doc__ = _dtype_docstring.format(dtype="uint8")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class UInt16Dtype(IntegerDtype):
|
||||
type = np.uint16
|
||||
name: ClassVar[str] = "UInt16"
|
||||
__doc__ = _dtype_docstring.format(dtype="uint16")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class UInt32Dtype(IntegerDtype):
|
||||
type = np.uint32
|
||||
name: ClassVar[str] = "UInt32"
|
||||
__doc__ = _dtype_docstring.format(dtype="uint32")
|
||||
|
||||
|
||||
@register_extension_dtype
|
||||
class UInt64Dtype(IntegerDtype):
|
||||
type = np.uint64
|
||||
name: ClassVar[str] = "UInt64"
|
||||
__doc__ = _dtype_docstring.format(dtype="uint64")
|
||||
|
||||
|
||||
NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
|
||||
np.dtype(np.int8): Int8Dtype(),
|
||||
np.dtype(np.int16): Int16Dtype(),
|
||||
np.dtype(np.int32): Int32Dtype(),
|
||||
np.dtype(np.int64): Int64Dtype(),
|
||||
np.dtype(np.uint8): UInt8Dtype(),
|
||||
np.dtype(np.uint16): UInt16Dtype(),
|
||||
np.dtype(np.uint32): UInt32Dtype(),
|
||||
np.dtype(np.uint64): UInt64Dtype(),
|
||||
}
|
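A brief, hedged illustration of the lossless-cast rule that IntegerDtype._safe_cast above enforces, using only the public pd.array constructor.

import pandas as pd

# Float input that round-trips exactly is accepted and stored as Int64.
ok = pd.array([1.0, 2.0, None], dtype="Int64")
print(ok)  # [1, 2, <NA>]

# Float input that would lose precision is rejected rather than truncated.
try:
    pd.array([1.5, None], dtype="Int64")
except TypeError as err:
    print(err)  # cannot safely cast non-equivalent values ... to int64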
1917
lib/python3.13/site-packages/pandas/core/arrays/interval.py
Normal file
File diff suppressed because it is too large
1650
lib/python3.13/site-packages/pandas/core/arrays/masked.py
Normal file
File diff suppressed because it is too large
286
lib/python3.13/site-packages/pandas/core/arrays/numeric.py
Normal file
@ -0,0 +1,286 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numbers
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
missing as libmissing,
|
||||
)
|
||||
from pandas.errors import AbstractMethodError
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer_dtype,
|
||||
is_string_dtype,
|
||||
pandas_dtype,
|
||||
)
|
||||
|
||||
from pandas.core.arrays.masked import (
|
||||
BaseMaskedArray,
|
||||
BaseMaskedDtype,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Mapping
|
||||
|
||||
import pyarrow
|
||||
|
||||
from pandas._typing import (
|
||||
Dtype,
|
||||
DtypeObj,
|
||||
Self,
|
||||
npt,
|
||||
)
|
||||
|
||||
|
||||
class NumericDtype(BaseMaskedDtype):
|
||||
_default_np_dtype: np.dtype
|
||||
_checker: Callable[[Any], bool] # is_foo_dtype
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{self.name}Dtype()"
|
||||
|
||||
@cache_readonly
|
||||
def is_signed_integer(self) -> bool:
|
||||
return self.kind == "i"
|
||||
|
||||
@cache_readonly
|
||||
def is_unsigned_integer(self) -> bool:
|
||||
return self.kind == "u"
|
||||
|
||||
@property
|
||||
def _is_numeric(self) -> bool:
|
||||
return True
|
||||
|
||||
def __from_arrow__(
|
||||
self, array: pyarrow.Array | pyarrow.ChunkedArray
|
||||
) -> BaseMaskedArray:
|
||||
"""
|
||||
Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
|
||||
"""
|
||||
import pyarrow
|
||||
|
||||
from pandas.core.arrays.arrow._arrow_utils import (
|
||||
pyarrow_array_to_numpy_and_mask,
|
||||
)
|
||||
|
||||
array_class = self.construct_array_type()
|
||||
|
||||
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
|
||||
if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
|
||||
array.type
|
||||
):
|
||||
# test_from_arrow_type_error raise for string, but allow
|
||||
# through itemsize conversion GH#31896
|
||||
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
|
||||
if rt_dtype.kind not in "iuf":
|
||||
# Could allow "c" or potentially disallow float<->int conversion,
|
||||
# but at the moment we specifically test that uint<->int works
|
||||
raise TypeError(
|
||||
f"Expected array of {self} type, got {array.type} instead"
|
||||
)
|
||||
|
||||
array = array.cast(pyarrow_type)
|
||||
|
||||
if isinstance(array, pyarrow.ChunkedArray):
|
||||
# TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
|
||||
# combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
|
||||
if array.num_chunks == 0:
|
||||
array = pyarrow.array([], type=array.type)
|
||||
else:
|
||||
array = array.combine_chunks()
|
||||
|
||||
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
|
||||
return array_class(data.copy(), ~mask, copy=False)
|
||||
|
||||
@classmethod
|
||||
def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
|
||||
raise AbstractMethodError(cls)
|
||||
|
||||
@classmethod
|
||||
def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
|
||||
"""
|
||||
Convert a string representation or a numpy dtype to NumericDtype.
|
||||
"""
|
||||
if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
|
||||
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
|
||||
# https://github.com/numpy/numpy/pull/7476
|
||||
dtype = dtype.lower()
|
||||
|
||||
if not isinstance(dtype, NumericDtype):
|
||||
mapping = cls._get_dtype_mapping()
|
||||
try:
|
||||
dtype = mapping[np.dtype(dtype)]
|
||||
except KeyError as err:
|
||||
raise ValueError(f"invalid dtype specified {dtype}") from err
|
||||
return dtype
|
||||
|
||||
@classmethod
|
||||
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
|
||||
"""
|
||||
Safely cast the values to the given dtype.
|
||||
|
||||
"safe" in this context means the casting is lossless.
|
||||
"""
|
||||
raise AbstractMethodError(cls)
|
||||
|
||||
|
||||
def _coerce_to_data_and_mask(
|
||||
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
|
||||
):
|
||||
checker = dtype_cls._checker
|
||||
|
||||
mask = None
|
||||
inferred_type = None
|
||||
|
||||
if dtype is None and hasattr(values, "dtype"):
|
||||
if checker(values.dtype):
|
||||
dtype = values.dtype
|
||||
|
||||
if dtype is not None:
|
||||
dtype = dtype_cls._standardize_dtype(dtype)
|
||||
|
||||
cls = dtype_cls.construct_array_type()
|
||||
if isinstance(values, cls):
|
||||
values, mask = values._data, values._mask
|
||||
if dtype is not None:
|
||||
values = values.astype(dtype.numpy_dtype, copy=False)
|
||||
|
||||
if copy:
|
||||
values = values.copy()
|
||||
mask = mask.copy()
|
||||
return values, mask, dtype, inferred_type
|
||||
|
||||
original = values
|
||||
if not copy:
|
||||
values = np.asarray(values)
|
||||
else:
|
||||
values = np.array(values, copy=copy)
|
||||
inferred_type = None
|
||||
if values.dtype == object or is_string_dtype(values.dtype):
|
||||
inferred_type = lib.infer_dtype(values, skipna=True)
|
||||
if inferred_type == "boolean" and dtype is None:
|
||||
name = dtype_cls.__name__.strip("_")
|
||||
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||||
|
||||
elif values.dtype.kind == "b" and checker(dtype):
|
||||
if not copy:
|
||||
values = np.asarray(values, dtype=default_dtype)
|
||||
else:
|
||||
values = np.array(values, dtype=default_dtype, copy=copy)
|
||||
|
||||
elif values.dtype.kind not in "iuf":
|
||||
name = dtype_cls.__name__.strip("_")
|
||||
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||||
|
||||
if values.ndim != 1:
|
||||
raise TypeError("values must be a 1D list-like")
|
||||
|
||||
if mask is None:
|
||||
if values.dtype.kind in "iu":
|
||||
# fastpath
|
||||
mask = np.zeros(len(values), dtype=np.bool_)
|
||||
else:
|
||||
mask = libmissing.is_numeric_na(values)
|
||||
else:
|
||||
assert len(mask) == len(values)
|
||||
|
||||
if mask.ndim != 1:
|
||||
raise TypeError("mask must be a 1D list-like")
|
||||
|
||||
# infer dtype if needed
|
||||
if dtype is None:
|
||||
dtype = default_dtype
|
||||
else:
|
||||
dtype = dtype.numpy_dtype
|
||||
|
||||
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
|
||||
if mask.all():
|
||||
values = np.ones(values.shape, dtype=dtype)
|
||||
else:
|
||||
idx = np.nanargmax(values)
|
||||
if int(values[idx]) != original[idx]:
|
||||
# We have ints that lost precision during the cast.
|
||||
inferred_type = lib.infer_dtype(original, skipna=True)
|
||||
if (
|
||||
inferred_type not in ["floating", "mixed-integer-float"]
|
||||
and not mask.any()
|
||||
):
|
||||
values = np.asarray(original, dtype=dtype)
|
||||
else:
|
||||
values = np.asarray(original, dtype="object")
|
||||
|
||||
# we copy as need to coerce here
|
||||
if mask.any():
|
||||
values = values.copy()
|
||||
values[mask] = cls._internal_fill_value
|
||||
if inferred_type in ("string", "unicode"):
|
||||
# casts from str are always safe since they raise
|
||||
# a ValueError if the str cannot be parsed into a float
|
||||
values = values.astype(dtype, copy=copy)
|
||||
else:
|
||||
values = dtype_cls._safe_cast(values, dtype, copy=False)
|
||||
|
||||
return values, mask, dtype, inferred_type
|
||||
|
||||
|
||||
class NumericArray(BaseMaskedArray):
|
||||
"""
|
||||
Base class for IntegerArray and FloatingArray.
|
||||
"""
|
||||
|
||||
_dtype_cls: type[NumericDtype]
|
||||
|
||||
def __init__(
|
||||
self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
|
||||
) -> None:
|
||||
checker = self._dtype_cls._checker
|
||||
if not (isinstance(values, np.ndarray) and checker(values.dtype)):
|
||||
descr = (
|
||||
"floating"
|
||||
if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
|
||||
else "integer"
|
||||
)
|
||||
raise TypeError(
|
||||
f"values should be {descr} numpy array. Use "
|
||||
"the 'pd.array' function instead"
|
||||
)
|
||||
if values.dtype == np.float16:
|
||||
# If we don't raise here, then accessing self.dtype would raise
|
||||
raise TypeError("FloatingArray does not support np.float16 dtype.")
|
||||
|
||||
super().__init__(values, mask, copy=copy)
|
||||
|
||||
@cache_readonly
|
||||
def dtype(self) -> NumericDtype:
|
||||
mapping = self._dtype_cls._get_dtype_mapping()
|
||||
return mapping[self._data.dtype]
|
||||
|
||||
@classmethod
|
||||
def _coerce_to_array(
|
||||
cls, value, *, dtype: DtypeObj, copy: bool = False
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
dtype_cls = cls._dtype_cls
|
||||
default_dtype = dtype_cls._default_np_dtype
|
||||
values, mask, _, _ = _coerce_to_data_and_mask(
|
||||
value, dtype, copy, dtype_cls, default_dtype
|
||||
)
|
||||
return values, mask
|
||||
|
||||
@classmethod
|
||||
def _from_sequence_of_strings(
|
||||
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
|
||||
) -> Self:
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
|
||||
scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
|
||||
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
|
||||
|
||||
_HANDLED_TYPES = (np.ndarray, numbers.Number)
|
Some files were not shown because too many files have changed in this diff.