Updated script that can be controlled by a Node.js web app
lib/python3.13/site-packages/pandas/core/arrays/__init__.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionOpsMixin,
    ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.arrays.period import (
    PeriodArray,
    period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
    "ArrowExtensionArray",
    "ExtensionArray",
    "ExtensionOpsMixin",
    "ExtensionScalarOpsMixin",
    "ArrowStringArray",
    "BaseMaskedArray",
    "BooleanArray",
    "Categorical",
    "DatetimeArray",
    "FloatingArray",
    "IntegerArray",
    "IntervalArray",
    "NumpyExtensionArray",
    "PeriodArray",
    "period_array",
    "SparseArray",
    "StringArray",
    "TimedeltaArray",
]
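For reference, a minimal sketch of how the array classes exported above are reached through the public constructors (assumes pandas is installed; not part of the vendored file):

import pandas as pd

bools = pd.array([True, None, False], dtype="boolean")  # BooleanArray
ints = pd.array([1, 2, None], dtype="Int64")            # IntegerArray
cats = pd.Categorical(["a", "b", "a"])                  # Categorical
print(type(bools).__name__, type(ints).__name__, type(cats).__name__)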
20 binary files not shown.
@@ -0,0 +1,84 @@
from __future__ import annotations

from typing import Literal

import numpy as np

from pandas.compat import pa_version_under10p1

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc


class ArrowStringArrayMixin:
    _pa_array = None

    def __init__(self, *args, **kwargs) -> None:
        raise NotImplementedError

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            pa_pad = pc.utf8_center
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._pa_array)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._pa_array, start=start, stop=stop, step=step
        )
        null_value = pa.scalar(
            None, type=self._pa_array.type  # type: ignore[attr-defined]
        )
        result = pc.if_else(not_out_of_bounds, selected, null_value)
        return type(self)(result)

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        if stop is None:
            stop = np.iinfo(np.int64).max
        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))

    def _str_capitalize(self):
        return type(self)(pc.utf8_capitalize(self._pa_array))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._pa_array))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._pa_array))

    def _str_removesuffix(self, suffix: str):
        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._pa_array)
        return type(self)(result)
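For reference, a hedged sketch of the pyarrow.compute kernels that _str_pad and _str_removesuffix above delegate to (assumes pyarrow >= 10.1; the sample strings are illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["ab", "abc_v2", None])
# _str_pad(side="left") wraps pc.utf8_lpad
print(pc.utf8_lpad(arr, width=5, padding="*"))
# _str_removesuffix combines ends_with, utf8_slice_codeunits, and if_else
suffix = "_v2"
ends = pc.ends_with(arr, pattern=suffix)
removed = pc.utf8_slice_codeunits(arr, 0, stop=-len(suffix))
print(pc.if_else(ends, removed, arr))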
lib/python3.13/site-packages/pandas/core/arrays/_mixins.py (new file, 547 lines)
@@ -0,0 +1,547 @@
from __future__ import annotations

from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    cast,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    F,
    FillnaOptions,
    PositionalIndexer2D,
    PositionalIndexerTuple,
    ScalarIndexer,
    Self,
    SequenceIndexer,
    Shape,
    TakeIndexer,
    npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_insert_loc,
)

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
    PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent

from pandas.core import missing
from pandas.core.algorithms import (
    take,
    unique,
    value_counts_internal as value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series


def ravel_compat(meth: F) -> F:
    """
    Decorator to ravel a 2D array before passing it to a cython operation,
    then reshape the result to our own shape.
    """

    @wraps(meth)
    def method(self, *args, **kwargs):
        if self.ndim == 1:
            return meth(self, *args, **kwargs)

        flags = self._ndarray.flags
        flat = self.ravel("K")
        result = meth(flat, *args, **kwargs)
        order = "F" if flags.f_contiguous else "C"
        return result.reshape(self.shape, order=order)

    return cast(F, method)


class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
    """
    ExtensionArray that is backed by a single NumPy ndarray.
    """

    _ndarray: np.ndarray

    # scalar used to denote NA value inside our self._ndarray, e.g. -1
    # for Categorical, iNaT for Period. Outside of object dtype,
    # self.isna() should be exactly locations in self._ndarray with
    # _internal_fill_value.
    _internal_fill_value: Any

    def _box_func(self, x):
        """
        Wrap numpy type in our dtype.type if necessary.
        """
        return x

    def _validate_scalar(self, value):
        # used by NDArrayBackedExtensionIndex.insert
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------

    def view(self, dtype: Dtype | None = None) -> ArrayLike:
        # We handle datetime64, datetime64tz, timedelta64, and period
        # dtypes here. Everything else we pass through to the underlying
        # ndarray.
        if dtype is None or dtype is self.dtype:
            return self._from_backing_data(self._ndarray)

        if isinstance(dtype, type):
            # we sometimes pass non-dtype objects, e.g np.ndarray;
            # pass those through to the underlying ndarray
            return self._ndarray.view(dtype)

        dtype = pandas_dtype(dtype)
        arr = self._ndarray

        if isinstance(dtype, PeriodDtype):
            cls = dtype.construct_array_type()
            return cls(arr.view("i8"), dtype=dtype)
        elif isinstance(dtype, DatetimeTZDtype):
            dt_cls = dtype.construct_array_type()
            dt64_values = arr.view(f"M8[{dtype.unit}]")
            return dt_cls._simple_new(dt64_values, dtype=dtype)
        elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
            from pandas.core.arrays import DatetimeArray

            dt64_values = arr.view(dtype)
            return DatetimeArray._simple_new(dt64_values, dtype=dtype)

        elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
            from pandas.core.arrays import TimedeltaArray

            td64_values = arr.view(dtype)
            return TimedeltaArray._simple_new(td64_values, dtype=dtype)

        # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
        # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
        # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
        # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
        return arr.view(dtype=dtype)  # type: ignore[arg-type]

    def take(
        self,
        indices: TakeIndexer,
        *,
        allow_fill: bool = False,
        fill_value: Any = None,
        axis: AxisInt = 0,
    ) -> Self:
        if allow_fill:
            fill_value = self._validate_scalar(fill_value)

        new_data = take(
            self._ndarray,
            indices,
            allow_fill=allow_fill,
            fill_value=fill_value,
            axis=axis,
        )
        return self._from_backing_data(new_data)

    # ------------------------------------------------------------------------

    def equals(self, other) -> bool:
        if type(self) is not type(other):
            return False
        if self.dtype != other.dtype:
            return False
        return bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True))

    @classmethod
    def _from_factorized(cls, values, original):
        assert values.dtype == original._ndarray.dtype
        return original._from_backing_data(values)

    def _values_for_argsort(self) -> np.ndarray:
        return self._ndarray

    def _values_for_factorize(self):
        return self._ndarray, self._internal_fill_value

    def _hash_pandas_object(
        self, *, encoding: str, hash_key: str, categorize: bool
    ) -> npt.NDArray[np.uint64]:
        from pandas.core.util.hashing import hash_array

        values = self._ndarray
        return hash_array(
            values, encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    # Signature of "argmin" incompatible with supertype "ExtensionArray"
    def argmin(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmin", axis=axis)

    # Signature of "argmax" incompatible with supertype "ExtensionArray"
    def argmax(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
        # override base class by adding axis keyword
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return nargminmax(self, "argmax", axis=axis)

    def unique(self) -> Self:
        new_data = unique(self._ndarray)
        return self._from_backing_data(new_data)

    @classmethod
    @doc(ExtensionArray._concat_same_type)
    def _concat_same_type(
        cls,
        to_concat: Sequence[Self],
        axis: AxisInt = 0,
    ) -> Self:
        if not lib.dtypes_all_equal([x.dtype for x in to_concat]):
            dtypes = {str(x.dtype) for x in to_concat}
            raise ValueError("to_concat must have the same dtype", dtypes)

        return super()._concat_same_type(to_concat, axis=axis)

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        npvalue = self._validate_setitem_value(value)
        return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)

    @doc(ExtensionArray.shift)
    def shift(self, periods: int = 1, fill_value=None):
        # NB: shift is always along axis=0
        axis = 0
        fill_value = self._validate_scalar(fill_value)
        new_values = shift(self._ndarray, periods, axis, fill_value)

        return self._from_backing_data(new_values)

    def __setitem__(self, key, value) -> None:
        key = check_array_indexer(self, key)
        value = self._validate_setitem_value(value)
        self._ndarray[key] = value

    def _validate_setitem_value(self, value):
        return value

    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self,
        key: SequenceIndexer | PositionalIndexerTuple,
    ) -> Self:
        ...

    def __getitem__(
        self,
        key: PositionalIndexer2D,
    ) -> Self | Any:
        if lib.is_integer(key):
            # fast-path
            result = self._ndarray[key]
            if self.ndim == 1:
                return self._box_func(result)
            return self._from_backing_data(result)

        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[int, slice, ndarray]")
        key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
        key = check_array_indexer(self, key)
        result = self._ndarray[key]
        if lib.is_scalar(result):
            return self._box_func(result)

        result = self._from_backing_data(result)
        return result

    def _fill_mask_inplace(
        self, method: str, limit: int | None, mask: npt.NDArray[np.bool_]
    ) -> None:
        # (for now) when self.ndim == 2, we assume axis=0
        func = missing.get_fill_func(method, ndim=self.ndim)
        func(self._ndarray.T, limit=limit, mask=mask.T)

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        mask = self.isna()
        if mask.any():
            # (for now) when self.ndim == 2, we assume axis=0
            func = missing.get_fill_func(method, ndim=self.ndim)

            npvalues = self._ndarray.T
            if copy:
                npvalues = npvalues.copy()
            func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T)
            npvalues = npvalues.T

            if copy:
                new_values = self._from_backing_data(npvalues)
            else:
                new_values = self

        else:
            if copy:
                new_values = self.copy()
            else:
                new_values = self
        return new_values

    @doc(ExtensionArray.fillna)
    def fillna(
        self, value=None, method=None, limit: int | None = None, copy: bool = True
    ) -> Self:
        value, method = validate_fillna_kwargs(
            value, method, validate_scalar_dict_value=False
        )

        mask = self.isna()
        # error: Argument 2 to "check_value_size" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        value = missing.check_value_size(
            value, mask, len(self)  # type: ignore[arg-type]
        )

        if mask.any():
            if method is not None:
                # (for now) when self.ndim == 2, we assume axis=0
                func = missing.get_fill_func(method, ndim=self.ndim)
                npvalues = self._ndarray.T
                if copy:
                    npvalues = npvalues.copy()
                func(npvalues, limit=limit, mask=mask.T)
                npvalues = npvalues.T

                # TODO: NumpyExtensionArray didn't use to copy; we need tests
                # for this
                new_values = self._from_backing_data(npvalues)
            else:
                # fill with value
                if copy:
                    new_values = self.copy()
                else:
                    new_values = self[:]
                new_values[mask] = value
        else:
            # We validate the fill_value even if there is nothing to fill
            if value is not None:
                self._validate_setitem_value(value)

            if not copy:
                new_values = self[:]
            else:
                new_values = self.copy()
        return new_values

    # ------------------------------------------------------------------------
    # Reductions

    def _wrap_reduction_result(self, axis: AxisInt | None, result):
        if axis is None or self.ndim == 1:
            return self._box_func(result)
        return self._from_backing_data(result)

    # ------------------------------------------------------------------------
    # __array_function__ methods

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        """
        Analogue to np.putmask(self, mask, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        np.putmask(self._ndarray, mask, value)

    def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self:
        """
        Analogue to np.where(mask, self, value)

        Parameters
        ----------
        mask : np.ndarray[bool]
        value : scalar or listlike

        Raises
        ------
        TypeError
            If value cannot be cast to self.dtype.
        """
        value = self._validate_setitem_value(value)

        res_values = np.where(mask, self._ndarray, value)
        if res_values.dtype != self._ndarray.dtype:
            raise AssertionError(
                # GH#56410
                "Something has gone wrong, please report a bug at "
                "github.com/pandas-dev/pandas/"
            )
        return self._from_backing_data(res_values)

    # ------------------------------------------------------------------------
    # Index compat methods

    def insert(self, loc: int, item) -> Self:
        """
        Make new ExtensionArray inserting new item at location. Follows
        Python list.append semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        type(self)
        """
        loc = validate_insert_loc(loc, len(self))

        code = self._validate_scalar(item)

        new_vals = np.concatenate(
            (
                self._ndarray[:loc],
                np.asarray([code], dtype=self._ndarray.dtype),
                self._ndarray[loc:],
            )
        )
        return self._from_backing_data(new_vals)

    # ------------------------------------------------------------------------
    # Additional array methods
    # These are not part of the EA API, but we implement them because
    # pandas assumes they're there.

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NA values.

        Returns
        -------
        Series
        """
        if self.ndim != 1:
            raise NotImplementedError

        from pandas import (
            Index,
            Series,
        )

        if dropna:
            # error: Unsupported operand type for ~ ("ExtensionArray")
            values = self[~self.isna()]._ndarray  # type: ignore[operator]
        else:
            values = self._ndarray

        result = value_counts(values, sort=False, dropna=dropna)

        index_arr = self._from_backing_data(np.asarray(result.index._data))
        index = Index(index_arr, name=result.index.name)
        return Series(result._values, index=index, name=result.name, copy=False)

    def _quantile(
        self,
        qs: npt.NDArray[np.float64],
        interpolation: str,
    ) -> Self:
        # TODO: disable for Categorical if not ordered?

        mask = np.asarray(self.isna())
        arr = self._ndarray
        fill_value = self._internal_fill_value

        res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

        res_values = self._cast_quantile_result(res_values)
        return self._from_backing_data(res_values)

    # TODO: see if we can share this with other dispatch-wrapping methods
    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
        """
        Cast the result of quantile_with_mask to an appropriate dtype
        to pass to _from_backing_data in _quantile.
        """
        return res_values

    # ------------------------------------------------------------------------
    # numpy-like methods

    @classmethod
    def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
        """
        Analogous to np.empty(shape, dtype=dtype)

        Parameters
        ----------
        shape : tuple[int]
        dtype : ExtensionDtype
        """
        # The base implementation uses a naive approach to find the dtype
        # for the backing ndarray
        arr = cls._from_sequence([], dtype=dtype)
        backing = np.empty(shape, dtype=arr._ndarray.dtype)
        return arr._from_backing_data(backing)
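For reference, Categorical is one concrete NDArrayBackedExtensionArray, so its take() exercises the allow_fill/_validate_scalar path defined above (a minimal sketch, assuming pandas is installed):

import pandas as pd

cat = pd.Categorical(["a", "b", "c"])
# with allow_fill=True, -1 marks positions to fill (NaN by default)
print(cat.take([0, -1], allow_fill=True))
# with allow_fill=False, -1 is ordinary Python negative indexing
print(cat.take([0, -1], allow_fill=False))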
lib/python3.13/site-packages/pandas/core/arrays/_ranges.py (new file, 207 lines)
@@ -0,0 +1,207 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
    BaseOffset,
    OutOfBoundsDatetime,
    Timedelta,
    Timestamp,
    iNaT,
)

if TYPE_CHECKING:
    from pandas._typing import npt


def generate_regular_range(
    start: Timestamp | Timedelta | None,
    end: Timestamp | Timedelta | None,
    periods: int | None,
    freq: BaseOffset,
    unit: str = "ns",
) -> npt.NDArray[np.intp]:
    """
    Generate a range of dates or timestamps with the spans between dates
    described by the given `freq` DateOffset.

    Parameters
    ----------
    start : Timedelta, Timestamp or None
        First point of produced date range.
    end : Timedelta, Timestamp or None
        Last point of produced date range.
    periods : int or None
        Number of periods in produced date range.
    freq : Tick
        Describes space between dates in produced date range.
    unit : str, default "ns"
        The resolution the output is meant to represent.

    Returns
    -------
    ndarray[np.int64]
        Representing the given resolution.
    """
    istart = start._value if start is not None else None
    iend = end._value if end is not None else None
    freq.nanos  # raises if non-fixed frequency
    td = Timedelta(freq)
    b: int
    e: int
    try:
        td = td.as_unit(unit, round_ok=False)
    except ValueError as err:
        raise ValueError(
            f"freq={freq} is incompatible with unit={unit}. "
            "Use a lower freq or a higher unit instead."
        ) from err
    stride = int(td._value)

    if periods is None and istart is not None and iend is not None:
        b = istart
        # cannot just use e = Timestamp(end) + 1 because arange breaks when
        # stride is too large, see GH10887
        e = b + (iend - b) // stride * stride + stride // 2 + 1
    elif istart is not None and periods is not None:
        b = istart
        e = _generate_range_overflow_safe(b, periods, stride, side="start")
    elif iend is not None and periods is not None:
        e = iend + stride
        b = _generate_range_overflow_safe(e, periods, stride, side="end")
    else:
        raise ValueError(
            "at least 'start' or 'end' should be specified if a 'period' is given."
        )

    with np.errstate(over="raise"):
        # If the range is sufficiently large, np.arange may overflow
        # and incorrectly return an empty array if not caught.
        try:
            values = np.arange(b, e, stride, dtype=np.int64)
        except FloatingPointError:
            xdr = [b]
            while xdr[-1] != e:
                xdr.append(xdr[-1] + stride)
            values = np.array(xdr[:-1], dtype=np.int64)
    return values


def _generate_range_overflow_safe(
    endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
    """
    Calculate the second endpoint for passing to np.arange, checking
    to avoid an integer overflow. Catch OverflowError and re-raise
    as OutOfBoundsDatetime.

    Parameters
    ----------
    endpoint : int
        nanosecond timestamp of the known endpoint of the desired range
    periods : int
        number of periods in the desired range
    stride : int
        nanoseconds between periods in the desired range
    side : {'start', 'end'}
        which end of the range `endpoint` refers to

    Returns
    -------
    other_end : int

    Raises
    ------
    OutOfBoundsDatetime
    """
    # GH#14187 raise instead of incorrectly wrapping around
    assert side in ["start", "end"]

    i64max = np.uint64(i8max)
    msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"

    with np.errstate(over="raise"):
        # if periods * strides cannot be multiplied within the *uint64* bounds,
        # we cannot salvage the operation by recursing, so raise
        try:
            addend = np.uint64(periods) * np.uint64(np.abs(stride))
        except FloatingPointError as err:
            raise OutOfBoundsDatetime(msg) from err

    if np.abs(addend) <= i64max:
        # relatively easy case without casting concerns
        return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)

    elif (endpoint > 0 and side == "start" and stride > 0) or (
        endpoint < 0 < stride and side == "end"
    ):
        # no chance of not-overflowing
        raise OutOfBoundsDatetime(msg)

    elif side == "end" and endpoint - stride <= i64max < endpoint:
        # in _generate_regular_range we added `stride` thereby overflowing
        # the bounds. Adjust to fix this.
        return _generate_range_overflow_safe(
            endpoint - stride, periods - 1, stride, side
        )

    # split into smaller pieces
    mid_periods = periods // 2
    remaining = periods - mid_periods
    assert 0 < remaining < periods, (remaining, periods, endpoint, stride)

    midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side))
    return _generate_range_overflow_safe(midpoint, remaining, stride, side)


def _generate_range_overflow_safe_signed(
    endpoint: int, periods: int, stride: int, side: str
) -> int:
    """
    A special case for _generate_range_overflow_safe where `periods * stride`
    can be calculated without overflowing int64 bounds.
    """
    assert side in ["start", "end"]
    if side == "end":
        stride *= -1

    with np.errstate(over="raise"):
        addend = np.int64(periods) * np.int64(stride)
        try:
            # easy case with no overflows
            result = np.int64(endpoint) + addend
            if result == iNaT:
                # Putting this into a DatetimeArray/TimedeltaArray
                # would incorrectly be interpreted as NaT
                raise OverflowError
            return int(result)
        except (FloatingPointError, OverflowError):
            # with endpoint negative and addend positive we risk
            # FloatingPointError; with reversed signs we risk OverflowError
            pass

    # if stride and endpoint had opposite signs, then endpoint + addend
    # should never overflow. so they must have the same signs
    assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)

    if stride > 0:
        # watch out for very special case in which we just slightly
        # exceed implementation bounds, but when passing the result to
        # np.arange will get a result slightly within the bounds

        uresult = np.uint64(endpoint) + np.uint64(addend)
        i64max = np.uint64(i8max)
        assert uresult > i64max
        if uresult <= i64max + np.uint64(stride):
            return int(uresult)

    raise OutOfBoundsDatetime(
        f"Cannot generate range with {side}={endpoint} and periods={periods}"
    )
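For reference, pd.date_range is the public entry point that feeds generate_regular_range; the overflow guard above is what surfaces as OutOfBoundsDatetime here (a minimal sketch, assuming pandas is installed):

import pandas as pd

print(pd.date_range("2024-01-01", periods=3, freq="D"))
try:
    # ten daily periods past Timestamp.max cannot fit in int64 nanoseconds
    pd.date_range(start=pd.Timestamp.max, periods=10, freq="D")
except pd.errors.OutOfBoundsDatetime as err:
    print("caught:", err)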
lib/python3.13/site-packages/pandas/core/arrays/_utils.py (new file, 63 lines)
@@ -0,0 +1,63 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
)

import numpy as np

from pandas._libs import lib
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import is_numeric_dtype

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        npt,
    )


def to_numpy_dtype_inference(
    arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
) -> tuple[npt.DTypeLike, Any]:
    if dtype is None and is_numeric_dtype(arr.dtype):
        dtype_given = False
        if hasna:
            if arr.dtype.kind == "b":
                dtype = np.dtype(np.object_)
            else:
                if arr.dtype.kind in "iu":
                    dtype = np.dtype(np.float64)
                else:
                    dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
                if na_value is lib.no_default:
                    na_value = np.nan
        else:
            dtype = arr.dtype.numpy_dtype  # type: ignore[union-attr]
    elif dtype is not None:
        dtype = np.dtype(dtype)
        dtype_given = True
    else:
        dtype_given = True

    if na_value is lib.no_default:
        if dtype is None or not hasna:
            na_value = arr.dtype.na_value
        elif dtype.kind == "f":  # type: ignore[union-attr]
            na_value = np.nan
        elif dtype.kind == "M":  # type: ignore[union-attr]
            na_value = np.datetime64("nat")
        elif dtype.kind == "m":  # type: ignore[union-attr]
            na_value = np.timedelta64("nat")
        else:
            na_value = arr.dtype.na_value

    if not dtype_given and hasna:
        try:
            np_can_hold_element(dtype, na_value)  # type: ignore[arg-type]
        except LossySetitemError:
            dtype = np.dtype(np.object_)
    return dtype, na_value
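For reference, this inference backs to_numpy() on the masked arrays: with missing values and no explicit dtype, integer data is upcast to float64 with NaN (a minimal sketch, assuming pandas is installed):

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
print(arr.to_numpy())                # float64 with NaN, per the inference above
print(arr.to_numpy(dtype="object"))  # an explicit object dtype keeps pd.NA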
@@ -0,0 +1,7 @@
from pandas.core.arrays.arrow.accessors import (
    ListAccessor,
    StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray

__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
3 binary files not shown.
@@ -0,0 +1,66 @@
from __future__ import annotations

import warnings

import numpy as np
import pyarrow

from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level


def fallback_performancewarning(version: str | None = None) -> None:
    """
    Raise a PerformanceWarning for falling back to ExtensionArray's
    non-pyarrow method
    """
    msg = "Falling back on a non-pyarrow code path which may decrease performance."
    if version is not None:
        msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
    warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())


def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
    on the buffers of the Array.

    At the moment pyarrow.BooleanArray is not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays with the raw data (with specified dtype) and
        a boolean mask (validity mask, so False means missing)
    """
    dtype = np.dtype(dtype)

    if pyarrow.types.is_null(arr.type):
        # No initialization of data is needed since everything is null
        data = np.empty(len(arr), dtype=dtype)
        mask = np.zeros(len(arr), dtype=bool)
        return data, mask
    buflist = arr.buffers()
    # Since Arrow buffers might contain padding and the data might be offset,
    # the buffer gets sliced here before handing it to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    offset = arr.offset * dtype.itemsize
    length = len(arr) * dtype.itemsize
    data_buf = buflist[1][offset : offset + length]
    data = np.frombuffer(data_buf, dtype=dtype)
    bitmask = buflist[0]
    if bitmask is not None:
        mask = pyarrow.BooleanArray.from_buffers(
            pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
        )
        mask = np.asarray(mask)
    else:
        mask = np.ones(len(arr), dtype=bool)
    return data, mask
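For reference, a hand-rolled equivalent of what pyarrow_array_to_numpy_and_mask returns for a primitive array, where the mask is a validity mask (False means missing); a minimal sketch assuming pyarrow is installed:

import numpy as np
import pyarrow as pa

arr = pa.array([1, None, 3], type=pa.int64())
data = np.frombuffer(arr.buffers()[1], dtype=np.int64)[: len(arr)]
mask = np.asarray(arr.is_valid())  # False marks the null slot
print(data, mask)  # the data value under a null slot is unspecified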
@@ -0,0 +1,473 @@
"""Accessors for arrow-backed data."""

from __future__ import annotations

from abc import (
    ABCMeta,
    abstractmethod,
)
from typing import (
    TYPE_CHECKING,
    cast,
)

from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
)

from pandas.core.dtypes.common import is_list_like

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.dtypes.dtypes import ArrowDtype

if TYPE_CHECKING:
    from collections.abc import Iterator

    from pandas import (
        DataFrame,
        Series,
    )


class ArrowAccessor(metaclass=ABCMeta):
    @abstractmethod
    def __init__(self, data, validation_msg: str) -> None:
        self._data = data
        self._validation_msg = validation_msg
        self._validate(data)

    @abstractmethod
    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        pass

    def _validate(self, data):
        dtype = data.dtype
        if not isinstance(dtype, ArrowDtype):
            # Raise AttributeError so that inspect can handle non-struct Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

        if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
            # Raise AttributeError so that inspect can handle invalid Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def _pa_array(self):
        return self._data.array._pa_array


class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

        value_lengths = pc.list_value_length(self._pa_array)
        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The list at requested index.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(element, dtype=ArrowDtype(element.type))
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support
                # this should be set to the last element of the array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))


class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index` (or a list of these, for
          a nested selection), the Series name is set to the selected
          field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2]
                # iteratively get the struct and field name. The last
                # one is used for the name of the index.
                level_name_or_index = list(reversed(level_name_or_index))
                selected = data
                while level_name_or_index:
                    # we need the cast, otherwise mypy complains about
                    # getting ints, bytes, or str here, which isn't possible.
                    level_name_or_index = cast(list, level_name_or_index)
                    name_or_index = level_name_or_index.pop()
                    name = get_name(name_or_index, selected)
                    selected = selected.type.field(selected.type.get_field_index(name))
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

        pa_arr = self._data.array._pa_array
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )
lib/python3.13/site-packages/pandas/core/arrays/arrow/array.py (new file, 2942 lines)
File diff suppressed because it is too large.
@@ -0,0 +1,174 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pyarrow

from pandas.compat import pa_version_under14p1

from pandas.core.dtypes.dtypes import (
    IntervalDtype,
    PeriodDtype,
)

from pandas.core.arrays.interval import VALID_CLOSED

if TYPE_CHECKING:
    from pandas._typing import IntervalClosedType


class ArrowPeriodType(pyarrow.ExtensionType):
    def __init__(self, freq) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        self._freq = freq
        pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

    @property
    def freq(self):
        return self._freq

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"freq": self.freq}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
        metadata = json.loads(serialized.decode())
        return ArrowPeriodType(metadata["freq"])

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return type(self) == type(other) and self.freq == other.freq
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), self.freq))

    def to_pandas_dtype(self) -> PeriodDtype:
        return PeriodDtype(freq=self.freq)


# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)


class ArrowIntervalType(pyarrow.ExtensionType):
    def __init__(self, subtype, closed: IntervalClosedType) -> None:
        # attributes need to be set first before calling
        # super init (as that calls serialize)
        assert closed in VALID_CLOSED
        self._closed: IntervalClosedType = closed
        if not isinstance(subtype, pyarrow.DataType):
            subtype = pyarrow.type_for_alias(str(subtype))
        self._subtype = subtype

        storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
        pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")

    @property
    def subtype(self):
        return self._subtype

    @property
    def closed(self) -> IntervalClosedType:
        return self._closed

    def __arrow_ext_serialize__(self) -> bytes:
        metadata = {"subtype": str(self.subtype), "closed": self.closed}
        return json.dumps(metadata).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
        metadata = json.loads(serialized.decode())
        subtype = pyarrow.type_for_alias(metadata["subtype"])
        closed = metadata["closed"]
        return ArrowIntervalType(subtype, closed)

    def __eq__(self, other):
        if isinstance(other, pyarrow.BaseExtensionType):
            return (
                type(self) == type(other)
                and self.subtype == other.subtype
                and self.closed == other.closed
            )
        else:
            return NotImplemented

    def __ne__(self, other) -> bool:
        return not self == other

    def __hash__(self) -> int:
        return hash((str(self), str(self.subtype), self.closed))

    def to_pandas_dtype(self) -> IntervalDtype:
        return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)


# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)


_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}

Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:

- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
  `import pyarrow_hotfix; pyarrow_hotfix.uninstall()`

We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
"""


def patch_pyarrow():
    # starting from pyarrow 14.0.1, it has its own mechanism
    if not pa_version_under14p1:
        return

    # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            out = io.StringIO()
            pickletools.dis(serialized, out)
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    pyarrow._hotfix_installed = True


patch_pyarrow()
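For reference, a round-trip sketch of the metadata (de)serialization defined above (assumes pandas and pyarrow are installed; the storage_type argument is unused by this deserializer, so None is passed for brevity):

from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

t = ArrowPeriodType("M")
payload = t.__arrow_ext_serialize__()  # b'{"freq": "M"}'
t2 = ArrowPeriodType.__arrow_ext_deserialize__(None, payload)
print(t2.freq, t2 == t)                # M True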
lib/python3.13/site-packages/pandas/core/arrays/base.py (new file, 2588 lines)
File diff suppressed because it is too large.
lib/python3.13/site-packages/pandas/core/arrays/boolean.py (new file, 407 lines)
@@ -0,0 +1,407 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    ClassVar,
    cast,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.array_algos import masked_accumulations
from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
        type_t,
    )


@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
    """
    Extension dtype for boolean data.

    .. warning::

       BooleanDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.BooleanDtype()
    BooleanDtype
    """

    name: ClassVar[str] = "boolean"

    # https://github.com/python/mypy/issues/4125
    # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
    @property
    def type(self) -> type:  # type: ignore[override]
        return np.bool_

    @property
    def kind(self) -> str:
        return "b"

    @property
    def numpy_dtype(self) -> np.dtype:
        return np.dtype("bool")

    @classmethod
    def construct_array_type(cls) -> type_t[BooleanArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return BooleanArray

    def __repr__(self) -> str:
        return "BooleanDtype"

    @property
    def _is_boolean(self) -> bool:
        return True

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BooleanArray:
        """
        Construct BooleanArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
            raise TypeError(f"Expected array of boolean type, got {array.type} instead")

        if isinstance(array, pyarrow.Array):
            chunks = [array]
            length = len(array)
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks
            length = array.length()

        if pyarrow.types.is_null(array.type):
            mask = np.ones(length, dtype=bool)
            # No need to init data, since all null
            data = np.empty(length, dtype=bool)
            return BooleanArray(data, mask)

        results = []
        for arr in chunks:
            buflist = arr.buffers()
            data = pyarrow.BooleanArray.from_buffers(
                arr.type, len(arr), [None, buflist[1]], offset=arr.offset
            ).to_numpy(zero_copy_only=False)
            if arr.null_count != 0:
                mask = pyarrow.BooleanArray.from_buffers(
                    arr.type, len(arr), [None, buflist[0]], offset=arr.offset
                ).to_numpy(zero_copy_only=False)
                mask = ~mask
            else:
                mask = np.zeros(len(arr), dtype=bool)

            bool_arr = BooleanArray(data, mask)
            results.append(bool_arr)

        if not results:
            return BooleanArray(
                np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
            )
        else:
            return BooleanArray._concat_same_type(results)


def coerce_to_array(
    values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        values, mask = values._data, values._mask
        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    mask_values = None
    if isinstance(values, np.ndarray) and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb":
        mask_values = isna(values)

        values_bool = np.zeros(len(values), dtype=bool)
        values_bool[~mask_values] = values[~mask_values].astype(bool)

        if not np.all(
            values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
        ):
            raise TypeError("Need to pass bool-like values")

        values = values_bool
    else:
        values_object = np.asarray(values, dtype=object)

        inferred_dtype = lib.infer_dtype(values_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
        # within this branch, it assumes it can also be None
        mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
        values = np.zeros(len(values), dtype=bool)
        values[~mask_values] = values_object[~mask_values].astype(bool)

        # if the values were integer-like, validate it were actually 0/1's
        if (inferred_dtype in integer_like) and not (
            np.all(
                values[~mask_values].astype(float)
                == values_object[~mask_values].astype(float)
            )
        ):
            raise TypeError("Need to pass bool-like values")

    if mask is None and mask_values is None:
        mask = np.zeros(values.shape, dtype=bool)
    elif mask is None:
        mask = mask_values
    else:
        if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
            if mask_values is not None:
                mask = mask | mask_values
            else:
                if copy:
                    mask = mask.copy()
        else:
            mask = np.array(mask, dtype=bool)
            if mask_values is not None:
                mask = mask | mask_values

    if values.shape != mask.shape:
        raise ValueError("values.shape and mask.shape must match")

    return values, mask


class BooleanArray(BaseMaskedArray):
    """
    Array of boolean (True/False) data with missing values.

    This is a pandas Extension array for boolean data, under the hood
    represented by 2 numpy arrays: a boolean array with the data and
    a boolean array with the mask (True indicating missing).

    BooleanArray implements Kleene logic (sometimes called three-value
    logic) for logical operations. See :ref:`boolean.kleene` for more.

    To construct a BooleanArray from generic array-like input, use
    :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
    below).

    .. warning::

       BooleanArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d boolean-dtype array with the data.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values (True
        indicates missing).
    copy : bool, default False
        Whether to copy the `values` and `mask` arrays.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    BooleanArray

    Examples
    --------
    Create a BooleanArray with :func:`pandas.array`:

    >>> pd.array([True, False, None], dtype="boolean")
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = False
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "bool", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = True  # type: ignore[assignment]
    _falsey_value = False  # type: ignore[assignment]
    _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
    _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

    @classmethod
    def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
        result = super()._simple_new(values, mask)
        result._dtype = BooleanDtype()
        return result

    def __init__(
        self, values: np.ndarray, mask: np.ndarray, copy: bool = False
    ) -> None:
        if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
            raise TypeError(
                "values should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        self._dtype = BooleanDtype()
        super().__init__(values, mask, copy=copy)

    @property
    def dtype(self) -> BooleanDtype:
        return self._dtype

    @classmethod
    def _from_sequence_of_strings(
        cls,
        strings: list[str],
        *,
        dtype: Dtype | None = None,
        copy: bool = False,
        true_values: list[str] | None = None,
        false_values: list[str] | None = None,
    ) -> BooleanArray:
        true_values_union = cls._TRUE_VALUES.union(true_values or [])
        false_values_union = cls._FALSE_VALUES.union(false_values or [])

        def map_string(s) -> bool:
            if s in true_values_union:
                return True
            elif s in false_values_union:
                return False
            else:
                raise ValueError(f"{s} cannot be cast to bool")

        scalars = np.array(strings, dtype=object)
        mask = isna(scalars)
        scalars[~mask] = list(map(map_string, scalars[~mask]))
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        if dtype:
            assert dtype == "boolean"
        return coerce_to_array(value, copy=copy)

    def _logical_method(self, other, op):
        assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_scalar = lib.is_scalar(other)
        mask = None

        if isinstance(other, BooleanArray):
            other, mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            other, mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
            raise TypeError(
                "'other' should be pandas.NA or a bool. "
                f"Got {type(other).__name__} instead."
            )

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match")

        if op.__name__ in {"or_", "ror_"}:
            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
        elif op.__name__ in {"and_", "rand_"}:
            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
        else:
            # i.e. xor, rxor
            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

        # i.e. BooleanArray
        return self._maybe_mask_result(result, mask)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> BaseMaskedArray:
        data = self._data
        mask = self._mask
        if name in ("cummin", "cummax"):
            op = getattr(masked_accumulations, name)
            data, mask = op(data, mask, skipna=skipna, **kwargs)
            return self._simple_new(data, mask)
        else:
            from pandas.core.arrays import IntegerArray

            return IntegerArray(data.astype(int), mask)._accumulate(
                name, skipna=skipna, **kwargs
            )
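A minimal illustration of the Kleene semantics `_logical_method` implements (expected outputs per the pandas docs on recent versions):

import pandas as pd

a = pd.array([True, False, None], dtype="boolean")

# NA | True is True and NA & False is False, because the missing value
# cannot change those outcomes; NA | False and NA & True stay NA.
print(a | True)   # [True, True, True]
print(a & False)  # [False, False, False]
print(a ^ True)   # [False, True, <NA>]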
3070
lib/python3.13/site-packages/pandas/core/arrays/categorical.py
Normal file
File diff suppressed because it is too large
2556
lib/python3.13/site-packages/pandas/core/arrays/datetimelike.py
Normal file
File diff suppressed because it is too large
2820
lib/python3.13/site-packages/pandas/core/arrays/datetimes.py
Normal file
File diff suppressed because it is too large
173
lib/python3.13/site-packages/pandas/core/arrays/floating.py
Normal file
@@ -0,0 +1,173 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_float_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class FloatingDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size of floating dtype.

    These specific implementations are subclasses of the non-public
    FloatingDtype. For example we have Float32Dtype to represent float32.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.float64)
    _checker = is_float_dtype

    @classmethod
    def construct_array_type(cls) -> type[FloatingArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatingArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]:
        return NUMPY_FLOAT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        # This is really only here for compatibility with IntegerDtype
        return values.astype(dtype, copy=copy)


class FloatingArray(NumericArray):
    """
    Array of floating (optional missing) values.

    .. warning::

       FloatingArray is currently experimental, and its API or internal
       implementation may change without warning. Especially the behaviour
       regarding NaN (distinct from NA missing values) is subject to change.

    We represent a FloatingArray with 2 numpy arrays:

    - data: contains a numpy float array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct a FloatingArray from generic array-like input, use
    :func:`pandas.array` with one of the float dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d float-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    FloatingArray

    Examples
    --------
    Create a FloatingArray with :func:`pandas.array`:

    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([0.1, None, 0.3], dtype="Float32")
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32
    """

    _dtype_cls = FloatingDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = np.nan
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "float", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1.0  # type: ignore[assignment]
    _falsey_value = 0.0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} data.

This dtype uses ``pd.NA`` as missing value indicator.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Float32Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype())
>>> ser.dtype
Float32Dtype()

For Float64Dtype:

>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype())
>>> ser.dtype
Float64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Float32Dtype(FloatingDtype):
    type = np.float32
    name: ClassVar[str] = "Float32"
    __doc__ = _dtype_docstring.format(dtype="float32")


@register_extension_dtype
class Float64Dtype(FloatingDtype):
    type = np.float64
    name: ClassVar[str] = "Float64"
    __doc__ = _dtype_docstring.format(dtype="float64")


NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = {
    np.dtype(np.float32): Float32Dtype(),
    np.dtype(np.float64): Float64Dtype(),
}
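A short usage sketch of the nullable float dtype defined above (outputs as expected on recent pandas versions):

import pandas as pd

arr = pd.array([0.1, None, 0.3], dtype="Float64")
print(arr.dtype)   # Float64
print(arr.isna())  # [False  True False]
print(arr.sum())   # 0.4 -- pd.NA is skipped because skipna defaults to True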
272
lib/python3.13/site-packages/pandas/core/arrays/integer.py
Normal file
@@ -0,0 +1,272 @@
from __future__ import annotations

from typing import ClassVar

import numpy as np

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import is_integer_dtype

from pandas.core.arrays.numeric import (
    NumericArray,
    NumericDtype,
)


class IntegerDtype(NumericDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    IntegerDtype. For example, we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    _default_np_dtype = np.dtype(np.int64)
    _checker = is_integer_dtype

    @classmethod
    def construct_array_type(cls) -> type[IntegerArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    @classmethod
    def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]:
        return NUMPY_INT_TO_DTYPE

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless. e.g. if 'values'
        has a floating dtype, each value must be an integer.
        """
        try:
            return values.astype(dtype, casting="safe", copy=copy)
        except TypeError as err:
            casted = values.astype(dtype, copy=copy)
            if (casted == values).all():
                return casted

            raise TypeError(
                f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
            ) from err


class IntegerArray(NumericArray):
    """
    Array of integer (optional missing) values.

    Uses :attr:`pandas.NA` as the missing value.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    _dtype_cls = IntegerDtype

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "int", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = 1  # type: ignore[assignment]
    _falsey_value = 0  # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None

Examples
--------
For Int8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype())
>>> ser.dtype
Int8Dtype()

For Int16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype())
>>> ser.dtype
Int16Dtype()

For Int32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype())
>>> ser.dtype
Int32Dtype()

For Int64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype())
>>> ser.dtype
Int64Dtype()

For UInt8Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype())
>>> ser.dtype
UInt8Dtype()

For UInt16Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype())
>>> ser.dtype
UInt16Dtype()

For UInt32Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype())
>>> ser.dtype
UInt32Dtype()

For UInt64Dtype:

>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype())
>>> ser.dtype
UInt64Dtype()
"""

# create the Dtype


@register_extension_dtype
class Int8Dtype(IntegerDtype):
    type = np.int8
    name: ClassVar[str] = "Int8"
    __doc__ = _dtype_docstring.format(dtype="int8")


@register_extension_dtype
class Int16Dtype(IntegerDtype):
    type = np.int16
    name: ClassVar[str] = "Int16"
    __doc__ = _dtype_docstring.format(dtype="int16")


@register_extension_dtype
class Int32Dtype(IntegerDtype):
    type = np.int32
    name: ClassVar[str] = "Int32"
    __doc__ = _dtype_docstring.format(dtype="int32")


@register_extension_dtype
class Int64Dtype(IntegerDtype):
    type = np.int64
    name: ClassVar[str] = "Int64"
    __doc__ = _dtype_docstring.format(dtype="int64")


@register_extension_dtype
class UInt8Dtype(IntegerDtype):
    type = np.uint8
    name: ClassVar[str] = "UInt8"
    __doc__ = _dtype_docstring.format(dtype="uint8")


@register_extension_dtype
class UInt16Dtype(IntegerDtype):
    type = np.uint16
    name: ClassVar[str] = "UInt16"
    __doc__ = _dtype_docstring.format(dtype="uint16")


@register_extension_dtype
class UInt32Dtype(IntegerDtype):
    type = np.uint32
    name: ClassVar[str] = "UInt32"
    __doc__ = _dtype_docstring.format(dtype="uint32")


@register_extension_dtype
class UInt64Dtype(IntegerDtype):
    type = np.uint64
    name: ClassVar[str] = "UInt64"
    __doc__ = _dtype_docstring.format(dtype="uint64")


NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = {
    np.dtype(np.int8): Int8Dtype(),
    np.dtype(np.int16): Int16Dtype(),
    np.dtype(np.int32): Int32Dtype(),
    np.dtype(np.int64): Int64Dtype(),
    np.dtype(np.uint8): UInt8Dtype(),
    np.dtype(np.uint16): UInt16Dtype(),
    np.dtype(np.uint32): UInt32Dtype(),
    np.dtype(np.uint64): UInt64Dtype(),
}
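The lossless-cast rule in `IntegerDtype._safe_cast` is observable through `pd.array` (a sketch; the exact error message may vary by version):

import pandas as pd

# Whole floats pass the equivalence check and cast cleanly...
print(pd.array([1.0, 2.0, None], dtype="Int64"))
# ...while a fractional value is rejected rather than silently truncated.
try:
    pd.array([1.5], dtype="Int64")
except TypeError as err:
    print(err)  # e.g. "cannot safely cast non-equivalent float64 to int64"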
1917
lib/python3.13/site-packages/pandas/core/arrays/interval.py
Normal file
File diff suppressed because it is too large
1650
lib/python3.13/site-packages/pandas/core/arrays/masked.py
Normal file
File diff suppressed because it is too large
286
lib/python3.13/site-packages/pandas/core/arrays/numeric.py
Normal file
@@ -0,0 +1,286 @@
from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    from collections.abc import Mapping

    import pyarrow

    from pandas._typing import (
        Dtype,
        DtypeObj,
        Self,
        npt,
    )


class NumericDtype(BaseMaskedDtype):
    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        array_class = self.construct_array_type()

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
            array.type
        ):
            # test_from_arrow_type_error raise for string, but allow
            # through itemsize conversion GH#31896
            rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
            if rt_dtype.kind not in "iuf":
                # Could allow "c" or potentially disallow float<->int conversion,
                # but at the moment we specifically test that uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )

            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.ChunkedArray):
            # TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed
            # combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757
            if array.num_chunks == 0:
                array = pyarrow.array([], type=array.type)
            else:
                array = array.combine_chunks()

        data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
        return array_class(data.copy(), ~mask, copy=False)

    @classmethod
    def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]:
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.
        """
        if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, NumericDtype):
            mapping = cls._get_dtype_mapping()
            try:
                dtype = mapping[np.dtype(dtype)]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err
        return dtype

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        raise AbstractMethodError(cls)


def _coerce_to_data_and_mask(
    values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
):
    checker = dtype_cls._checker

    mask = None
    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    original = values
    if not copy:
        values = np.asarray(values)
    else:
        values = np.array(values, copy=copy)
    inferred_type = None
    if values.dtype == object or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "boolean" and dtype is None:
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif values.dtype.kind == "b" and checker(dtype):
        if not copy:
            values = np.asarray(values, dtype=default_dtype)
        else:
            values = np.array(values, dtype=default_dtype, copy=copy)

    elif values.dtype.kind not in "iuf":
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        if values.dtype.kind in "iu":
            # fastpath
            mask = np.zeros(len(values), dtype=np.bool_)
        else:
            mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.numpy_dtype

    if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
        if mask.all():
            values = np.ones(values.shape, dtype=dtype)
        else:
            idx = np.nanargmax(values)
            if int(values[idx]) != original[idx]:
                # We have ints that lost precision during the cast.
                inferred_type = lib.infer_dtype(original, skipna=True)
                if (
                    inferred_type not in ["floating", "mixed-integer-float"]
                    and not mask.any()
                ):
                    values = np.asarray(original, dtype=dtype)
                else:
                    values = np.asarray(original, dtype="object")

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type


class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.
    """

    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        checker = self._dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            descr = (
                "floating"
                if self._dtype_cls.kind == "f"  # type: ignore[comparison-overlap]
                else "integer"
            )
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        mapping = self._dtype_cls._get_dtype_mapping()
        return mapping[self._data.dtype]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> Self:
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)
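`__from_arrow__` is the hook pandas invokes when converting Arrow data into these masked arrays; it can also be called directly (a sketch assuming pyarrow is installed):

import pyarrow as pa
import pandas as pd

arrow_arr = pa.array([1, 2, None])
# The Arrow validity bitmap becomes the boolean mask, so the null
# comes back as pd.NA in the resulting IntegerArray.
result = pd.Int64Dtype().__from_arrow__(arrow_arr)
print(result)  # <IntegerArray> [1, 2, <NA>]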
563
lib/python3.13/site-packages/pandas/core/arrays/numpy_.py
Normal file
@@ -0,0 +1,563 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Literal,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import is_supported_dtype
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    arraylike,
    missing,
    nanops,
    ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin

if TYPE_CHECKING:
    from pandas._typing import (
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        NpDtype,
        Scalar,
        Self,
        npt,
    )

    from pandas import Index


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class NumpyExtensionArray(  # type: ignore[misc]
    OpsMixin,
    NDArrayBackedExtensionArray,
    ObjectStringArrayMixin,
):
    """
    A pandas ExtensionArray for NumPy data.

    This is mostly for internal compatibility, and is not especially
    useful on its own.

    Parameters
    ----------
    values : ndarray
        The NumPy ndarray to wrap. Must be 1-dimensional.
    copy : bool, default False
        Whether to copy `values`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3]))
    <NumpyExtensionArray>
    [0, 1, 2, 3]
    Length: 4, dtype: int64
    """

    # If you're wondering why pd.Series(cls) doesn't put the array in an
    # ExtensionBlock, search for `ABCNumpyExtensionArray`. We check for
    # that _typ to ensure that users don't unnecessarily use EAs inside
    # pandas internals, which turns off things like block consolidation.
    _typ = "npy_extension"
    __array_priority__ = 1000
    _ndarray: np.ndarray
    _dtype: NumpyEADtype
    _internal_fill_value = np.nan

    # ------------------------------------------------------------------------
    # Constructors

    def __init__(
        self, values: np.ndarray | NumpyExtensionArray, copy: bool = False
    ) -> None:
        if isinstance(values, type(self)):
            values = values._ndarray
        if not isinstance(values, np.ndarray):
            raise ValueError(
                f"'values' must be a NumPy array, not {type(values).__name__}"
            )

        if values.ndim == 0:
            # Technically we support 2, but do not advertise that fact.
            raise ValueError("NumpyExtensionArray must be 1-dimensional.")

        if copy:
            values = values.copy()

        dtype = NumpyEADtype(values.dtype)
        super().__init__(values, dtype)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> NumpyExtensionArray:
        if isinstance(dtype, NumpyEADtype):
            dtype = dtype._dtype

        # error: Argument "dtype" to "asarray" has incompatible type
        # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
        # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        result = np.asarray(scalars, dtype=dtype)  # type: ignore[arg-type]
        if (
            result.ndim > 1
            and not hasattr(scalars, "dtype")
            and (dtype is None or dtype == object)
        ):
            # e.g. list-of-tuples
            result = construct_1d_object_array_from_listlike(scalars)

        if copy and result is scalars:
            result = result.copy()
        return cls(result)

    def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray:
        return type(self)(arr)

    # ------------------------------------------------------------------------
    # Data

    @property
    def dtype(self) -> NumpyEADtype:
        return self._dtype

    # ------------------------------------------------------------------------
    # NumPy Array Interface

    def __array__(
        self, dtype: NpDtype | None = None, copy: bool | None = None
    ) -> np.ndarray:
        return np.asarray(self._ndarray, dtype=dtype)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        # Lightly modified version of
        # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
        # The primary modification is not boxing scalar return values
        # in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d.
        out = kwargs.get("out", ())

        result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_ufunc_unary
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(
            x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs
        )
        if out:
            kwargs["out"] = tuple(
                x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out
            )
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if ufunc.nout > 1:
            # multiple return values; re-box array-like results
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        elif method == "reduce":
            if isinstance(result, np.ndarray):
                # e.g. test_np_reduce_2d
                return type(self)(result)

            # e.g. test_np_max_nested_tuples
            return result
        else:
            # one return value; re-box array-like results
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Pandas ExtensionArray Interface

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        result = astype_array(self._ndarray, dtype=dtype, copy=copy)
        return result

    def isna(self) -> np.ndarray:
        return isna(self._ndarray)

    def _validate_scalar(self, fill_value):
        if fill_value is None:
            # Primarily for subclasses
            fill_value = self.dtype.na_value
        return fill_value

    def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
        if self.dtype.kind in "iub":
            fv = None
        else:
            fv = np.nan
        return self._ndarray, fv

    # Base EA class (and all other EA classes) don't have limit_area keyword
    # This can be removed here as well when the interpolate ffill/bfill method
    # deprecation is enforced
    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        """
        ffill or bfill along axis=0.
        """
        if copy:
            out_data = self._ndarray.copy()
        else:
            out_data = self._ndarray

        meth = missing.clean_fill_method(method)
        missing.pad_or_backfill_inplace(
            out_data.T,
            method=meth,
            axis=0,
            limit=limit,
            limit_area=limit_area,
        )

        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    def interpolate(
        self,
        *,
        method: InterpolateOptions,
        axis: int,
        index: Index,
        limit,
        limit_direction,
        limit_area,
        copy: bool,
        **kwargs,
    ) -> Self:
        """
        See NDFrame.interpolate.__doc__.
        """
        # NB: we return type(self) even if copy=False
        if not copy:
            out_data = self._ndarray
        else:
            out_data = self._ndarray.copy()

        # TODO: assert we have floating dtype?
        missing.interpolate_2d_inplace(
            out_data,
            method=method,
            axis=axis,
            index=index,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            **kwargs,
        )
        if not copy:
            return self
        return type(self)._simple_new(out_data, dtype=self.dtype)

    # ------------------------------------------------------------------------
    # Reductions

    def any(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_any((), {"out": out, "keepdims": keepdims})
        result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def all(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_all((), {"out": out, "keepdims": keepdims})
        result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def min(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_min((), kwargs)
        result = nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        nv.validate_max((), kwargs)
        result = nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def sum(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_sum((), kwargs)
        result = nanops.nansum(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def prod(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        nv.validate_prod((), kwargs)
        result = nanops.nanprod(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def mean(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
        result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def median(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        overwrite_input: bool = False,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_median(
            (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
        )
        result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def std(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
        )
        result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def var(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
        )
        result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def sem(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
        )
        result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def kurt(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
        )
        result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def skew(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
        )
        result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    # ------------------------------------------------------------------------
    # Additional Methods

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        mask = self.isna()
        if na_value is not lib.no_default and mask.any():
            result = self._ndarray.copy()
            result[mask] = na_value
        else:
            result = self._ndarray

        result = np.asarray(result, dtype=dtype)

        if copy and result is self._ndarray:
            result = result.copy()

        return result

    # ------------------------------------------------------------------------
    # Ops

    def __invert__(self) -> NumpyExtensionArray:
        return type(self)(~self._ndarray)

    def __neg__(self) -> NumpyExtensionArray:
        return type(self)(-self._ndarray)

    def __pos__(self) -> NumpyExtensionArray:
        return type(self)(+self._ndarray)

    def __abs__(self) -> NumpyExtensionArray:
        return type(self)(abs(self._ndarray))

    def _cmp_method(self, other, op):
        if isinstance(other, NumpyExtensionArray):
            other = other._ndarray

        other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
        pd_op = ops.get_array_op(op)
        other = ensure_wrapped_if_datetimelike(other)
        result = pd_op(self._ndarray, other)

        if op is divmod or op is ops.rdivmod:
            a, b = result
            if isinstance(a, np.ndarray):
                # for e.g. op vs TimedeltaArray, we may already
                # have an ExtensionArray, in which case we do not wrap
                return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
            return a, b

        if isinstance(result, np.ndarray):
            # for e.g. multiplication vs TimedeltaArray, we may already
            # have an ExtensionArray, in which case we do not wrap
            return self._wrap_ndarray_result(result)
        return result

    _arith_method = _cmp_method

    def _wrap_ndarray_result(self, result: np.ndarray):
        # If we have timedelta64[ns] result, return a TimedeltaArray instead
        # of a NumpyExtensionArray
        if result.dtype.kind == "m" and is_supported_dtype(result.dtype):
            from pandas.core.arrays import TimedeltaArray

            return TimedeltaArray._simple_new(result, dtype=result.dtype)
        return type(self)(result)

    # ------------------------------------------------------------------------
    # String methods interface
    _str_na_value = np.nan
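A small demonstration of the `__array_ufunc__` unwrap-and-rebox behaviour described above (a sketch; `NumpyExtensionArray` is exposed as `pd.arrays.NumpyExtensionArray` on pandas 2.x):

import numpy as np
import pandas as pd

arr = pd.arrays.NumpyExtensionArray(np.array([1.0, 4.0, 9.0]))
# Element-wise ufuncs run on the backing ndarray and are re-boxed:
print(np.sqrt(arr))  # <NumpyExtensionArray> [1.0, 2.0, 3.0]
# Reductions return a plain scalar rather than a boxed array:
print(arr.sum())     # 14.0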
1313
lib/python3.13/site-packages/pandas/core/arrays/period.py
Normal file
File diff suppressed because it is too large
19
lib/python3.13/site-packages/pandas/core/arrays/sparse/__init__.py
Normal file
@@ -0,0 +1,19 @@
from pandas.core.arrays.sparse.accessor import (
    SparseAccessor,
    SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
    BlockIndex,
    IntIndex,
    SparseArray,
    make_sparse_index,
)

__all__ = [
    "BlockIndex",
    "IntIndex",
    "make_sparse_index",
    "SparseAccessor",
    "SparseArray",
    "SparseFrameAccessor",
]
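For orientation, the exported `SparseArray` stores only the values that differ from the fill value (expected output on recent pandas versions):

import pandas as pd

sp = pd.arrays.SparseArray([0, 0, 1, 2])
print(sp.dtype)    # Sparse[int64, 0]
print(sp.density)  # 0.5 -- only the two non-zero entries are physically stored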
Binary file not shown.
Binary file not shown.
Binary file not shown.
414
lib/python3.13/site-packages/pandas/core/arrays/sparse/accessor.py
Normal file
@@ -0,0 +1,414 @@
|
||||
"""Sparse accessor"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.cast import find_common_type
|
||||
from pandas.core.dtypes.dtypes import SparseDtype
|
||||
|
||||
from pandas.core.accessor import (
|
||||
PandasDelegate,
|
||||
delegate_names,
|
||||
)
|
||||
from pandas.core.arrays.sparse.array import SparseArray
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
|
||||
class BaseAccessor:
|
||||
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
|
||||
|
||||
def __init__(self, data=None) -> None:
|
||||
self._parent = data
|
||||
self._validate(data)
|
||||
|
||||
def _validate(self, data):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@delegate_names(
|
||||
SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
|
||||
)
|
||||
class SparseAccessor(BaseAccessor, PandasDelegate):
|
||||
"""
|
||||
Accessor for SparseSparse from other sparse matrix data types.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
|
||||
>>> ser.sparse.density
|
||||
0.6
|
||||
>>> ser.sparse.sp_values
|
||||
array([2, 2, 2])
|
||||
"""
|
||||
|
||||
    def _validate(self, data):
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name: str, *args, **kwargs):
        return getattr(self._parent.array, name)

    def _delegate_method(self, name: str, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            raise ValueError

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
    ...                    "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
    >>> df.sparse.density
    0.5
    """

    def _validate(self, data):
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3, dtype=float)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0    0    0
        1    0  1.0    0
        2    0    0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
        <COOrdinate sparse matrix of dtype 'int64'
            with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.density
        0.5
        """
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns
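
A quick usage sketch of the two accessors above (illustrative only, not part of the vendored file; assumes scipy is installed):

import pandas as pd
from scipy import sparse

A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
ss = pd.Series.sparse.from_coo(A)            # sparse Series with a (row, col) MultiIndex
B, rows, cols = ss.sparse.to_coo(row_levels=[0], column_levels=[1])  # round-trip

df = pd.DataFrame.sparse.from_spmatrix(sparse.eye(3))  # each column is a SparseArray
dense = df.sparse.to_dense()                 # back to dense numpy-backed columns
print(df.sparse.density)                     # 3 of 9 cells stored -> 1/3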
1929
lib/python3.13/site-packages/pandas/core/arrays/sparse/array.py
Normal file
1929
lib/python3.13/site-packages/pandas/core/arrays/sparse/array.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,207 @@
"""
Interaction with scipy.sparse matrices.

Currently only includes to_coo helpers.
"""
from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._libs import lib

from pandas.core.dtypes.missing import notna

from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series

if TYPE_CHECKING:
    from collections.abc import Iterable

    import numpy as np
    import scipy.sparse

    from pandas._typing import (
        IndexLabel,
        npt,
    )


def _check_is_partition(parts: Iterable, whole: Iterable):
    whole = set(whole)
    parts = [set(x) for x in parts]
    if set.intersection(*parts) != set():
        raise ValueError("Is not a partition because intersection is not null.")
    if set.union(*parts) != whole:
        raise ValueError("Is not a partition because union is not the whole.")


def _levels_to_axis(
    ss,
    levels: tuple[int] | list[int],
    valid_ilocs: npt.NDArray[np.intp],
    sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
    """
    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
    where `ax_coords` are the coordinates along one of the two axes of the
    destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
    which correspond to these coordinates.

    Parameters
    ----------
    ss : Series
    levels : tuple/list
    valid_ilocs : numpy.ndarray
        Array of integer positions of valid values for the sparse matrix in ss.
    sort_labels : bool, default False
        Sort the axis labels before forming the sparse matrix. When `levels`
        refers to a single level, set to True for a faster execution.

    Returns
    -------
    ax_coords : numpy.ndarray (axis coordinates)
    ax_labels : list (axis labels)
    """
    # Since the labels are sorted in `Index.levels`, when we wish to sort and
    # there is only one level of the MultiIndex for this axis, the desired
    # output can be obtained in the following simpler, more efficient way.
    if sort_labels and len(levels) == 1:
        ax_coords = ss.index.codes[levels[0]][valid_ilocs]
        ax_labels = ss.index.levels[levels[0]]

    else:
        levels_values = lib.fast_zip(
            [ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
        )
        codes, ax_labels = factorize(levels_values, sort=sort_labels)
        ax_coords = codes[valid_ilocs]

    ax_labels = ax_labels.tolist()
    return ax_coords, ax_labels


def _to_ijv(
    ss,
    row_levels: tuple[int] | list[int] = (0,),
    column_levels: tuple[int] | list[int] = (1,),
    sort_labels: bool = False,
) -> tuple[
    np.ndarray,
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
    list[IndexLabel],
    list[IndexLabel],
]:
    """
    For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
    jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
    constructor, and ilabels and jlabels are the row and column labels
    respectively.

    Parameters
    ----------
    ss : Series
    row_levels : tuple/list
    column_levels : tuple/list
    sort_labels : bool, default False
        Sort the row and column labels before forming the sparse matrix.
        When `row_levels` and/or `column_levels` refer to a single level,
        set to `True` for a faster execution.

    Returns
    -------
    values : numpy.ndarray
        Valid values to populate a sparse matrix, extracted from
        ss.
    i_coords : numpy.ndarray (row coordinates of the values)
    j_coords : numpy.ndarray (column coordinates of the values)
    i_labels : list (row labels)
    j_labels : list (column labels)
    """
    # index and column levels must be a partition of the index
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
    # From the sparse Series, get the integer indices and data for valid sparse
    # entries.
    sp_vals = ss.array.sp_values
    na_mask = notna(sp_vals)
    values = sp_vals[na_mask]
    valid_ilocs = ss.array.sp_index.indices[na_mask]

    i_coords, i_labels = _levels_to_axis(
        ss, row_levels, valid_ilocs, sort_labels=sort_labels
    )

    j_coords, j_labels = _levels_to_axis(
        ss, column_levels, valid_ilocs, sort_labels=sort_labels
    )

    return values, i_coords, j_coords, i_labels, j_labels


def sparse_series_to_coo(
    ss: Series,
    row_levels: Iterable[int] = (0,),
    column_levels: Iterable[int] = (1,),
    sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
    """
    Convert a sparse Series to a scipy.sparse.coo_matrix using index
    levels row_levels, column_levels as the row and column
    labels respectively. Returns the sparse_matrix, row and column labels.
    """
    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
    if not ss.index.is_unique:
        raise ValueError(
            "Duplicate index entries are not allowed in to_coo transformation."
        )

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(
        ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
    )
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns))
    )
    return sparse_matrix, rows, columns


def coo_to_sparse_series(
    A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
    """
    Convert a scipy.sparse.coo_matrix to a Series with type sparse.

    Parameters
    ----------
    A : scipy.sparse.coo_matrix
    dense_index : bool, default False

    Returns
    -------
    Series

    Raises
    ------
    TypeError if A is not a coo_matrix
    """
    from pandas import SparseDtype

    try:
        ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
    except AttributeError as err:
        raise TypeError(
            f"Expected coo_matrix. Got {type(A).__name__} instead."
        ) from err
    ser = ser.sort_index()
    ser = ser.astype(SparseDtype(ser.dtype))
    if dense_index:
        ind = MultiIndex.from_product([A.row, A.col])
        ser = ser.reindex(ind)
    return ser
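
For orientation, the level-partition rule that _check_is_partition enforces above, shown as illustrative calls (not part of the vendored file):

# For a 4-level MultiIndex, levels 0..3:
# _check_is_partition([(0, 1), (2, 3)], range(4))  # passes: disjoint and covers all levels
# _check_is_partition([(0, 1), (1, 2)], range(4))  # ValueError: intersection is not null
# _check_is_partition([(0,), (1,)], range(4))      # ValueError: union is not the whole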
657
lib/python3.13/site-packages/pandas/core/arrays/string_.py
Normal file
657
lib/python3.13/site-packages/pandas/core/arrays/string_.py
Normal file
@@ -0,0 +1,657 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    ClassVar,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
from pandas.compat import pa_version_under10p1
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import (
    FloatingArray,
    FloatingDtype,
)
from pandas.core.arrays.integer import (
    IntegerArray,
    IntegerDtype,
)
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import isna

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        AxisInt,
        Dtype,
        DtypeObj,
        NumpySorter,
        NumpyValueArrayLike,
        Scalar,
        Self,
        npt,
        type_t,
    )

    from pandas import Series


@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    # error: Cannot override instance variable (previously declared on
    # base class "StorageExtensionDtype") with class variable
    name: ClassVar[str] = "string"  # type: ignore[misc]

    #: StringDtype().na_value uses pandas.NA except the implementation that
    # follows NumPy semantics, which uses nan.
    @property
    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
        if self.storage == "pyarrow_numpy":
            return np.nan
        else:
            return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        if storage is None:
            infer_string = get_option("future.infer_string")
            if infer_string:
                storage = "pyarrow_numpy"
            else:
                storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
            raise ValueError(
                f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
                f"Got {storage} instead."
            )
        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
            raise ImportError(
                "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        return str

    @classmethod
    def construct_from_string(cls, string) -> Self:
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        elif string == "string[pyarrow_numpy]":
            return cls(storage="pyarrow_numpy")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import (
            ArrowStringArray,
            ArrowStringArrayNumpySemantics,
        )

        if self.storage == "python":
            return StringArray
        elif self.storage == "pyarrow":
            return ArrowStringArray
        else:
            return ArrowStringArrayNumpySemantics

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        elif self.storage == "pyarrow_numpy":
            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

            return ArrowStringArrayNumpySemantics(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # convert chunk by chunk to numpy and concatenate them, to avoid
                # overflow for large string data when concatenating the pyarrow arrays
                arr = arr.to_numpy(zero_copy_only=False)
                arr = ensure_string_array(arr, na_value=libmissing.NA)
                results.append(arr)

            if len(chunks) == 0:
                arr = np.array([], dtype=object)
            else:
                arr = np.concatenate(results)

            # Bypass validation inside StringArray constructor, see GH#47781
            new_string_array = StringArray.__new__(StringArray)
            NDArrayBacked.__init__(
                new_string_array,
                arr,
                StringDtype(storage="python"),
            )
            return new_string_array


class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())

    @classmethod
    def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
        if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]:
            # TODO: require any NAs be valid-for-string
            raise ValueError
        return cls._from_sequence(scalars, dtype=dtype)


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <NumpyExtensionArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the NumpyExtensionArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check whether we need to convert NA values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

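    # Illustrative note (not part of the pandas source): _validate accepts only
    # object-dtype arrays of str/NA and normalizes nan-likes in place, e.g.
    #   StringArray(np.array(["a", None], dtype=object))   # ok; None becomes pd.NA
    #   StringArray(np.array([1, 2], dtype=object))        # ValueError: not strings
    #   StringArray(np.array(["a", "b"], dtype=str))       # ValueError: not object dtype
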
    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if lib.is_pyarrow_array(scalars):
                # pyarrow array; we cannot rely on the "to_numpy" check in
                # ensure_string_array because calling scalars.to_numpy would set
                # zero_copy_only to True which caused problems see GH#52076
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating a new array avoids the validation step in __init__, so it is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value) -> None:
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract NumpyExtensionArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the NumpyExtensionArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas.core.algorithms import value_counts_internal as value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "NumpyExtensionArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray | BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            elif dtype == np.dtype("bool"):
                na_value = bool(na_value)
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
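
A short usage sketch of the python-backed string dtype defined above (illustrative only, not part of the vendored file):

import pandas as pd

arr = pd.array(["1", "2", None], dtype="string")   # routed through StringArray._from_sequence
arr == "1"                # BooleanArray: [True, False, <NA>]
arr.astype("Int64")       # IntegerArray([1, 2, <NA>]) via the astype() path above
arr.value_counts()        # Int64 counts with a string-dtype index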
719
lib/python3.13/site-packages/pandas/core/arrays/string_arrow.py
Normal file
719
lib/python3.13/site-packages/pandas/core/arrays/string_arrow.py
Normal file
@@ -0,0 +1,719 @@
from __future__ import annotations

from functools import partial
import operator
import re
from typing import (
    TYPE_CHECKING,
    Callable,
    Union,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under13p0,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
    BaseStringArray,
    StringDtype,
)
from pandas.core.ops import invalid_comparison
from pandas.core.strings.object_array import ObjectStringArrayMixin

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning


if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        Dtype,
        Scalar,
        npt,
    )

    from pandas import Series


ArrowStringScalarOrNAT = Union[str, libmissing.NAType]


def _chk_pyarrow_available() -> None:
    if pa_version_under10p1:
        msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)


# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
    """
    Extension array for string data in a ``pyarrow.ChunkedArray``.

    .. warning::

       ArrowStringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.

    Notes
    -----
    ArrowStringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
    <ArrowStringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string
    """

    # error: Incompatible types in assignment (expression has type "StringDtype",
    # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
    _dtype: StringDtype  # type: ignore[assignment]
    _storage = "pyarrow"

    def __init__(self, values) -> None:
        _chk_pyarrow_available()
        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
            values.type
        ):
            values = pc.cast(values, pa.large_string())

        super().__init__(values)
        self._dtype = StringDtype(storage=self._storage)

        if not pa.types.is_large_string(self._pa_array.type) and not (
            pa.types.is_dictionary(self._pa_array.type)
            and pa.types.is_large_string(self._pa_array.type.value_type)
        ):
            raise ValueError(
                "ArrowStringArray requires a PyArrow (chunked) array of "
                "large_string type"
            )

    @classmethod
    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
        pa_scalar = super()._box_pa_scalar(value, pa_type)
        if pa.types.is_string(pa_scalar.type) and pa_type is None:
            pa_scalar = pc.cast(pa_scalar, pa.large_string())
        return pa_scalar

    @classmethod
    def _box_pa_array(
        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
    ) -> pa.Array | pa.ChunkedArray:
        pa_array = super()._box_pa_array(value, pa_type)
        if pa.types.is_string(pa_array.type) and pa_type is None:
            pa_array = pc.cast(pa_array, pa.large_string())
        return pa_array

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._pa_array)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        from pandas.core.arrays.masked import BaseMaskedArray

        _chk_pyarrow_available()

        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage in (
                "pyarrow",
                "pyarrow_numpy",
            )

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype in ensure_string_array and
            # numerical issues with Float32Dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            return cls(pa.array(result, mask=na_values, type=pa.large_string()))
        elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            return cls(pc.cast(scalars, pa.large_string()))

        # convert non-na-likes to str
        result = lib.ensure_string_array(scalars, copy=copy)
        return cls(pa.array(result, type=pa.large_string(), from_pandas=True))

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @property
    def dtype(self) -> StringDtype:  # type: ignore[override]
        """
        An instance of 'string[pyarrow]'.
        """
        return self._dtype

    def insert(self, loc: int, item) -> ArrowStringArray:
        if not isinstance(item, str) and item is not libmissing.NA:
            raise TypeError("Scalar must be NA or str")
        return super().insert(loc, item)

    @classmethod
    def _result_converter(cls, values, na=None):
        return BooleanDtype().__from_arrow__(values)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if is_scalar(value):
            if isna(value):
                value = None
            elif not isinstance(value, str):
                raise TypeError("Scalar must be NA or str")
        else:
            value = np.array(value, dtype=object, copy=True)
            value[isna(value)] = None
            for v in value:
                if not (v is None or isinstance(v, str)):
                    raise TypeError("Scalar must be NA or str")
        return super()._maybe_convert_setitem_value(value)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        value_set = [
            pa_scalar.as_py()
            for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
            if pa_scalar.type in (pa.string(), pa.null(), pa.large_string())
        ]

        # short-circuit to return all False array.
        if not len(value_set):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(
            self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, NumericDtype):
            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
            return dtype.__from_arrow__(data)
        elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
            return self.to_numpy(dtype=dtype, na_value=np.nan)

        return super().astype(dtype, copy=copy)

    @property
    def _data(self):
        # dask accesses ._data directly
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed "
            "in a future version, use ._pa_array instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._pa_array

    # ------------------------------------------------------------------------
    # String methods interface

    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "ObjectStringArrayMixin" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        # TODO: de-duplicate with StringArray method. This method is more or less
        # copy-and-paste.

from pandas.arrays import (
|
||||
BooleanArray,
|
||||
IntegerArray,
|
||||
)
|
||||
|
||||
if dtype is None:
|
||||
dtype = self.dtype
|
||||
if na_value is None:
|
||||
na_value = self.dtype.na_value
|
||||
|
||||
mask = isna(self)
|
||||
arr = np.asarray(self)
|
||||
|
||||
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
|
||||
constructor: type[IntegerArray | BooleanArray]
|
||||
if is_integer_dtype(dtype):
|
||||
constructor = IntegerArray
|
||||
else:
|
||||
constructor = BooleanArray
|
||||
|
||||
na_value_is_na = isna(na_value)
|
||||
if na_value_is_na:
|
||||
na_value = 1
|
||||
result = lib.map_infer_mask(
|
||||
arr,
|
||||
f,
|
||||
mask.view("uint8"),
|
||||
convert=False,
|
||||
na_value=na_value,
|
||||
# error: Argument 1 to "dtype" has incompatible type
|
||||
# "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
|
||||
# "Type[object]"
|
||||
dtype=np.dtype(dtype), # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
if not na_value_is_na:
|
||||
mask[:] = False
|
||||
|
||||
return constructor(result, mask)
|
||||
|
||||
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
|
||||
# i.e. StringDtype
|
||||
result = lib.map_infer_mask(
|
||||
arr, f, mask.view("uint8"), convert=False, na_value=na_value
|
||||
)
|
||||
result = pa.array(
|
||||
result, mask=mask, type=pa.large_string(), from_pandas=True
|
||||
)
|
||||
return type(self)(result)
|
||||
else:
|
||||
# This is when the result type is object. We reach this when
|
||||
# -> We know the result type is truly object (e.g. .encode returns bytes
|
||||
# or .findall returns a list).
|
||||
# -> We don't know the result type. E.g. `.get` can return anything.
|
||||
return lib.map_infer_mask(arr, f, mask.view("uint8"))
|
||||
|
||||
def _str_contains(
|
||||
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
|
||||
):
|
||||
if flags:
|
||||
fallback_performancewarning()
|
||||
return super()._str_contains(pat, case, flags, na, regex)
|
||||
|
||||
if regex:
|
||||
result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
|
||||
else:
|
||||
result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
|
||||
result = self._result_converter(result, na=na)
|
||||
if not isna(na):
|
||||
result[isna(result)] = bool(na)
|
||||
return result
|
||||
|
||||
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
|
||||
if isinstance(pat, str):
|
||||
result = pc.starts_with(self._pa_array, pattern=pat)
|
||||
else:
|
||||
if len(pat) == 0:
|
||||
# mimic existing behaviour of string extension array
|
||||
# and python string method
|
||||
result = pa.array(
|
||||
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
|
||||
)
|
||||
else:
|
||||
result = pc.starts_with(self._pa_array, pattern=pat[0])
|
||||
|
||||
for p in pat[1:]:
|
||||
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
|
||||
if not isna(na):
|
||||
result = result.fill_null(na)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
|
||||
if isinstance(pat, str):
|
||||
result = pc.ends_with(self._pa_array, pattern=pat)
|
||||
else:
|
||||
if len(pat) == 0:
|
||||
# mimic existing behaviour of string extension array
|
||||
# and python string method
|
||||
result = pa.array(
|
||||
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
|
||||
)
|
||||
else:
|
||||
result = pc.ends_with(self._pa_array, pattern=pat[0])
|
||||
|
||||
for p in pat[1:]:
|
||||
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
|
||||
if not isna(na):
|
||||
result = result.fill_null(na)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_replace(
|
||||
self,
|
||||
pat: str | re.Pattern,
|
||||
repl: str | Callable,
|
||||
n: int = -1,
|
||||
case: bool = True,
|
||||
flags: int = 0,
|
||||
regex: bool = True,
|
||||
):
|
||||
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
|
||||
fallback_performancewarning()
|
||||
return super()._str_replace(pat, repl, n, case, flags, regex)
|
||||
|
||||
func = pc.replace_substring_regex if regex else pc.replace_substring
|
||||
result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
|
||||
return type(self)(result)
|
||||
|
||||
def _str_repeat(self, repeats: int | Sequence[int]):
|
||||
if not isinstance(repeats, int):
|
||||
return super()._str_repeat(repeats)
|
||||
else:
|
||||
return type(self)(pc.binary_repeat(self._pa_array, repeats))
|
||||
|
||||
def _str_match(
|
||||
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
|
||||
):
|
||||
if not pat.startswith("^"):
|
||||
pat = f"^{pat}"
|
||||
return self._str_contains(pat, case, flags, na, regex=True)
|
||||
|
||||
def _str_fullmatch(
|
||||
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
|
||||
):
|
||||
if not pat.endswith("$") or pat.endswith("\\$"):
|
||||
pat = f"{pat}$"
|
||||
return self._str_match(pat, case, flags, na)
|
||||
|
||||
def _str_slice(
|
||||
self, start: int | None = None, stop: int | None = None, step: int | None = None
|
||||
):
|
||||
if stop is None:
|
||||
return super()._str_slice(start, stop, step)
|
||||
if start is None:
|
||||
start = 0
|
||||
if step is None:
|
||||
step = 1
|
||||
return type(self)(
|
||||
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
|
||||
)
|
||||
|
||||
def _str_isalnum(self):
|
||||
result = pc.utf8_is_alnum(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isalpha(self):
|
||||
result = pc.utf8_is_alpha(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isdecimal(self):
|
||||
result = pc.utf8_is_decimal(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isdigit(self):
|
||||
result = pc.utf8_is_digit(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_islower(self):
|
||||
result = pc.utf8_is_lower(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isnumeric(self):
|
||||
result = pc.utf8_is_numeric(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isspace(self):
|
||||
result = pc.utf8_is_space(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_istitle(self):
|
||||
result = pc.utf8_is_title(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_isupper(self):
|
||||
result = pc.utf8_is_upper(self._pa_array)
|
||||
return self._result_converter(result)
|
||||
|
||||
def _str_len(self):
|
||||
result = pc.utf8_length(self._pa_array)
|
||||
return self._convert_int_dtype(result)
|
||||
|
||||
def _str_lower(self):
|
||||
return type(self)(pc.utf8_lower(self._pa_array))
|
||||
|
||||
def _str_upper(self):
|
||||
return type(self)(pc.utf8_upper(self._pa_array))
|
||||
|
||||
def _str_strip(self, to_strip=None):
|
||||
if to_strip is None:
|
||||
result = pc.utf8_trim_whitespace(self._pa_array)
|
||||
else:
|
||||
result = pc.utf8_trim(self._pa_array, characters=to_strip)
|
||||
return type(self)(result)
|
||||
|
||||
def _str_lstrip(self, to_strip=None):
|
||||
if to_strip is None:
|
||||
result = pc.utf8_ltrim_whitespace(self._pa_array)
|
||||
else:
|
||||
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
|
||||
return type(self)(result)
|
||||
|
||||
def _str_rstrip(self, to_strip=None):
|
||||
if to_strip is None:
|
||||
result = pc.utf8_rtrim_whitespace(self._pa_array)
|
||||
else:
|
||||
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
|
||||
return type(self)(result)
|
||||
|
||||
def _str_removeprefix(self, prefix: str):
|
||||
if not pa_version_under13p0:
|
||||
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
|
||||
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
|
||||
result = pc.if_else(starts_with, removed, self._pa_array)
|
||||
return type(self)(result)
|
||||
return super()._str_removeprefix(prefix)
|
||||
|
||||
def _str_removesuffix(self, suffix: str):
|
||||
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
|
||||
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
|
||||
result = pc.if_else(ends_with, removed, self._pa_array)
|
||||
return type(self)(result)
|
||||
|
||||
def _str_count(self, pat: str, flags: int = 0):
|
||||
if flags:
|
||||
return super()._str_count(pat, flags)
|
||||
result = pc.count_substring_regex(self._pa_array, pat)
|
||||
return self._convert_int_dtype(result)
|
||||
|
||||
    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if start != 0 and end is not None:
            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
            result = pc.find_substring(slices, sub)
            not_found = pc.equal(result, -1)
            # find_substring reports positions relative to the slice; shift
            # matches back by the slice start (the original ``end - start``
            # offset pointed past the true position)
            offset_result = pc.add(result, start)
            result = pc.if_else(not_found, result, offset_result)
        elif start == 0 and end is None:
            slices = self._pa_array
            result = pc.find_substring(slices, sub)
        else:
            return super()._str_find(sub, start, end)
        return self._convert_int_dtype(result)
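find_substring reports match positions relative to whatever array it searched, which is why the sliced branch must shift results back by the slice start. A standalone check against str.find, assuming a non-negative start:

import pyarrow as pa
import pyarrow.compute as pc

s, sub, start, end = "abcabc", "bc", 2, 6
sliced = pc.utf8_slice_codeunits(pa.array([s]), start, stop=end)
rel = pc.find_substring(sliced, sub)  # 2, i.e. position within "cabc"
print(pc.add(rel, start)[0].as_py())  # 4
print(s.find(sub, start, end))        # 4, agrees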
def _str_get_dummies(self, sep: str = "|"):
|
||||
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
|
||||
if len(labels) == 0:
|
||||
return np.empty(shape=(0, 0), dtype=np.int64), labels
|
||||
dummies = np.vstack(dummies_pa.to_numpy())
|
||||
return dummies.astype(np.int64, copy=False), labels
|
||||
|
||||
    def _convert_int_dtype(self, result):
        return Int64Dtype().__from_arrow__(result)
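Int64Dtype.__from_arrow__ converts a pyarrow integer array into pandas' masked Int64 extension array, so nulls survive as pd.NA instead of forcing a float cast. A minimal standalone sketch:

import pyarrow as pa
from pandas import Int64Dtype

counts = pa.chunked_array([[3, None, 7]], type=pa.int64())
print(Int64Dtype().__from_arrow__(counts))  # IntegerArray: [3, <NA>, 7]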
    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        if name in ("argmin", "argmax") and isinstance(result, pa.Array):
            return self._convert_int_dtype(result)
        elif isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result
    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        return self._convert_int_dtype(
            self._rank_calc(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
        )

class ArrowStringArrayNumpySemantics(ArrowStringArray):
    _storage = "pyarrow_numpy"

    @classmethod
    def _result_converter(cls, values, na=None):
        if not isna(na):
            values = values.fill_null(bool(na))
        return ArrowExtensionArray(values).to_numpy(na_value=np.nan)

    def __getattribute__(self, item):
        # Both this class and ArrowStringArray inherit from
        # ArrowExtensionArray, so plain attribute lookup suffers from
        # diamond-inheritance ambiguity; bind the mixin's implementation
        # explicitly instead.
        if item in ArrowStringArrayMixin.__dict__ and item not in (
            "_pa_array",
            "__dict__",
        ):
            return partial(getattr(ArrowStringArrayMixin, item), self)
        return super().__getattribute__(item)
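A toy standalone sketch of the same dispatch trick, with purely illustrative class names: binding the mixin's function through partial hands back a bound method that bypasses the MRO entirely.

from functools import partial

class Base:
    def greet(self):
        return "base"

class Mixin:
    def greet(self):
        return "mixin"

class Child(Base):
    def __getattribute__(self, item):
        # force the Mixin implementation for any name the mixin defines
        if item in Mixin.__dict__:
            return partial(getattr(Mixin, item), self)
        return super().__getattribute__(item)

print(Child().greet())  # "mixin", although Child does not inherit from Mixin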
    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        if dtype is None:
            dtype = self.dtype
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            if is_integer_dtype(dtype):
                na_value = np.nan
            else:
                na_value = False
            try:
                result = lib.map_infer_mask(
                    arr,
                    f,
                    mask.view("uint8"),
                    convert=False,
                    na_value=na_value,
                    dtype=np.dtype(dtype),  # type: ignore[arg-type]
                )
                return result
            except ValueError:
                result = lib.map_infer_mask(
                    arr,
                    f,
                    mask.view("uint8"),
                    convert=False,
                    na_value=na_value,
                )
                if convert and result.dtype == object:
                    result = lib.maybe_convert_objects(result)
                return result
        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            result = pa.array(
                result, mask=mask, type=pa.large_string(), from_pandas=True
            )
            return type(self)(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns
            #    bytes or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
    def _convert_int_dtype(self, result):
        if isinstance(result, pa.Array):
            result = result.to_numpy(zero_copy_only=False)
        else:
            result = result.to_numpy()
        if result.dtype == np.int32:
            result = result.astype(np.int64)
        return result
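The int32 check exists because pyarrow's length/index kernels return int32 for string arrays and int64 only for large_string, while pandas expects int64 from methods like str.len(). A quick standalone illustration:

import pyarrow as pa
import pyarrow.compute as pc

print(pc.utf8_length(pa.array(["ab"], type=pa.string())).type)        # int32
print(pc.utf8_length(pa.array(["ab"], type=pa.large_string())).type)  # int64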
    def _cmp_method(self, other, op):
        try:
            result = super()._cmp_method(other, op)
        except pa.ArrowNotImplementedError:
            return invalid_comparison(self, other, op)
        if op == operator.ne:
            # missing values compare as not-equal, matching numpy semantics
            return result.to_numpy(np.bool_, na_value=True)
        else:
            return result.to_numpy(np.bool_, na_value=False)
    def value_counts(self, dropna: bool = True) -> Series:
        from pandas import Series

        result = super().value_counts(dropna)
        return Series(
            result._values.to_numpy(), index=result.index, name=result.name, copy=False
        )
    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        if name in ["any", "all"]:
            if not skipna and name == "all":
                # combine the non-empty test with the validity mask via
                # Kleene logic, so a missing value maps to False and
                # therefore fails "all"
                nas = pc.invert(pc.is_null(self._pa_array))
                arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, ""))
            else:
                arr = pc.not_equal(self._pa_array, "")
            return ArrowExtensionArray(arr)._reduce(
                name, skipna=skipna, keepdims=keepdims, **kwargs
            )
        else:
            return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
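The and_kleene combination relies on three-valued logic, where false absorbs null but true combined with null stays null. A minimal standalone truth-table sketch:

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([True, True, False, None])
b = pa.array([True, None, None, None])
print(pc.and_kleene(a, b))  # [true, null, false, null]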
    def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics:
        if item is np.nan:
            item = libmissing.NA
        return super().insert(loc, item)  # type: ignore[return-value]
1185
lib/python3.13/site-packages/pandas/core/arrays/timedeltas.py
Normal file
File diff suppressed because it is too large