Updated script that can be controlled by a Node.js web app
This commit is contained in:
9
lib/python3.13/site-packages/pandas/tests/base/common.py
Normal file
9
lib/python3.13/site-packages/pandas/tests/base/common.py
Normal file
@ -0,0 +1,9 @@
|
||||
from typing import Any
|
||||
|
||||
from pandas import Index
|
||||
|
||||
|
||||
def allow_na_ops(obj: Any) -> bool:
    """Return whether NA-based test cases can be run against ``obj``.

    An ``Index`` whose inferred type is "boolean" is always rejected.
    For every other object the result is whatever ``obj._can_hold_na``
    reports. A falsy result means NaN-related test cases should be skipped.
    """
    # Boolean indexes are excluded up front, before ``_can_hold_na`` is read.
    if isinstance(obj, Index) and obj.inferred_type == "boolean":
        return False
    return obj._can_hold_na
|
@ -0,0 +1,179 @@
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PYPY
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.accessor import PandasDelegate
|
||||
from pandas.core.base import (
|
||||
NoNewAttributesMixin,
|
||||
PandasObject,
|
||||
)
|
||||
|
||||
|
||||
def series_via_frame_from_dict(x, **kwargs):
    """Build a Series by placing ``x`` in column "a" of a DataFrame and selecting it.

    Extra keyword arguments are forwarded to the ``DataFrame`` constructor.
    """
    frame = DataFrame({"a": x}, **kwargs)
    return frame["a"]
|
||||
|
||||
|
||||
def series_via_frame_from_scalar(x, **kwargs):
    """Build a Series by passing ``x`` straight to DataFrame and selecting column 0.

    Extra keyword arguments are forwarded to the ``DataFrame`` constructor.
    """
    frame = DataFrame(x, **kwargs)
    return frame[0]
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
Series,
|
||||
series_via_frame_from_dict,
|
||||
series_via_frame_from_scalar,
|
||||
Index,
|
||||
],
|
||||
ids=["Series", "DataFrame-dict", "DataFrame-array", "Index"],
|
||||
)
|
||||
def constructor(request):
|
||||
return request.param
|
||||
|
||||
|
||||
class TestPandasDelegate:
|
||||
class Delegator:
|
||||
_properties = ["prop"]
|
||||
_methods = ["test_method"]
|
||||
|
||||
def _set_prop(self, value):
|
||||
self.prop = value
|
||||
|
||||
def _get_prop(self):
|
||||
return self.prop
|
||||
|
||||
prop = property(_get_prop, _set_prop, doc="foo property")
|
||||
|
||||
def test_method(self, *args, **kwargs):
|
||||
"""a test method"""
|
||||
|
||||
class Delegate(PandasDelegate, PandasObject):
|
||||
def __init__(self, obj) -> None:
|
||||
self.obj = obj
|
||||
|
||||
def test_invalid_delegation(self):
|
||||
# these show that in order for the delegation to work
|
||||
# the _delegate_* methods need to be overridden to not raise
|
||||
# a TypeError
|
||||
|
||||
self.Delegate._add_delegate_accessors(
|
||||
delegate=self.Delegator,
|
||||
accessors=self.Delegator._properties,
|
||||
typ="property",
|
||||
)
|
||||
self.Delegate._add_delegate_accessors(
|
||||
delegate=self.Delegator, accessors=self.Delegator._methods, typ="method"
|
||||
)
|
||||
|
||||
delegate = self.Delegate(self.Delegator())
|
||||
|
||||
msg = "You cannot access the property prop"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
delegate.prop
|
||||
|
||||
msg = "The property prop cannot be set"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
delegate.prop = 5
|
||||
|
||||
msg = "You cannot access the property prop"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
delegate.prop
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
|
||||
def test_memory_usage(self):
|
||||
# Delegate does not implement memory_usage.
|
||||
# Check that we fall back to in-built `__sizeof__`
|
||||
# GH 12924
|
||||
delegate = self.Delegate(self.Delegator())
|
||||
sys.getsizeof(delegate)
|
||||
|
||||
|
||||
class TestNoNewAttributesMixin:
|
||||
def test_mixin(self):
|
||||
class T(NoNewAttributesMixin):
|
||||
pass
|
||||
|
||||
t = T()
|
||||
assert not hasattr(t, "__frozen")
|
||||
|
||||
t.a = "test"
|
||||
assert t.a == "test"
|
||||
|
||||
t._freeze()
|
||||
assert "__frozen" in dir(t)
|
||||
assert getattr(t, "__frozen")
|
||||
msg = "You cannot add any new attribute"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
t.b = "test"
|
||||
|
||||
assert not hasattr(t, "b")
|
||||
|
||||
|
||||
class TestConstruction:
|
||||
# test certain constructor behaviours on dtype inference across Series,
|
||||
# Index and DataFrame
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"a",
|
||||
[
|
||||
np.array(["2263-01-01"], dtype="datetime64[D]"),
|
||||
np.array([datetime(2263, 1, 1)], dtype=object),
|
||||
np.array([np.datetime64("2263-01-01", "D")], dtype=object),
|
||||
np.array(["2263-01-01"], dtype=object),
|
||||
],
|
||||
ids=[
|
||||
"datetime64[D]",
|
||||
"object-datetime.datetime",
|
||||
"object-numpy-scalar",
|
||||
"object-string",
|
||||
],
|
||||
)
|
||||
def test_constructor_datetime_outofbound(
|
||||
self, a, constructor, request, using_infer_string
|
||||
):
|
||||
# GH-26853 (+ bug GH-26206 out of bound non-ns unit)
|
||||
|
||||
# No dtype specified (dtype inference)
|
||||
# datetime64[non-ns] raise error, other cases result in object dtype
|
||||
# and preserve original data
|
||||
if a.dtype.kind == "M":
|
||||
# Can't fit in nanosecond bounds -> get the nearest supported unit
|
||||
result = constructor(a)
|
||||
assert result.dtype == "M8[s]"
|
||||
else:
|
||||
result = constructor(a)
|
||||
if using_infer_string and "object-string" in request.node.callspec.id:
|
||||
assert result.dtype == "string"
|
||||
else:
|
||||
assert result.dtype == "object"
|
||||
tm.assert_numpy_array_equal(result.to_numpy(), a)
|
||||
|
||||
# Explicit dtype specified
|
||||
# Forced conversion fails for all -> all cases raise error
|
||||
msg = "Out of bounds|Out of bounds .* present at position 0"
|
||||
with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
|
||||
constructor(a, dtype="datetime64[ns]")
|
||||
|
||||
def test_constructor_datetime_nonns(self, constructor):
|
||||
arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
|
||||
dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype)
|
||||
expected = constructor(dta)
|
||||
assert expected.dtype == arr.dtype
|
||||
|
||||
result = constructor(arr)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# https://github.com/pandas-dev/pandas/issues/34843
|
||||
arr.flags.writeable = False
|
||||
result = constructor(arr)
|
||||
tm.assert_equal(result, expected)
|
@ -0,0 +1,562 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import (
|
||||
DatetimeArray,
|
||||
IntervalArray,
|
||||
NumpyExtensionArray,
|
||||
PeriodArray,
|
||||
SparseArray,
|
||||
TimedeltaArray,
|
||||
)
|
||||
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
|
||||
|
||||
|
||||
class TestToIterable:
    """Check that iterating an Index/Series yields Python / pandas scalars."""

    # test that we convert an iterable to python types

    # Pairs of (dtype used to build the object, scalar type expected back).
    dtypes = [
        ("int8", int),
        ("int16", int),
        ("int32", int),
        ("int64", int),
        ("uint8", int),
        ("uint16", int),
        ("uint32", int),
        ("uint64", int),
        ("float16", float),
        ("float32", float),
        ("float64", float),
        ("datetime64[ns]", Timestamp),
        ("datetime64[ns, US/Eastern]", Timestamp),
        ("timedelta64[ns]", Timedelta),
    ]

    # The four ways of turning an object into a list that the tests below
    # exercise. Defined once and reused as a decorator so the three tests
    # that parametrize over ``method`` cannot drift apart.
    _list_methods = pytest.mark.parametrize(
        "method",
        [
            lambda x: x.tolist(),
            lambda x: x.to_list(),
            lambda x: list(x),
            lambda x: list(x.__iter__()),
        ],
        ids=["tolist", "to_list", "list", "iter"],
    )

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    @_list_methods
    def test_iterable(self, index_or_series, method, dtype, rdtype):
        """The first listed element is an instance of ``rdtype``."""
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        if dtype == "float16" and issubclass(typ, pd.Index):
            # Constructing a float16 Index is expected to fail outright.
            with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                typ([1], dtype=dtype)
            return
        s = typ([1], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype, obj",
        [
            ("object", object, "a"),
            ("object", int, 1),
            ("category", object, "a"),
            ("category", int, 1),
        ],
    )
    @_list_methods
    def test_iterable_object_and_category(
        self, index_or_series, method, dtype, rdtype, obj
    ):
        """Same check as ``test_iterable`` for object and category dtypes."""
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        s = typ([obj], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    def test_iterable_items(self, dtype, rdtype):
        """``Series.items`` yields values that are instances of ``rdtype``."""
        # gh-13258
        # test if items yields the correct boxed scalars
        # this only applies to series
        s = Series([1], dtype=dtype)
        # The original repeated this exact check twice; once is sufficient.
        _, result = next(iter(s.items()))
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype", dtypes + [("object", int), ("category", int)]
    )
    def test_iterable_map(self, index_or_series, dtype, rdtype):
        """``map(type)`` reports one of the expected scalar types."""
        # gh-13236
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        if dtype == "float16" and issubclass(typ, pd.Index):
            # Constructing a float16 Index is expected to fail outright.
            with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                typ([1], dtype=dtype)
            return
        s = typ([1], dtype=dtype)
        result = s.map(type)[0]
        # Normalise to a tuple so a single type and a tuple of types are
        # handled by the same membership check.
        if not isinstance(rdtype, tuple):
            rdtype = (rdtype,)
        assert result in rdtype

    @_list_methods
    def test_categorial_datetimelike(self, method):
        """Listing a CategoricalIndex of Timestamps gives back Timestamps."""
        i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])

        result = method(i)[0]
        assert isinstance(result, Timestamp)

    def test_iter_box_dt64(self, unit):
        """Iterating tz-naive datetime64 data yields Timestamps in ``unit``."""
        vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")]
        ser = Series(vals).dt.as_unit(unit)
        assert ser.dtype == f"datetime64[{unit}]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timestamp)
            assert res.tz is None
            assert res == exp
            assert res.unit == unit

    def test_iter_box_dt64tz(self, unit):
        """Iterating tz-aware datetime64 data keeps both tz and ``unit``."""
        vals = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-01-02", tz="US/Eastern"),
        ]
        ser = Series(vals).dt.as_unit(unit)

        assert ser.dtype == f"datetime64[{unit}, US/Eastern]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timestamp)
            assert res.tz == exp.tz
            assert res == exp
            assert res.unit == unit

    def test_iter_box_timedelta64(self, unit):
        """Iterating timedelta64 data yields Timedeltas in ``unit``."""
        # timedelta
        vals = [Timedelta("1 days"), Timedelta("2 days")]
        ser = Series(vals).dt.as_unit(unit)
        assert ser.dtype == f"timedelta64[{unit}]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timedelta)
            assert res == exp
            assert res.unit == unit

    def test_iter_box_period(self):
        """Iterating a Period[M] Series yields Period scalars."""
        # period
        vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
        s = Series(vals)
        assert s.dtype == "Period[M]"
        for res, exp in zip(s, vals):
            assert isinstance(res, pd.Period)
            assert res.freq == "ME"
            assert res == exp
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arr, expected_type, dtype",
|
||||
[
|
||||
(np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
|
||||
(np.array(["a", "b"]), np.ndarray, "object"),
|
||||
(pd.Categorical(["a", "b"]), pd.Categorical, "category"),
|
||||
(
|
||||
pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
|
||||
DatetimeArray,
|
||||
"datetime64[ns, US/Central]",
|
||||
),
|
||||
(
|
||||
pd.PeriodIndex([2018, 2019], freq="Y"),
|
||||
PeriodArray,
|
||||
pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"),
|
||||
),
|
||||
(pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
|
||||
(
|
||||
pd.DatetimeIndex(["2017", "2018"]),
|
||||
DatetimeArray,
|
||||
"datetime64[ns]",
|
||||
),
|
||||
(
|
||||
pd.TimedeltaIndex([10**10]),
|
||||
TimedeltaArray,
|
||||
"m8[ns]",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_values_consistent(arr, expected_type, dtype, using_infer_string):
|
||||
if using_infer_string and dtype == "object":
|
||||
expected_type = ArrowStringArrayNumpySemantics
|
||||
l_values = Series(arr)._values
|
||||
r_values = pd.Index(arr)._values
|
||||
assert type(l_values) is expected_type
|
||||
assert type(l_values) is type(r_values)
|
||||
|
||||
tm.assert_equal(l_values, r_values)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arr", [np.array([1, 2, 3])])
|
||||
def test_numpy_array(arr):
|
||||
ser = Series(arr)
|
||||
result = ser.array
|
||||
expected = NumpyExtensionArray(arr)
|
||||
tm.assert_extension_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_numpy_array_all_dtypes(any_numpy_dtype):
|
||||
ser = Series(dtype=any_numpy_dtype)
|
||||
result = ser.array
|
||||
if np.dtype(any_numpy_dtype).kind == "M":
|
||||
assert isinstance(result, DatetimeArray)
|
||||
elif np.dtype(any_numpy_dtype).kind == "m":
|
||||
assert isinstance(result, TimedeltaArray)
|
||||
else:
|
||||
assert isinstance(result, NumpyExtensionArray)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arr, attr",
|
||||
[
|
||||
(pd.Categorical(["a", "b"]), "_codes"),
|
||||
(PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]"), "_ndarray"),
|
||||
(pd.array([0, np.nan], dtype="Int64"), "_data"),
|
||||
(IntervalArray.from_breaks([0, 1]), "_left"),
|
||||
(SparseArray([0, 1]), "_sparse_values"),
|
||||
(
|
||||
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
|
||||
"_ndarray",
|
||||
),
|
||||
# tz-aware Datetime
|
||||
(
|
||||
DatetimeArray._from_sequence(
|
||||
np.array(
|
||||
["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"
|
||||
),
|
||||
dtype=DatetimeTZDtype(tz="US/Central"),
|
||||
),
|
||||
"_ndarray",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_array(arr, attr, index_or_series, request):
|
||||
box = index_or_series
|
||||
|
||||
result = box(arr, copy=False).array
|
||||
|
||||
if attr:
|
||||
arr = getattr(arr, attr)
|
||||
result = getattr(result, attr)
|
||||
|
||||
assert result is arr
|
||||
|
||||
|
||||
def test_array_multiindex_raises():
    """Accessing ``.array`` on a MultiIndex raises ``ValueError``."""
    levels = [["A"], ["a", "b"]]
    multi = pd.MultiIndex.from_product(levels)
    with pytest.raises(ValueError, match="MultiIndex has no single backing array"):
        multi.array
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arr, expected",
|
||||
[
|
||||
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
|
||||
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
|
||||
(
|
||||
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
|
||||
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
|
||||
),
|
||||
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
|
||||
(
|
||||
IntervalArray.from_breaks([0, 1, 2]),
|
||||
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
|
||||
),
|
||||
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
|
||||
# tz-naive datetime
|
||||
(
|
||||
DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")),
|
||||
np.array(["2000", "2001"], dtype="M8[ns]"),
|
||||
),
|
||||
# tz-aware stays tz-aware
|
||||
(
|
||||
DatetimeArray._from_sequence(
|
||||
np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]")
|
||||
)
|
||||
.tz_localize("UTC")
|
||||
.tz_convert("US/Central"),
|
||||
np.array(
|
||||
[
|
||||
Timestamp("2000-01-01", tz="US/Central"),
|
||||
Timestamp("2000-01-02", tz="US/Central"),
|
||||
]
|
||||
),
|
||||
),
|
||||
# Timedelta
|
||||
(
|
||||
TimedeltaArray._from_sequence(
|
||||
np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
|
||||
),
|
||||
np.array([0, 3600000000000], dtype="m8[ns]"),
|
||||
),
|
||||
# GH#26406 tz is preserved in Categorical[dt64tz]
|
||||
(
|
||||
pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
|
||||
np.array(
|
||||
[
|
||||
Timestamp("2016-01-01", tz="US/Pacific"),
|
||||
Timestamp("2016-01-02", tz="US/Pacific"),
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_numpy(arr, expected, index_or_series_or_array, request):
|
||||
box = index_or_series_or_array
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
thing = box(arr)
|
||||
|
||||
result = thing.to_numpy()
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
result = np.asarray(thing)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_series", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
|
||||
)
|
||||
def test_to_numpy_copy(arr, as_series, using_infer_string):
|
||||
obj = pd.Index(arr, copy=False)
|
||||
if as_series:
|
||||
obj = Series(obj.values, copy=False)
|
||||
|
||||
# no copy by default
|
||||
result = obj.to_numpy()
|
||||
if using_infer_string and arr.dtype == object:
|
||||
assert np.shares_memory(arr, result) is False
|
||||
else:
|
||||
assert np.shares_memory(arr, result) is True
|
||||
|
||||
result = obj.to_numpy(copy=False)
|
||||
if using_infer_string and arr.dtype == object:
|
||||
assert np.shares_memory(arr, result) is False
|
||||
else:
|
||||
assert np.shares_memory(arr, result) is True
|
||||
|
||||
# copy=True
|
||||
result = obj.to_numpy(copy=True)
|
||||
assert np.shares_memory(arr, result) is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_series", [True, False])
|
||||
def test_to_numpy_dtype(as_series, unit):
|
||||
tz = "US/Eastern"
|
||||
obj = pd.DatetimeIndex(["2000", "2001"], tz=tz)
|
||||
if as_series:
|
||||
obj = Series(obj)
|
||||
|
||||
# preserve tz by default
|
||||
result = obj.to_numpy()
|
||||
expected = np.array(
|
||||
[Timestamp("2000", tz=tz), Timestamp("2001", tz=tz)], dtype=object
|
||||
)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
result = obj.to_numpy(dtype="object")
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
result = obj.to_numpy(dtype="M8[ns]")
|
||||
expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, dtype, na_value, expected",
|
||||
[
|
||||
([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
|
||||
(
|
||||
[Timestamp("2000"), Timestamp("2000"), pd.NaT],
|
||||
None,
|
||||
Timestamp("2000"),
|
||||
[np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_numpy_na_value_numpy_dtype(
|
||||
index_or_series, values, dtype, na_value, expected
|
||||
):
|
||||
obj = index_or_series(values)
|
||||
result = obj.to_numpy(dtype=dtype, na_value=na_value)
|
||||
expected = np.array(expected)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, multiindex, dtype, na_value, expected",
|
||||
[
|
||||
(
|
||||
[1, 2, None, 4],
|
||||
[(0, "a"), (0, "b"), (1, "b"), (1, "c")],
|
||||
float,
|
||||
None,
|
||||
[1.0, 2.0, np.nan, 4.0],
|
||||
),
|
||||
(
|
||||
[1, 2, None, 4],
|
||||
[(0, "a"), (0, "b"), (1, "b"), (1, "c")],
|
||||
float,
|
||||
np.nan,
|
||||
[1.0, 2.0, np.nan, 4.0],
|
||||
),
|
||||
(
|
||||
[1.0, 2.0, np.nan, 4.0],
|
||||
[("a", 0), ("a", 1), ("a", 2), ("b", 0)],
|
||||
int,
|
||||
0,
|
||||
[1, 2, 0, 4],
|
||||
),
|
||||
(
|
||||
[Timestamp("2000"), Timestamp("2000"), pd.NaT],
|
||||
[(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))],
|
||||
None,
|
||||
Timestamp("2000"),
|
||||
[np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_numpy_multiindex_series_na_value(
|
||||
data, multiindex, dtype, na_value, expected
|
||||
):
|
||||
index = pd.MultiIndex.from_tuples(multiindex)
|
||||
series = Series(data, index=index)
|
||||
result = series.to_numpy(dtype=dtype, na_value=na_value)
|
||||
expected = np.array(expected)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_numpy_kwargs_raises():
    """``Series.to_numpy`` rejects unknown keyword arguments with ``TypeError``.

    Checked for a default-dtype Series and for an "Int64" Series.
    """
    msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'"

    # numpy
    numpy_backed = Series([1, 2, 3])
    with pytest.raises(TypeError, match=msg):
        numpy_backed.to_numpy(foo=True)

    # extension
    extension_backed = Series([1, 2, 3], dtype="Int64")
    with pytest.raises(TypeError, match=msg):
        extension_backed.to_numpy(foo=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
{"a": [1, 2, 3], "b": [1, 2, None]},
|
||||
{"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
|
||||
{"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
|
||||
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
|
||||
# https://github.com/pandas-dev/pandas/issues/33820
|
||||
df = pd.DataFrame(data)
|
||||
result = df.to_numpy(dtype=dtype, na_value=na_value)
|
||||
expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, expected",
|
||||
[
|
||||
(
|
||||
{"a": pd.array([1, 2, None])},
|
||||
np.array([[1.0], [2.0], [np.nan]], dtype=float),
|
||||
),
|
||||
(
|
||||
{"a": [1, 2, 3], "b": [1, 2, 3]},
|
||||
np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_numpy_dataframe_single_block(data, expected):
|
||||
# https://github.com/pandas-dev/pandas/issues/33820
|
||||
df = pd.DataFrame(data)
|
||||
result = df.to_numpy(dtype=float, na_value=np.nan)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_numpy_dataframe_single_block_no_mutate():
    """``DataFrame.to_numpy(na_value=...)`` must not modify the frame it is called on."""
    # https://github.com/pandas-dev/pandas/issues/33820

    def make_frame():
        # A fresh array on every call, so the two frames share no buffer.
        return pd.DataFrame(np.array([1.0, 2.0, np.nan]))

    frame = make_frame()
    untouched = make_frame()
    # The return value is deliberately discarded; only the effect on
    # ``frame`` itself is checked.
    frame.to_numpy(na_value=0.0)
    tm.assert_frame_equal(frame, untouched)
|
||||
|
||||
|
||||
class TestAsArray:
|
||||
@pytest.mark.parametrize("tz", [None, "US/Central"])
|
||||
def test_asarray_object_dt64(self, tz):
|
||||
ser = Series(date_range("2000", periods=2, tz=tz))
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
# Future behavior (for tzaware case) with no warning
|
||||
result = np.asarray(ser, dtype=object)
|
||||
|
||||
expected = np.array(
|
||||
[Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)]
|
||||
)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_asarray_tz_naive(self):
|
||||
# This shouldn't produce a warning.
|
||||
ser = Series(date_range("2000", periods=2))
|
||||
expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
|
||||
result = np.asarray(ser)
|
||||
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_asarray_tz_aware(self):
|
||||
tz = "US/Central"
|
||||
ser = Series(date_range("2000", periods=2, tz=tz))
|
||||
expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]")
|
||||
result = np.asarray(ser, dtype="datetime64[ns]")
|
||||
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# Old behavior with no warning
|
||||
result = np.asarray(ser, dtype="M8[ns]")
|
||||
|
||||
tm.assert_numpy_array_equal(result, expected)
|
@ -0,0 +1,60 @@
|
||||
"""
|
||||
Though Index.fillna and Series.fillna have separate implementations,
|
||||
test here to confirm that they behave the same
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import MultiIndex
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
def test_fillna(index_or_series_obj):
|
||||
# GH 11343
|
||||
obj = index_or_series_obj
|
||||
|
||||
if isinstance(obj, MultiIndex):
|
||||
msg = "isna is not defined for MultiIndex"
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
obj.fillna(0)
|
||||
return
|
||||
|
||||
# values will not be changed
|
||||
fill_value = obj.values[0] if len(obj) > 0 else 0
|
||||
result = obj.fillna(fill_value)
|
||||
|
||||
tm.assert_equal(obj, result)
|
||||
|
||||
# check shallow_copied
|
||||
assert obj is not result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
|
||||
def test_fillna_null(null_obj, index_or_series_obj):
|
||||
# GH 11343
|
||||
obj = index_or_series_obj
|
||||
klass = type(obj)
|
||||
|
||||
if not allow_na_ops(obj):
|
||||
pytest.skip(f"{klass} doesn't allow for NA operations")
|
||||
elif len(obj) < 1:
|
||||
pytest.skip("Test doesn't make sense on empty data")
|
||||
elif isinstance(obj, MultiIndex):
|
||||
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
|
||||
|
||||
values = obj._values
|
||||
fill_value = values[0]
|
||||
expected = values.copy()
|
||||
values[0:2] = null_obj
|
||||
expected[0:2] = fill_value
|
||||
|
||||
expected = klass(expected)
|
||||
obj = klass(values)
|
||||
|
||||
result = obj.fillna(fill_value)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# check shallow_copied
|
||||
assert obj is not result
|
191
lib/python3.13/site-packages/pandas/tests/base/test_misc.py
Normal file
191
lib/python3.13/site-packages/pandas/tests/base/test_misc.py
Normal file
@ -0,0 +1,191 @@
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
from pandas.compat import PYPY
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_dtype_equal,
|
||||
is_object_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_isnull_notnull_docstrings():
    """The ``isnull`` / ``notnull`` docstrings open by naming the method they alias."""
    # GH#41855 make sure it's clear these are aliases
    alias_headers = [
        (pd.DataFrame.notnull, "\nDataFrame.notnull is an alias for DataFrame.notna.\n"),
        (pd.DataFrame.isnull, "\nDataFrame.isnull is an alias for DataFrame.isna.\n"),
        (Series.notnull, "\nSeries.notnull is an alias for Series.notna.\n"),
        (Series.isnull, "\nSeries.isnull is an alias for Series.isna.\n"),
    ]
    for alias, header in alias_headers:
        doc = alias.__doc__
        assert doc.startswith(header)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"op_name, op",
|
||||
[
|
||||
("add", "+"),
|
||||
("sub", "-"),
|
||||
("mul", "*"),
|
||||
("mod", "%"),
|
||||
("pow", "**"),
|
||||
("truediv", "/"),
|
||||
("floordiv", "//"),
|
||||
],
|
||||
)
|
||||
def test_binary_ops_docstring(frame_or_series, op_name, op):
|
||||
# not using the all_arithmetic_functions fixture with _get_opstr
|
||||
# as _get_opstr is used internally in the dynamic implementation of the docstring
|
||||
klass = frame_or_series
|
||||
|
||||
operand1 = klass.__name__.lower()
|
||||
operand2 = "other"
|
||||
expected_str = " ".join([operand1, op, operand2])
|
||||
assert expected_str in getattr(klass, op_name).__doc__
|
||||
|
||||
# reverse version of the binary ops
|
||||
expected_str = " ".join([operand2, op, operand1])
|
||||
assert expected_str in getattr(klass, "r" + op_name).__doc__
|
||||
|
||||
|
||||
def test_ndarray_compat_properties(index_or_series_obj):
|
||||
obj = index_or_series_obj
|
||||
|
||||
# Check that we work.
|
||||
for p in ["shape", "dtype", "T", "nbytes"]:
|
||||
assert getattr(obj, p, None) is not None
|
||||
|
||||
# deprecated properties
|
||||
for p in ["strides", "itemsize", "base", "data"]:
|
||||
assert not hasattr(obj, p)
|
||||
|
||||
msg = "can only convert an array of size 1 to a Python scalar"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.item() # len > 1
|
||||
|
||||
assert obj.ndim == 1
|
||||
assert obj.size == len(obj)
|
||||
|
||||
assert Index([1]).item() == 1
|
||||
assert Series([1]).item() == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
PYPY or using_pyarrow_string_dtype(),
|
||||
reason="not relevant for PyPy doesn't work properly for arrow strings",
|
||||
)
|
||||
def test_memory_usage(index_or_series_memory_obj):
|
||||
obj = index_or_series_memory_obj
|
||||
# Clear index caches so that len(obj) == 0 report 0 memory usage
|
||||
if isinstance(obj, Series):
|
||||
is_ser = True
|
||||
obj.index._engine.clear_mapping()
|
||||
else:
|
||||
is_ser = False
|
||||
obj._engine.clear_mapping()
|
||||
|
||||
res = obj.memory_usage()
|
||||
res_deep = obj.memory_usage(deep=True)
|
||||
|
||||
is_object = is_object_dtype(obj) or (is_ser and is_object_dtype(obj.index))
|
||||
is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or (
|
||||
is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype)
|
||||
)
|
||||
is_object_string = is_dtype_equal(obj, "string[python]") or (
|
||||
is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
|
||||
)
|
||||
|
||||
if len(obj) == 0:
|
||||
expected = 0
|
||||
assert res_deep == res == expected
|
||||
elif is_object or is_categorical or is_object_string:
|
||||
# only deep will pick them up
|
||||
assert res_deep > res
|
||||
else:
|
||||
assert res == res_deep
|
||||
|
||||
# sys.getsizeof will call the .memory_usage with
|
||||
# deep=True, and add on some GC overhead
|
||||
diff = res_deep - sys.getsizeof(obj)
|
||||
assert abs(diff) < 100
|
||||
|
||||
|
||||
def test_memory_usage_components_series(series_with_simple_index):
|
||||
series = series_with_simple_index
|
||||
total_usage = series.memory_usage(index=True)
|
||||
non_index_usage = series.memory_usage(index=False)
|
||||
index_usage = series.index.memory_usage()
|
||||
assert total_usage == non_index_usage + index_usage
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES)
|
||||
def test_memory_usage_components_narrow_series(dtype):
|
||||
series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a")
|
||||
total_usage = series.memory_usage(index=True)
|
||||
non_index_usage = series.memory_usage(index=False)
|
||||
index_usage = series.index.memory_usage()
|
||||
assert total_usage == non_index_usage + index_usage
|
||||
|
||||
|
||||
def test_searchsorted(request, index_or_series_obj):
|
||||
# numpy.searchsorted calls obj.searchsorted under the hood.
|
||||
# See gh-12238
|
||||
obj = index_or_series_obj
|
||||
|
||||
if isinstance(obj, pd.MultiIndex):
|
||||
# See gh-14833
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833"
|
||||
)
|
||||
)
|
||||
elif obj.dtype.kind == "c" and isinstance(obj, Index):
|
||||
# TODO: Should Series cases also raise? Looks like they use numpy
|
||||
# comparison semantics https://github.com/numpy/numpy/issues/15981
|
||||
mark = pytest.mark.xfail(reason="complex objects are not comparable")
|
||||
request.applymarker(mark)
|
||||
|
||||
max_obj = max(obj, default=0)
|
||||
index = np.searchsorted(obj, max_obj)
|
||||
assert 0 <= index <= len(obj)
|
||||
|
||||
index = np.searchsorted(obj, max_obj, sorter=range(len(obj)))
|
||||
assert 0 <= index <= len(obj)
|
||||
|
||||
|
||||
def test_access_by_position(index_flat):
    """Positional access agrees between an Index and a Series built from it.

    Also checks that indexing one past the end raises IndexError on both.
    """
    index = index_flat

    if len(index) == 0:
        pytest.skip("Test doesn't make sense on empty data")

    series = Series(index)
    for position in (0, 5, -1):
        assert index[position] == series.iloc[position]

    size = len(index)
    assert index[-1] == index[size - 1]

    # The two pyarrow-backed string dtypes report a shorter message.
    pyarrow_names = ("string[pyarrow]", "string[pyarrow_numpy]")
    if any(is_dtype_equal(index.dtype, name) for name in pyarrow_names):
        msg = "index out of bounds"
    else:
        msg = f"index {size} is out of bounds for axis 0 with size {size}"
    with pytest.raises(IndexError, match=msg):
        index[size]

    with pytest.raises(
        IndexError, match="single positional indexer is out-of-bounds"
    ):
        series.iloc[size]
|
@ -0,0 +1,56 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_transpose(index_or_series_obj):
    """transpose() on the object compares equal to the object itself."""
    original = index_or_series_obj
    transposed = original.transpose()
    tm.assert_equal(transposed, original)
|
||||
|
||||
|
||||
def test_transpose_non_default_axes(index_or_series_obj):
    """Passing axes to transpose(), positionally or by keyword, raises."""
    obj = index_or_series_obj
    msg = "the 'axes' parameter is not supported"

    # positional form
    with pytest.raises(ValueError, match=msg):
        obj.transpose(1)

    # keyword form
    with pytest.raises(ValueError, match=msg):
        obj.transpose(axes=1)
|
||||
|
||||
|
||||
def test_numpy_transpose(index_or_series_obj):
    """np.transpose returns an equal object and rejects an axes argument."""
    obj = index_or_series_obj

    transposed = np.transpose(obj)
    tm.assert_equal(transposed, obj)

    with pytest.raises(
        ValueError, match="the 'axes' parameter is not supported"
    ):
        np.transpose(obj, axes=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    """DataFrame.T on frames with repeated labels swaps axes and keeps dtype."""
    # GH 42380
    frame = DataFrame(data, index=index, columns=columns, dtype=dtype)
    expected = DataFrame(
        transposed_data, index=columns, columns=index, dtype=dtype
    )
    tm.assert_frame_equal(frame.T, expected)
|
124
lib/python3.13/site-packages/pandas/tests/base/test_unique.py
Normal file
124
lib/python3.13/site-packages/pandas/tests/base/test_unique.py
Normal file
@ -0,0 +1,124 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_unique(index_or_series_obj):
    """unique() returns first-seen-order distinct values of a repeated object."""
    # The i-th element is repeated i + 1 times.
    repeated = np.repeat(
        index_or_series_obj, range(1, len(index_or_series_obj) + 1)
    )
    result = repeated.unique()

    # dict.fromkeys preserves the order
    seen = list(dict.fromkeys(repeated.values))

    if not isinstance(repeated, pd.Index):
        tm.assert_numpy_array_equal(result, np.array(seen))
        return

    if isinstance(repeated, pd.MultiIndex):
        expected = pd.MultiIndex.from_tuples(seen)
        expected.names = repeated.names
    else:
        expected = pd.Index(seen, dtype=repeated.dtype)
        if isinstance(repeated.dtype, pd.DatetimeTZDtype):
            expected = expected.normalize()
    tm.assert_index_equal(result, expected, exact=True)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(null_obj, index_or_series_obj):
    """unique() lists an injected null once, ahead of the non-null values."""
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    if isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    # Null out the first two entries, then repeat the i-th entry i + 1 times.
    raw = obj._values
    raw[0:2] = null_obj
    obj = type(obj)(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)
    result = obj.unique()

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    non_null = [val for val in dict.fromkeys(obj.values) if not pd.isnull(val)]
    unique_values = [null_obj, *non_null]

    if not isinstance(obj, pd.Index):
        expected = np.array(unique_values, dtype=obj.dtype)
        tm.assert_numpy_array_equal(result, expected)
        return

    expected = pd.Index(unique_values, dtype=obj.dtype)
    if isinstance(obj.dtype, pd.DatetimeTZDtype):
        result = result.normalize()
        expected = expected.normalize()
    tm.assert_index_equal(result, expected, exact=True)
|
||||
|
||||
|
||||
def test_nunique(index_or_series_obj):
    """nunique(dropna=False) equals the length of unique() on repeated data."""
    repeated = np.repeat(
        index_or_series_obj, range(1, len(index_or_series_obj) + 1)
    )
    distinct_count = len(repeated.unique())
    assert repeated.nunique(dropna=False) == distinct_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(null_obj, index_or_series_obj):
    """nunique() counts an injected null only when dropna=False."""
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    # Null out the first two entries, then repeat the i-th entry i + 1 times.
    raw = obj._values
    raw[0:2] = null_obj
    obj = type(obj)(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        n_categories = len(obj.categories)
        expected_dropped = n_categories
        expected_kept = n_categories + 1
    else:
        n_unique = len(obj.unique())
        expected_dropped = max(0, n_unique - 1)
        expected_kept = max(0, n_unique)

    assert obj.nunique() == expected_dropped
    assert obj.nunique(dropna=False) == expected_kept
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
    """unique() handles a string holding an unpaired surrogate code point."""
    # regression test for #34550
    uval = "\ud83d"  # lone high surrogate (first half of an emoji's UTF-16 pair)

    obj = index_or_series([uval] * 2)
    result = obj.unique()

    if isinstance(obj, pd.Index):
        tm.assert_index_equal(
            result, pd.Index([uval], dtype=object), exact=True
        )
    else:
        tm.assert_numpy_array_equal(result, np.array([uval], dtype=object))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [True, False])
def test_nunique_dropna(dropna):
    """nunique() on mixed null sentinels: 1 with dropna, 5 without.

    The Series holds "yes" twice plus pd.NA, np.nan, None and pd.NaT.
    """
    # GH37566
    ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
    res = ser.nunique(dropna)
    # Pick the expected count first. Written as
    # ``assert res == 1 if dropna else 5`` the conditional expression binds
    # looser than ``==``, so the dropna=False case would assert the constant 5
    # and could never fail.
    expected = 1 if dropna else 5
    assert res == expected
|
@ -0,0 +1,356 @@
|
||||
import collections
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timedelta,
|
||||
TimedeltaIndex,
|
||||
array,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts(index_or_series_obj):
    """value_counts() agrees with a collections.Counter tally of the data."""
    # The i-th element is repeated i + 1 times.
    obj = np.repeat(index_or_series_obj, range(1, len(index_or_series_obj) + 1))
    result = obj.value_counts()

    tally = dict(collections.Counter(obj).most_common())
    expected = Series(tally, dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # A float16 index cannot be built, so only the error is checked.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return

    if isinstance(expected.index, MultiIndex):
        expected.index.names = obj.names
    else:
        expected.index.name = obj.name

    if not isinstance(result.dtype, np.dtype):
        uses_pyarrow = getattr(obj.dtype, "storage", "") == "pyarrow"
        # the non-pyarrow case is i.e IntegerDtype
        expected = expected.astype("int64[pyarrow]" if uses_pyarrow else "Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    #  on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts_null(null_obj, index_or_series_obj):
    """value_counts() drops injected nulls by default, counts them on request."""
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    if isinstance(orig, MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    # Null out the first two entries, then repeat the i-th entry i + 1 times.
    raw = obj._values
    raw[0:2] = null_obj
    obj = type(obj)(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    tally = dict(collections.Counter(obj.dropna()).most_common())
    expected = Series(tally, dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # A float16 index cannot be built, so only the error is checked.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return
    expected.index.name = obj.name

    # TODO(GH#32514):
    # Order of entries with the same count is inconsistent on CI (gh-32449)
    needs_sort = obj.duplicated().any()

    result = obj.value_counts()
    if needs_sort:
        expected = expected.sort_index()
        result = result.sort_index()

    if not isinstance(result.dtype, np.dtype):
        uses_pyarrow = getattr(obj.dtype, "storage", "") == "pyarrow"
        # the non-pyarrow case is i.e IntegerDtype
        expected = expected.astype("int64[pyarrow]" if uses_pyarrow else "Int64")
    tm.assert_series_equal(result, expected)

    # Add the null bucket: one copy from position 0 plus two from position 1.
    expected[null_obj] = 3

    result = obj.value_counts(dropna=False)
    if needs_sort:
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_value_counts_inferred(index_or_series, using_infer_string):
    """value_counts/unique/nunique on string data, with the sort options."""
    s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = index_or_series(s_values)

    by_count = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
    tm.assert_series_equal(s.value_counts(), by_count)

    sorted_uniques = np.unique(np.array(s_values, dtype=np.object_))
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index(sorted_uniques))
    else:
        exp = array(sorted_uniques) if using_infer_string else sorted_uniques
        tm.assert_equal(s.unique(), exp)

    assert s.nunique() == 4

    # don't sort, have to sort after the fact as not sorting is
    # platform-dep
    unsorted = s.value_counts(sort=False).sort_values()
    expected = Series(
        [3, 1, 4, 2], index=["a", "c", "b", "d"], name="count"
    ).sort_values()
    tm.assert_series_equal(unsorted, expected)

    # sort ascending
    ascending = s.value_counts(ascending=True)
    expected = Series([1, 2, 3, 4], index=["c", "d", "a", "b"], name="count")
    tm.assert_series_equal(ascending, expected)

    # relative histogram.
    proportions = s.value_counts(normalize=True)
    expected = Series(
        [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
    )
    tm.assert_series_equal(proportions, expected)
|
||||
|
||||
|
||||
def test_value_counts_bins(index_or_series, using_infer_string):
    """Exercise value_counts(bins=...) plus NA and empty-input handling.

    Covers four things in order: bins rejected on string data, binned counts
    on a numeric Series, string data containing NaN, and an empty object.
    """
    klass = index_or_series
    s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(s_values)

    # bins
    msg = "bins argument only works with numeric data"
    with pytest.raises(TypeError, match=msg):
        s.value_counts(bins=1)

    s1 = Series([1, 1, 2, 3])
    res1 = s1.value_counts(bins=1)
    exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
    tm.assert_series_equal(res1, exp1)
    res1n = s1.value_counts(bins=1, normalize=True)
    exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
    tm.assert_series_equal(res1n, exp1n)

    # s1 is always a Series, so only the ndarray comparison applies here;
    # the former ``isinstance(s1, Index)`` branch could never run.
    exp = np.array([1, 2, 3], dtype=np.int64)
    tm.assert_numpy_array_equal(s1.unique(), exp)

    assert s1.nunique() == 3

    # these return the same
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    # take() reorders the bins to the expected descending-count order
    ordered_bins = intervals.take([0, 1, 3, 2])

    res4 = s1.value_counts(bins=4, dropna=True)
    exp4 = Series([2, 1, 1, 0], index=ordered_bins, name="count")
    tm.assert_series_equal(res4, exp4)

    res4 = s1.value_counts(bins=4, dropna=False)
    exp4 = Series([2, 1, 1, 0], index=ordered_bins, name="count")
    tm.assert_series_equal(res4, exp4)

    res4n = s1.value_counts(bins=4, normalize=True)
    exp4n = Series([0.5, 0.25, 0.25, 0], index=ordered_bins, name="proportion")
    tm.assert_series_equal(res4n, exp4n)

    # handle NA's properly
    s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
    s = klass(s_values)
    expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
    tm.assert_series_equal(s.value_counts(), expected)

    if isinstance(s, Index):
        exp = Index(["a", "b", np.nan, "d"])
        tm.assert_index_equal(s.unique(), exp)
    else:
        exp = np.array(["a", "b", np.nan, "d"], dtype=object)
        if using_infer_string:
            exp = array(exp)
        tm.assert_equal(s.unique(), exp)
    assert s.nunique() == 3

    # NOTE(review): the ``klass is dict`` arm looks unreachable if the fixture
    # only yields Index/Series -- confirm against conftest before removing.
    s = klass({}) if klass is dict else klass({}, dtype=object)
    expected = Series([], dtype=np.int64, name="count")
    tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
    # returned dtype differs depending on original
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index([]), exact=False)
    else:
        tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)

    assert s.nunique() == 0
|
||||
|
||||
|
||||
def test_value_counts_datetime64(index_or_series, unit):
    """value_counts/unique/nunique on datetime64 data, with and without NaT."""
    klass = index_or_series
    unit_dtype = f"datetime64[{unit}]"

    # GH 3002, datetime64[ns]
    # don't test names though
    stamps = pd.to_datetime(
        [
            "2010-01-01",
            "2010-01-01",
            "2010-01-01",
            "2009-01-01",
            "2008-09-09",
            "2008-09-09",
        ]
    ).as_unit(unit)
    df = pd.DataFrame(
        {
            "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
            "dt": stamps,
            "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
        }
    )

    s = klass(df["dt"].copy())
    s.name = None

    count_index = pd.to_datetime(
        ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
    ).as_unit(unit)
    expected_s = Series([3, 2, 1], index=count_index, name="count")
    tm.assert_series_equal(s.value_counts(), expected_s)

    first_seen = np.array(
        ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
        dtype=unit_dtype,
    )
    expected = array(first_seen)
    result = s.unique()
    if isinstance(s, Index):
        tm.assert_index_equal(result, DatetimeIndex(expected))
    else:
        tm.assert_extension_array_equal(result, expected)

    assert s.nunique() == 3

    # with NaT
    with_nat = list(df["dt"].copy().values) + [pd.NaT] * 4
    s = klass(with_nat)
    # A Series converts units through the .dt accessor, an Index directly.
    s = s.dt.as_unit(unit) if klass is Series else s.as_unit(unit)

    result = s.value_counts()
    assert result.index.dtype == unit_dtype
    tm.assert_series_equal(result, expected_s)

    result = s.value_counts(dropna=False)
    nat_row = Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count")
    expected_s = pd.concat([nat_row, expected_s])
    tm.assert_series_equal(result, expected_s)

    assert s.dtype == unit_dtype
    unique = s.unique()
    assert unique.dtype == unit_dtype

    # numpy_array_equal cannot compare pd.NaT
    if isinstance(s, Index):
        exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
        tm.assert_index_equal(unique, exp_idx)
    else:
        tm.assert_extension_array_equal(unique[:3], expected)
        assert pd.isna(unique[3])

    assert s.nunique() == 3
    assert s.nunique(dropna=False) == 4
|
||||
|
||||
|
||||
def test_value_counts_timedelta64(index_or_series, unit):
    """value_counts/unique on six copies of a one-day timedelta."""
    # timedelta64[ns]
    klass = index_or_series
    zeros_dtype = f"m8[{unit}]"

    day = Timedelta(timedelta(1)).as_unit(unit)
    tdi = TimedeltaIndex([day], name="dt").as_unit(unit)
    expected_s = Series([6], index=tdi, name="count")

    # zeros + day
    td = klass(np.zeros(6, dtype=zeros_dtype) + day, name="dt")
    tm.assert_series_equal(td.value_counts(), expected_s)

    result = td.unique()
    if isinstance(td, Index):
        tm.assert_index_equal(result, tdi)
    else:
        tm.assert_extension_array_equal(result, tdi._values)

    # day + zeros (operands swapped)
    td2 = klass(day + np.zeros(6, dtype=zeros_dtype), name="dt")
    tm.assert_series_equal(td2.value_counts(), expected_s)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
    """value_counts on [True, pd.NA, np.nan] for both dropna settings."""
    # GH31944
    raw = [True, pd.NA, np.nan]
    obj = index_or_series(raw)
    res = obj.value_counts(dropna=dropna)

    if dropna is True:
        kept = Index([True], dtype=obj.dtype)
        expected = Series([1], index=kept, name="count")
    else:
        expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
    tm.assert_series_equal(res, expected)
|
||||
|
||||
|
||||
def test_value_counts_object_inference_deprecated():
    """value_counts on an object-dtype Index warns and matches the typed result."""
    # GH#56161
    dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
    as_object = dti.astype(object)

    with tm.assert_produces_warning(
        FutureWarning,
        match="The behavior of value_counts with object-dtype is deprecated",
    ):
        res = as_object.value_counts()

    tm.assert_series_equal(res, dti.value_counts())
|
Reference in New Issue
Block a user