Updated script that can be controled by Nodejs web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,9 @@
from typing import Any
from pandas import Index
def allow_na_ops(obj: Any) -> bool:
"""Whether to skip test cases including NaN"""
is_bool_index = isinstance(obj, Index) and obj.inferred_type == "boolean"
return not is_bool_index and obj._can_hold_na

View File

@ -0,0 +1,179 @@
from datetime import datetime
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.core.accessor import PandasDelegate
from pandas.core.base import (
NoNewAttributesMixin,
PandasObject,
)
def series_via_frame_from_dict(x, **kwargs):
return DataFrame({"a": x}, **kwargs)["a"]
def series_via_frame_from_scalar(x, **kwargs):
return DataFrame(x, **kwargs)[0]
@pytest.fixture(
params=[
Series,
series_via_frame_from_dict,
series_via_frame_from_scalar,
Index,
],
ids=["Series", "DataFrame-dict", "DataFrame-array", "Index"],
)
def constructor(request):
return request.param
class TestPandasDelegate:
class Delegator:
_properties = ["prop"]
_methods = ["test_method"]
def _set_prop(self, value):
self.prop = value
def _get_prop(self):
return self.prop
prop = property(_get_prop, _set_prop, doc="foo property")
def test_method(self, *args, **kwargs):
"""a test method"""
class Delegate(PandasDelegate, PandasObject):
def __init__(self, obj) -> None:
self.obj = obj
def test_invalid_delegation(self):
# these show that in order for the delegation to work
# the _delegate_* methods need to be overridden to not raise
# a TypeError
self.Delegate._add_delegate_accessors(
delegate=self.Delegator,
accessors=self.Delegator._properties,
typ="property",
)
self.Delegate._add_delegate_accessors(
delegate=self.Delegator, accessors=self.Delegator._methods, typ="method"
)
delegate = self.Delegate(self.Delegator())
msg = "You cannot access the property prop"
with pytest.raises(TypeError, match=msg):
delegate.prop
msg = "The property prop cannot be set"
with pytest.raises(TypeError, match=msg):
delegate.prop = 5
msg = "You cannot access the property prop"
with pytest.raises(TypeError, match=msg):
delegate.prop
@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(self):
# Delegate does not implement memory_usage.
# Check that we fall back to in-built `__sizeof__`
# GH 12924
delegate = self.Delegate(self.Delegator())
sys.getsizeof(delegate)
class TestNoNewAttributesMixin:
def test_mixin(self):
class T(NoNewAttributesMixin):
pass
t = T()
assert not hasattr(t, "__frozen")
t.a = "test"
assert t.a == "test"
t._freeze()
assert "__frozen" in dir(t)
assert getattr(t, "__frozen")
msg = "You cannot add any new attribute"
with pytest.raises(AttributeError, match=msg):
t.b = "test"
assert not hasattr(t, "b")
class TestConstruction:
# test certain constructor behaviours on dtype inference across Series,
# Index and DataFrame
@pytest.mark.parametrize(
"a",
[
np.array(["2263-01-01"], dtype="datetime64[D]"),
np.array([datetime(2263, 1, 1)], dtype=object),
np.array([np.datetime64("2263-01-01", "D")], dtype=object),
np.array(["2263-01-01"], dtype=object),
],
ids=[
"datetime64[D]",
"object-datetime.datetime",
"object-numpy-scalar",
"object-string",
],
)
def test_constructor_datetime_outofbound(
self, a, constructor, request, using_infer_string
):
# GH-26853 (+ bug GH-26206 out of bound non-ns unit)
# No dtype specified (dtype inference)
# datetime64[non-ns] raise error, other cases result in object dtype
# and preserve original data
if a.dtype.kind == "M":
# Can't fit in nanosecond bounds -> get the nearest supported unit
result = constructor(a)
assert result.dtype == "M8[s]"
else:
result = constructor(a)
if using_infer_string and "object-string" in request.node.callspec.id:
assert result.dtype == "string"
else:
assert result.dtype == "object"
tm.assert_numpy_array_equal(result.to_numpy(), a)
# Explicit dtype specified
# Forced conversion fails for all -> all cases raise error
msg = "Out of bounds|Out of bounds .* present at position 0"
with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
constructor(a, dtype="datetime64[ns]")
def test_constructor_datetime_nonns(self, constructor):
arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype)
expected = constructor(dta)
assert expected.dtype == arr.dtype
result = constructor(arr)
tm.assert_equal(result, expected)
# https://github.com/pandas-dev/pandas/issues/34843
arr.flags.writeable = False
result = constructor(arr)
tm.assert_equal(result, expected)

View File

@ -0,0 +1,562 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import (
CategoricalIndex,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
IntervalArray,
NumpyExtensionArray,
PeriodArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
class TestToIterable:
# test that we convert an iterable to python types
dtypes = [
("int8", int),
("int16", int),
("int32", int),
("int64", int),
("uint8", int),
("uint16", int),
("uint32", int),
("uint64", int),
("float16", float),
("float32", float),
("float64", float),
("datetime64[ns]", Timestamp),
("datetime64[ns, US/Eastern]", Timestamp),
("timedelta64[ns]", Timedelta),
]
@pytest.mark.parametrize("dtype, rdtype", dtypes)
@pytest.mark.parametrize(
"method",
[
lambda x: x.tolist(),
lambda x: x.to_list(),
lambda x: list(x),
lambda x: list(x.__iter__()),
],
ids=["tolist", "to_list", "list", "iter"],
)
def test_iterable(self, index_or_series, method, dtype, rdtype):
# gh-10904
# gh-13258
# coerce iteration to underlying python / pandas types
typ = index_or_series
if dtype == "float16" and issubclass(typ, pd.Index):
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
typ([1], dtype=dtype)
return
s = typ([1], dtype=dtype)
result = method(s)[0]
assert isinstance(result, rdtype)
@pytest.mark.parametrize(
"dtype, rdtype, obj",
[
("object", object, "a"),
("object", int, 1),
("category", object, "a"),
("category", int, 1),
],
)
@pytest.mark.parametrize(
"method",
[
lambda x: x.tolist(),
lambda x: x.to_list(),
lambda x: list(x),
lambda x: list(x.__iter__()),
],
ids=["tolist", "to_list", "list", "iter"],
)
def test_iterable_object_and_category(
self, index_or_series, method, dtype, rdtype, obj
):
# gh-10904
# gh-13258
# coerce iteration to underlying python / pandas types
typ = index_or_series
s = typ([obj], dtype=dtype)
result = method(s)[0]
assert isinstance(result, rdtype)
@pytest.mark.parametrize("dtype, rdtype", dtypes)
def test_iterable_items(self, dtype, rdtype):
# gh-13258
# test if items yields the correct boxed scalars
# this only applies to series
s = Series([1], dtype=dtype)
_, result = next(iter(s.items()))
assert isinstance(result, rdtype)
_, result = next(iter(s.items()))
assert isinstance(result, rdtype)
@pytest.mark.parametrize(
"dtype, rdtype", dtypes + [("object", int), ("category", int)]
)
def test_iterable_map(self, index_or_series, dtype, rdtype):
# gh-13236
# coerce iteration to underlying python / pandas types
typ = index_or_series
if dtype == "float16" and issubclass(typ, pd.Index):
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
typ([1], dtype=dtype)
return
s = typ([1], dtype=dtype)
result = s.map(type)[0]
if not isinstance(rdtype, tuple):
rdtype = (rdtype,)
assert result in rdtype
@pytest.mark.parametrize(
"method",
[
lambda x: x.tolist(),
lambda x: x.to_list(),
lambda x: list(x),
lambda x: list(x.__iter__()),
],
ids=["tolist", "to_list", "list", "iter"],
)
def test_categorial_datetimelike(self, method):
i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])
result = method(i)[0]
assert isinstance(result, Timestamp)
def test_iter_box_dt64(self, unit):
vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")]
ser = Series(vals).dt.as_unit(unit)
assert ser.dtype == f"datetime64[{unit}]"
for res, exp in zip(ser, vals):
assert isinstance(res, Timestamp)
assert res.tz is None
assert res == exp
assert res.unit == unit
def test_iter_box_dt64tz(self, unit):
vals = [
Timestamp("2011-01-01", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
]
ser = Series(vals).dt.as_unit(unit)
assert ser.dtype == f"datetime64[{unit}, US/Eastern]"
for res, exp in zip(ser, vals):
assert isinstance(res, Timestamp)
assert res.tz == exp.tz
assert res == exp
assert res.unit == unit
def test_iter_box_timedelta64(self, unit):
# timedelta
vals = [Timedelta("1 days"), Timedelta("2 days")]
ser = Series(vals).dt.as_unit(unit)
assert ser.dtype == f"timedelta64[{unit}]"
for res, exp in zip(ser, vals):
assert isinstance(res, Timedelta)
assert res == exp
assert res.unit == unit
def test_iter_box_period(self):
# period
vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
s = Series(vals)
assert s.dtype == "Period[M]"
for res, exp in zip(s, vals):
assert isinstance(res, pd.Period)
assert res.freq == "ME"
assert res == exp
@pytest.mark.parametrize(
"arr, expected_type, dtype",
[
(np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
(np.array(["a", "b"]), np.ndarray, "object"),
(pd.Categorical(["a", "b"]), pd.Categorical, "category"),
(
pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
DatetimeArray,
"datetime64[ns, US/Central]",
),
(
pd.PeriodIndex([2018, 2019], freq="Y"),
PeriodArray,
pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"),
),
(pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
(
pd.DatetimeIndex(["2017", "2018"]),
DatetimeArray,
"datetime64[ns]",
),
(
pd.TimedeltaIndex([10**10]),
TimedeltaArray,
"m8[ns]",
),
],
)
def test_values_consistent(arr, expected_type, dtype, using_infer_string):
if using_infer_string and dtype == "object":
expected_type = ArrowStringArrayNumpySemantics
l_values = Series(arr)._values
r_values = pd.Index(arr)._values
assert type(l_values) is expected_type
assert type(l_values) is type(r_values)
tm.assert_equal(l_values, r_values)
@pytest.mark.parametrize("arr", [np.array([1, 2, 3])])
def test_numpy_array(arr):
ser = Series(arr)
result = ser.array
expected = NumpyExtensionArray(arr)
tm.assert_extension_array_equal(result, expected)
def test_numpy_array_all_dtypes(any_numpy_dtype):
ser = Series(dtype=any_numpy_dtype)
result = ser.array
if np.dtype(any_numpy_dtype).kind == "M":
assert isinstance(result, DatetimeArray)
elif np.dtype(any_numpy_dtype).kind == "m":
assert isinstance(result, TimedeltaArray)
else:
assert isinstance(result, NumpyExtensionArray)
@pytest.mark.parametrize(
"arr, attr",
[
(pd.Categorical(["a", "b"]), "_codes"),
(PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]"), "_ndarray"),
(pd.array([0, np.nan], dtype="Int64"), "_data"),
(IntervalArray.from_breaks([0, 1]), "_left"),
(SparseArray([0, 1]), "_sparse_values"),
(
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
"_ndarray",
),
# tz-aware Datetime
(
DatetimeArray._from_sequence(
np.array(
["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"
),
dtype=DatetimeTZDtype(tz="US/Central"),
),
"_ndarray",
),
],
)
def test_array(arr, attr, index_or_series, request):
box = index_or_series
result = box(arr, copy=False).array
if attr:
arr = getattr(arr, attr)
result = getattr(result, attr)
assert result is arr
def test_array_multiindex_raises():
idx = pd.MultiIndex.from_product([["A"], ["a", "b"]])
msg = "MultiIndex has no single backing array"
with pytest.raises(ValueError, match=msg):
idx.array
@pytest.mark.parametrize(
"arr, expected",
[
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
(
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
(
IntervalArray.from_breaks([0, 1, 2]),
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
),
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
# tz-naive datetime
(
DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")),
np.array(["2000", "2001"], dtype="M8[ns]"),
),
# tz-aware stays tz`-aware
(
DatetimeArray._from_sequence(
np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]")
)
.tz_localize("UTC")
.tz_convert("US/Central"),
np.array(
[
Timestamp("2000-01-01", tz="US/Central"),
Timestamp("2000-01-02", tz="US/Central"),
]
),
),
# Timedelta
(
TimedeltaArray._from_sequence(
np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
),
np.array([0, 3600000000000], dtype="m8[ns]"),
),
# GH#26406 tz is preserved in Categorical[dt64tz]
(
pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
np.array(
[
Timestamp("2016-01-01", tz="US/Pacific"),
Timestamp("2016-01-02", tz="US/Pacific"),
]
),
),
],
)
def test_to_numpy(arr, expected, index_or_series_or_array, request):
box = index_or_series_or_array
with tm.assert_produces_warning(None):
thing = box(arr)
result = thing.to_numpy()
tm.assert_numpy_array_equal(result, expected)
result = np.asarray(thing)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
)
def test_to_numpy_copy(arr, as_series, using_infer_string):
obj = pd.Index(arr, copy=False)
if as_series:
obj = Series(obj.values, copy=False)
# no copy by default
result = obj.to_numpy()
if using_infer_string and arr.dtype == object:
assert np.shares_memory(arr, result) is False
else:
assert np.shares_memory(arr, result) is True
result = obj.to_numpy(copy=False)
if using_infer_string and arr.dtype == object:
assert np.shares_memory(arr, result) is False
else:
assert np.shares_memory(arr, result) is True
# copy=True
result = obj.to_numpy(copy=True)
assert np.shares_memory(arr, result) is False
@pytest.mark.parametrize("as_series", [True, False])
def test_to_numpy_dtype(as_series, unit):
tz = "US/Eastern"
obj = pd.DatetimeIndex(["2000", "2001"], tz=tz)
if as_series:
obj = Series(obj)
# preserve tz by default
result = obj.to_numpy()
expected = np.array(
[Timestamp("2000", tz=tz), Timestamp("2001", tz=tz)], dtype=object
)
tm.assert_numpy_array_equal(result, expected)
result = obj.to_numpy(dtype="object")
tm.assert_numpy_array_equal(result, expected)
result = obj.to_numpy(dtype="M8[ns]")
expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"values, dtype, na_value, expected",
[
([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
(
[Timestamp("2000"), Timestamp("2000"), pd.NaT],
None,
Timestamp("2000"),
[np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
),
],
)
def test_to_numpy_na_value_numpy_dtype(
index_or_series, values, dtype, na_value, expected
):
obj = index_or_series(values)
result = obj.to_numpy(dtype=dtype, na_value=na_value)
expected = np.array(expected)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"data, multiindex, dtype, na_value, expected",
[
(
[1, 2, None, 4],
[(0, "a"), (0, "b"), (1, "b"), (1, "c")],
float,
None,
[1.0, 2.0, np.nan, 4.0],
),
(
[1, 2, None, 4],
[(0, "a"), (0, "b"), (1, "b"), (1, "c")],
float,
np.nan,
[1.0, 2.0, np.nan, 4.0],
),
(
[1.0, 2.0, np.nan, 4.0],
[("a", 0), ("a", 1), ("a", 2), ("b", 0)],
int,
0,
[1, 2, 0, 4],
),
(
[Timestamp("2000"), Timestamp("2000"), pd.NaT],
[(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))],
None,
Timestamp("2000"),
[np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
),
],
)
def test_to_numpy_multiindex_series_na_value(
data, multiindex, dtype, na_value, expected
):
index = pd.MultiIndex.from_tuples(multiindex)
series = Series(data, index=index)
result = series.to_numpy(dtype=dtype, na_value=na_value)
expected = np.array(expected)
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_kwargs_raises():
# numpy
s = Series([1, 2, 3])
msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'"
with pytest.raises(TypeError, match=msg):
s.to_numpy(foo=True)
# extension
s = Series([1, 2, 3], dtype="Int64")
with pytest.raises(TypeError, match=msg):
s.to_numpy(foo=True)
@pytest.mark.parametrize(
"data",
[
{"a": [1, 2, 3], "b": [1, 2, None]},
{"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
{"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
],
)
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
# https://github.com/pandas-dev/pandas/issues/33820
df = pd.DataFrame(data)
result = df.to_numpy(dtype=dtype, na_value=na_value)
expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"data, expected",
[
(
{"a": pd.array([1, 2, None])},
np.array([[1.0], [2.0], [np.nan]], dtype=float),
),
(
{"a": [1, 2, 3], "b": [1, 2, 3]},
np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
),
],
)
def test_to_numpy_dataframe_single_block(data, expected):
# https://github.com/pandas-dev/pandas/issues/33820
df = pd.DataFrame(data)
result = df.to_numpy(dtype=float, na_value=np.nan)
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_dataframe_single_block_no_mutate():
# https://github.com/pandas-dev/pandas/issues/33820
result = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
result.to_numpy(na_value=0.0)
tm.assert_frame_equal(result, expected)
class TestAsArray:
@pytest.mark.parametrize("tz", [None, "US/Central"])
def test_asarray_object_dt64(self, tz):
ser = Series(date_range("2000", periods=2, tz=tz))
with tm.assert_produces_warning(None):
# Future behavior (for tzaware case) with no warning
result = np.asarray(ser, dtype=object)
expected = np.array(
[Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)]
)
tm.assert_numpy_array_equal(result, expected)
def test_asarray_tz_naive(self):
# This shouldn't produce a warning.
ser = Series(date_range("2000", periods=2))
expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
result = np.asarray(ser)
tm.assert_numpy_array_equal(result, expected)
def test_asarray_tz_aware(self):
tz = "US/Central"
ser = Series(date_range("2000", periods=2, tz=tz))
expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]")
result = np.asarray(ser, dtype="datetime64[ns]")
tm.assert_numpy_array_equal(result, expected)
# Old behavior with no warning
result = np.asarray(ser, dtype="M8[ns]")
tm.assert_numpy_array_equal(result, expected)

View File

@ -0,0 +1,60 @@
"""
Though Index.fillna and Series.fillna has separate impl,
test here to confirm these works as the same
"""
import numpy as np
import pytest
from pandas import MultiIndex
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
def test_fillna(index_or_series_obj):
# GH 11343
obj = index_or_series_obj
if isinstance(obj, MultiIndex):
msg = "isna is not defined for MultiIndex"
with pytest.raises(NotImplementedError, match=msg):
obj.fillna(0)
return
# values will not be changed
fill_value = obj.values[0] if len(obj) > 0 else 0
result = obj.fillna(fill_value)
tm.assert_equal(obj, result)
# check shallow_copied
assert obj is not result
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_fillna_null(null_obj, index_or_series_obj):
# GH 11343
obj = index_or_series_obj
klass = type(obj)
if not allow_na_ops(obj):
pytest.skip(f"{klass} doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(obj, MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj._values
fill_value = values[0]
expected = values.copy()
values[0:2] = null_obj
expected[0:2] = fill_value
expected = klass(expected)
obj = klass(values)
result = obj.fillna(fill_value)
tm.assert_equal(result, expected)
# check shallow_copied
assert obj is not result

View File

@ -0,0 +1,191 @@
import sys
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas.compat import PYPY
from pandas.core.dtypes.common import (
is_dtype_equal,
is_object_dtype,
)
import pandas as pd
from pandas import (
Index,
Series,
)
import pandas._testing as tm
def test_isnull_notnull_docstrings():
# GH#41855 make sure its clear these are aliases
doc = pd.DataFrame.notnull.__doc__
assert doc.startswith("\nDataFrame.notnull is an alias for DataFrame.notna.\n")
doc = pd.DataFrame.isnull.__doc__
assert doc.startswith("\nDataFrame.isnull is an alias for DataFrame.isna.\n")
doc = Series.notnull.__doc__
assert doc.startswith("\nSeries.notnull is an alias for Series.notna.\n")
doc = Series.isnull.__doc__
assert doc.startswith("\nSeries.isnull is an alias for Series.isna.\n")
@pytest.mark.parametrize(
"op_name, op",
[
("add", "+"),
("sub", "-"),
("mul", "*"),
("mod", "%"),
("pow", "**"),
("truediv", "/"),
("floordiv", "//"),
],
)
def test_binary_ops_docstring(frame_or_series, op_name, op):
# not using the all_arithmetic_functions fixture with _get_opstr
# as _get_opstr is used internally in the dynamic implementation of the docstring
klass = frame_or_series
operand1 = klass.__name__.lower()
operand2 = "other"
expected_str = " ".join([operand1, op, operand2])
assert expected_str in getattr(klass, op_name).__doc__
# reverse version of the binary ops
expected_str = " ".join([operand2, op, operand1])
assert expected_str in getattr(klass, "r" + op_name).__doc__
def test_ndarray_compat_properties(index_or_series_obj):
obj = index_or_series_obj
# Check that we work.
for p in ["shape", "dtype", "T", "nbytes"]:
assert getattr(obj, p, None) is not None
# deprecated properties
for p in ["strides", "itemsize", "base", "data"]:
assert not hasattr(obj, p)
msg = "can only convert an array of size 1 to a Python scalar"
with pytest.raises(ValueError, match=msg):
obj.item() # len > 1
assert obj.ndim == 1
assert obj.size == len(obj)
assert Index([1]).item() == 1
assert Series([1]).item() == 1
@pytest.mark.skipif(
PYPY or using_pyarrow_string_dtype(),
reason="not relevant for PyPy doesn't work properly for arrow strings",
)
def test_memory_usage(index_or_series_memory_obj):
obj = index_or_series_memory_obj
# Clear index caches so that len(obj) == 0 report 0 memory usage
if isinstance(obj, Series):
is_ser = True
obj.index._engine.clear_mapping()
else:
is_ser = False
obj._engine.clear_mapping()
res = obj.memory_usage()
res_deep = obj.memory_usage(deep=True)
is_object = is_object_dtype(obj) or (is_ser and is_object_dtype(obj.index))
is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or (
is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype)
)
is_object_string = is_dtype_equal(obj, "string[python]") or (
is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
)
if len(obj) == 0:
expected = 0
assert res_deep == res == expected
elif is_object or is_categorical or is_object_string:
# only deep will pick them up
assert res_deep > res
else:
assert res == res_deep
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = res_deep - sys.getsizeof(obj)
assert abs(diff) < 100
def test_memory_usage_components_series(series_with_simple_index):
series = series_with_simple_index
total_usage = series.memory_usage(index=True)
non_index_usage = series.memory_usage(index=False)
index_usage = series.index.memory_usage()
assert total_usage == non_index_usage + index_usage
@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES)
def test_memory_usage_components_narrow_series(dtype):
series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a")
total_usage = series.memory_usage(index=True)
non_index_usage = series.memory_usage(index=False)
index_usage = series.index.memory_usage()
assert total_usage == non_index_usage + index_usage
def test_searchsorted(request, index_or_series_obj):
# numpy.searchsorted calls obj.searchsorted under the hood.
# See gh-12238
obj = index_or_series_obj
if isinstance(obj, pd.MultiIndex):
# See gh-14833
request.applymarker(
pytest.mark.xfail(
reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833"
)
)
elif obj.dtype.kind == "c" and isinstance(obj, Index):
# TODO: Should Series cases also raise? Looks like they use numpy
# comparison semantics https://github.com/numpy/numpy/issues/15981
mark = pytest.mark.xfail(reason="complex objects are not comparable")
request.applymarker(mark)
max_obj = max(obj, default=0)
index = np.searchsorted(obj, max_obj)
assert 0 <= index <= len(obj)
index = np.searchsorted(obj, max_obj, sorter=range(len(obj)))
assert 0 <= index <= len(obj)
def test_access_by_position(index_flat):
index = index_flat
if len(index) == 0:
pytest.skip("Test doesn't make sense on empty data")
series = Series(index)
assert index[0] == series.iloc[0]
assert index[5] == series.iloc[5]
assert index[-1] == series.iloc[-1]
size = len(index)
assert index[-1] == index[size - 1]
msg = f"index {size} is out of bounds for axis 0 with size {size}"
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
index.dtype, "string[pyarrow_numpy]"
):
msg = "index out of bounds"
with pytest.raises(IndexError, match=msg):
index[size]
msg = "single positional indexer is out-of-bounds"
with pytest.raises(IndexError, match=msg):
series.iloc[size]

View File

@ -0,0 +1,56 @@
import numpy as np
import pytest
from pandas import (
CategoricalDtype,
DataFrame,
)
import pandas._testing as tm
def test_transpose(index_or_series_obj):
obj = index_or_series_obj
tm.assert_equal(obj.transpose(), obj)
def test_transpose_non_default_axes(index_or_series_obj):
msg = "the 'axes' parameter is not supported"
obj = index_or_series_obj
with pytest.raises(ValueError, match=msg):
obj.transpose(1)
with pytest.raises(ValueError, match=msg):
obj.transpose(axes=1)
def test_numpy_transpose(index_or_series_obj):
msg = "the 'axes' parameter is not supported"
obj = index_or_series_obj
tm.assert_equal(np.transpose(obj), obj)
with pytest.raises(ValueError, match=msg):
np.transpose(obj, axes=1)
@pytest.mark.parametrize(
"data, transposed_data, index, columns, dtype",
[
([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
(
[[1, 2], [3, 4]],
[[1, 3], [2, 4]],
["a", "a"],
["b", "b"],
CategoricalDtype([1, 2, 3, 4]),
),
],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
# GH 42380
df = DataFrame(data, index=index, columns=columns, dtype=dtype)
result = df.T
expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,124 @@
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
import pandas as pd
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_unique(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.unique()
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected, exact=True)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if isinstance(obj.dtype, pd.DatetimeTZDtype):
expected = expected.normalize()
tm.assert_index_equal(result, expected, exact=True)
else:
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(null_obj, index_or_series_obj):
obj = index_or_series_obj
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj._values
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
result = obj.unique()
unique_values_raw = dict.fromkeys(obj.values)
# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
unique_values = [null_obj] + unique_values_not_null
if isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if isinstance(obj.dtype, pd.DatetimeTZDtype):
result = result.normalize()
expected = expected.normalize()
tm.assert_index_equal(result, expected, exact=True)
else:
expected = np.array(unique_values, dtype=obj.dtype)
tm.assert_numpy_array_equal(result, expected)
def test_nunique(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
expected = len(obj.unique())
assert obj.nunique(dropna=False) == expected
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(null_obj, index_or_series_obj):
obj = index_or_series_obj
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj._values
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
if isinstance(obj, pd.CategoricalIndex):
assert obj.nunique() == len(obj.categories)
assert obj.nunique(dropna=False) == len(obj.categories) + 1
else:
num_unique_values = len(obj.unique())
assert obj.nunique() == max(0, num_unique_values - 1)
assert obj.nunique(dropna=False) == max(0, num_unique_values)
@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
# regression test for #34550
uval = "\ud83d" # smiley emoji
obj = index_or_series([uval] * 2)
result = obj.unique()
if isinstance(obj, pd.Index):
expected = pd.Index(["\ud83d"], dtype=object)
tm.assert_index_equal(result, expected, exact=True)
else:
expected = np.array(["\ud83d"], dtype=object)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dropna", [True, False])
def test_nunique_dropna(dropna):
# GH37566
ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
res = ser.nunique(dropna)
assert res == 1 if dropna else 5

View File

@ -0,0 +1,356 @@
import collections
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import (
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
Series,
Timedelta,
TimedeltaIndex,
array,
)
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.value_counts()
counter = collections.Counter(obj)
expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
if obj.dtype != np.float16:
expected.index = expected.index.astype(obj.dtype)
else:
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
expected.index.astype(obj.dtype)
return
if isinstance(expected.index, MultiIndex):
expected.index.names = obj.names
else:
expected.index.name = obj.name
if not isinstance(result.dtype, np.dtype):
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
if obj.duplicated().any():
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts_null(null_obj, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(orig, MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj._values
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
counter = collections.Counter(obj.dropna())
expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
if obj.dtype != np.float16:
expected.index = expected.index.astype(obj.dtype)
else:
with pytest.raises(NotImplementedError, match="float16 indexes are not "):
expected.index.astype(obj.dtype)
return
expected.index.name = obj.name
result = obj.value_counts()
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
if not isinstance(result.dtype, np.dtype):
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)
expected[null_obj] = 3
result = obj.value_counts(dropna=False)
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
tm.assert_series_equal(result, expected)
def test_value_counts_inferred(index_or_series, using_infer_string):
klass = index_or_series
s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
s = klass(s_values)
expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
tm.assert_series_equal(s.value_counts(), expected)
if isinstance(s, Index):
exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
tm.assert_index_equal(s.unique(), exp)
else:
exp = np.unique(np.array(s_values, dtype=np.object_))
if using_infer_string:
exp = array(exp)
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 4
# don't sort, have to sort after the fact as not sorting is
# platform-dep
hist = s.value_counts(sort=False).sort_values()
expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values()
tm.assert_series_equal(hist, expected)
# sort ascending
hist = s.value_counts(ascending=True)
expected = Series([1, 2, 3, 4], index=list("cdab"), name="count")
tm.assert_series_equal(hist, expected)
# relative histogram.
hist = s.value_counts(normalize=True)
expected = Series(
[0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
)
tm.assert_series_equal(hist, expected)
def test_value_counts_bins(index_or_series, using_infer_string):
klass = index_or_series
s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
s = klass(s_values)
# bins
msg = "bins argument only works with numeric data"
with pytest.raises(TypeError, match=msg):
s.value_counts(bins=1)
s1 = Series([1, 1, 2, 3])
res1 = s1.value_counts(bins=1)
exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
tm.assert_series_equal(res1, exp1)
res1n = s1.value_counts(bins=1, normalize=True)
exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
tm.assert_series_equal(res1n, exp1n)
if isinstance(s1, Index):
tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
else:
exp = np.array([1, 2, 3], dtype=np.int64)
tm.assert_numpy_array_equal(s1.unique(), exp)
assert s1.nunique() == 3
# these return the same
res4 = s1.value_counts(bins=4, dropna=True)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
tm.assert_series_equal(res4, exp4)
res4 = s1.value_counts(bins=4, dropna=False)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
tm.assert_series_equal(res4, exp4)
res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series(
[0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
)
tm.assert_series_equal(res4n, exp4n)
# handle NA's properly
s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
s = klass(s_values)
expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
tm.assert_series_equal(s.value_counts(), expected)
if isinstance(s, Index):
exp = Index(["a", "b", np.nan, "d"])
tm.assert_index_equal(s.unique(), exp)
else:
exp = np.array(["a", "b", np.nan, "d"], dtype=object)
if using_infer_string:
exp = array(exp)
tm.assert_equal(s.unique(), exp)
assert s.nunique() == 3
s = klass({}) if klass is dict else klass({}, dtype=object)
expected = Series([], dtype=np.int64, name="count")
tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
# returned dtype differs depending on original
if isinstance(s, Index):
tm.assert_index_equal(s.unique(), Index([]), exact=False)
else:
tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)
assert s.nunique() == 0
def test_value_counts_datetime64(index_or_series, unit):
klass = index_or_series
# GH 3002, datetime64[ns]
# don't test names though
df = pd.DataFrame(
{
"person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
"dt": pd.to_datetime(
[
"2010-01-01",
"2010-01-01",
"2010-01-01",
"2009-01-01",
"2008-09-09",
"2008-09-09",
]
).as_unit(unit),
"food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
}
)
s = klass(df["dt"].copy())
s.name = None
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
).as_unit(unit)
expected_s = Series([3, 2, 1], index=idx, name="count")
tm.assert_series_equal(s.value_counts(), expected_s)
expected = array(
np.array(
["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
dtype=f"datetime64[{unit}]",
)
)
result = s.unique()
if isinstance(s, Index):
tm.assert_index_equal(result, DatetimeIndex(expected))
else:
tm.assert_extension_array_equal(result, expected)
assert s.nunique() == 3
# with NaT
s = df["dt"].copy()
s = klass(list(s.values) + [pd.NaT] * 4)
if klass is Series:
s = s.dt.as_unit(unit)
else:
s = s.as_unit(unit)
result = s.value_counts()
assert result.index.dtype == f"datetime64[{unit}]"
tm.assert_series_equal(result, expected_s)
result = s.value_counts(dropna=False)
expected_s = pd.concat(
[
Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"),
expected_s,
]
)
tm.assert_series_equal(result, expected_s)
assert s.dtype == f"datetime64[{unit}]"
unique = s.unique()
assert unique.dtype == f"datetime64[{unit}]"
# numpy_array_equal cannot compare pd.NaT
if isinstance(s, Index):
exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
tm.assert_index_equal(unique, exp_idx)
else:
tm.assert_extension_array_equal(unique[:3], expected)
assert pd.isna(unique[3])
assert s.nunique() == 3
assert s.nunique(dropna=False) == 4
def test_value_counts_timedelta64(index_or_series, unit):
# timedelta64[ns]
klass = index_or_series
day = Timedelta(timedelta(1)).as_unit(unit)
tdi = TimedeltaIndex([day], name="dt").as_unit(unit)
tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day
td = klass(tdvals, name="dt")
result = td.value_counts()
expected_s = Series([6], index=tdi, name="count")
tm.assert_series_equal(result, expected_s)
expected = tdi
result = td.unique()
if isinstance(td, Index):
tm.assert_index_equal(result, expected)
else:
tm.assert_extension_array_equal(result, expected._values)
td2 = day + np.zeros(6, dtype=f"m8[{unit}]")
td2 = klass(td2, name="dt")
result2 = td2.value_counts()
tm.assert_series_equal(result2, expected_s)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
# GH31944
klass = index_or_series
values = [True, pd.NA, np.nan]
obj = klass(values)
res = obj.value_counts(dropna=dropna)
if dropna is True:
expected = Series([1], index=Index([True], dtype=obj.dtype), name="count")
else:
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
tm.assert_series_equal(res, expected)
def test_value_counts_object_inference_deprecated():
# GH#56161
dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
idx = dti.astype(object)
msg = "The behavior of value_counts with object-dtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = idx.value_counts()
exp = dti.value_counts()
tm.assert_series_equal(res, exp)