Updated script that can be controlled by a Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,383 @@
from __future__ import annotations
import pytest
import pandas as pd
from pandas import api
import pandas._testing as tm
from pandas.api import (
extensions as api_extensions,
indexers as api_indexers,
interchange as api_interchange,
types as api_types,
typing as api_typing,
)
class Base:
    """Shared helper for tests that pin the set of names a namespace exposes."""

    def check(self, namespace, expected, ignored=None):
        """
        Assert that the names visible in ``namespace`` are exactly ``expected``.

        Names starting with a double underscore and the name ``annotations``
        are never considered. Names listed in ``ignored`` are removed before
        the comparison. Both sides are sorted, so order does not matter.
        """
        names = [
            name
            for name in dir(namespace)
            if not (name.startswith("__") or name == "annotations")
        ]
        if ignored is not None:
            names = set(names).difference(ignored)
        tm.assert_almost_equal(sorted(names), sorted(expected))
class TestPDApi(Base):
    """
    Pin the names exposed by the top-level ``pandas`` namespace.

    The class attributes below are lists of names grouped by category; the
    test methods concatenate them and compare against ``dir(pd)`` and
    ``pd.__all__``.
    """

    # these are optionally imported based on testing
    # & need to be ignored
    ignored = ["tests", "locale", "conftest", "_version_meson"]

    # top-level sub-packages
    public_lib = [
        "api",
        "arrays",
        "options",
        "test",
        "testing",
        "errors",
        "plotting",
        "io",
        "tseries",
    ]
    # names present in dir(pd) but left out of the ``__all__`` comparison
    private_lib = ["compat", "core", "pandas", "util", "_built_with_meson"]

    # misc
    misc = ["IndexSlice", "NaT", "NA"]

    # top-level classes
    classes = [
        "ArrowDtype",
        "Categorical",
        "CategoricalIndex",
        "DataFrame",
        "DateOffset",
        "DatetimeIndex",
        "ExcelFile",
        "ExcelWriter",
        "Flags",
        "Grouper",
        "HDFStore",
        "Index",
        "MultiIndex",
        "Period",
        "PeriodIndex",
        "RangeIndex",
        "Series",
        "SparseDtype",
        "StringDtype",
        "Timedelta",
        "TimedeltaIndex",
        "Timestamp",
        "Interval",
        "IntervalIndex",
        "CategoricalDtype",
        "PeriodDtype",
        "IntervalDtype",
        "DatetimeTZDtype",
        "BooleanDtype",
        "Int8Dtype",
        "Int16Dtype",
        "Int32Dtype",
        "Int64Dtype",
        "UInt8Dtype",
        "UInt16Dtype",
        "UInt32Dtype",
        "UInt64Dtype",
        "Float32Dtype",
        "Float64Dtype",
        "NamedAgg",
    ]

    # these are already deprecated; awaiting removal
    deprecated_classes: list[str] = []

    # external modules exposed in pandas namespace
    modules: list[str] = []

    # top-level functions
    funcs = [
        "array",
        "bdate_range",
        "concat",
        "crosstab",
        "cut",
        "date_range",
        "interval_range",
        "eval",
        "factorize",
        "get_dummies",
        "from_dummies",
        "infer_freq",
        "isna",
        "isnull",
        "lreshape",
        "melt",
        "notna",
        "notnull",
        "offsets",
        "merge",
        "merge_ordered",
        "merge_asof",
        "period_range",
        "pivot",
        "pivot_table",
        "qcut",
        "show_versions",
        "timedelta_range",
        "unique",
        "value_counts",
        "wide_to_long",
    ]

    # top-level option funcs
    funcs_option = [
        "reset_option",
        "describe_option",
        "get_option",
        "option_context",
        "set_option",
        "set_eng_float_format",
    ]

    # top-level read_* funcs
    funcs_read = [
        "read_clipboard",
        "read_csv",
        "read_excel",
        "read_fwf",
        "read_gbq",
        "read_hdf",
        "read_html",
        "read_xml",
        "read_json",
        "read_pickle",
        "read_sas",
        "read_sql",
        "read_sql_query",
        "read_sql_table",
        "read_stata",
        "read_table",
        "read_feather",
        "read_parquet",
        "read_orc",
        "read_spss",
    ]

    # top-level json funcs
    funcs_json = ["json_normalize"]

    # top-level to_* funcs
    funcs_to = ["to_datetime", "to_numeric", "to_pickle", "to_timedelta"]

    # top-level to deprecate in the future
    deprecated_funcs_in_future: list[str] = []

    # these are already deprecated; awaiting removal
    deprecated_funcs: list[str] = []

    # private modules in pandas namespace
    private_modules = [
        "_config",
        "_libs",
        "_is_numpy_dev",
        "_pandas_datetime_CAPI",
        "_pandas_parser_CAPI",
        "_testing",
        "_typing",
    ]
    # "_version" is only expected when pandas was not built with meson;
    # this runs once, at class-definition time.
    if not pd._built_with_meson:
        private_modules.append("_version")

    def test_api(self):
        """``dir(pd)``, minus ``ignored``, equals the union of all name lists."""
        checkthese = (
            self.public_lib
            + self.private_lib
            + self.misc
            + self.modules
            + self.classes
            + self.funcs
            + self.funcs_option
            + self.funcs_read
            + self.funcs_json
            + self.funcs_to
            + self.private_modules
        )
        self.check(namespace=pd, expected=checkthese, ignored=self.ignored)

    def test_api_all(self):
        """``pd.__all__`` has no extra and no missing public names."""
        # private_lib and private_modules are deliberately left out here
        expected = set(
            self.public_lib
            + self.misc
            + self.modules
            + self.classes
            + self.funcs
            + self.funcs_option
            + self.funcs_read
            + self.funcs_json
            + self.funcs_to
        ) - set(self.deprecated_classes)
        actual = set(pd.__all__)

        extraneous = actual - expected
        assert not extraneous

        missing = expected - actual
        assert not missing

    def test_depr(self):
        """Accessing any listed deprecated name on ``pd`` emits a FutureWarning."""
        deprecated_list = (
            self.deprecated_classes
            + self.deprecated_funcs
            + self.deprecated_funcs_in_future
        )
        # all three lists are currently empty, so this loop body does not run
        for depr in deprecated_list:
            with tm.assert_produces_warning(FutureWarning):
                _ = getattr(pd, depr)
class TestApi(Base):
    """
    Pin the names exposed by ``pandas.api`` and its sub-namespaces.

    Each ``allowed_*`` list is the exact set of names the matching test
    method expects to find via ``Base.check``.
    """

    # sub-namespaces expected directly under pandas.api
    allowed_api_dirs = [
        "types",
        "extensions",
        "indexers",
        "interchange",
        "typing",
    ]
    # names expected in pandas.api.typing
    allowed_typing = [
        "DataFrameGroupBy",
        "DatetimeIndexResamplerGroupby",
        "Expanding",
        "ExpandingGroupby",
        "ExponentialMovingWindow",
        "ExponentialMovingWindowGroupby",
        "JsonReader",
        "NaTType",
        "NAType",
        "PeriodIndexResamplerGroupby",
        "Resampler",
        "Rolling",
        "RollingGroupby",
        "SeriesGroupBy",
        "StataReader",
        "TimedeltaIndexResamplerGroupby",
        "TimeGrouper",
        "Window",
    ]
    # names expected in pandas.api.types
    allowed_api_types = [
        "is_any_real_numeric_dtype",
        "is_array_like",
        "is_bool",
        "is_bool_dtype",
        "is_categorical_dtype",
        "is_complex",
        "is_complex_dtype",
        "is_datetime64_any_dtype",
        "is_datetime64_dtype",
        "is_datetime64_ns_dtype",
        "is_datetime64tz_dtype",
        "is_dict_like",
        "is_dtype_equal",
        "is_extension_array_dtype",
        "is_file_like",
        "is_float",
        "is_float_dtype",
        "is_hashable",
        "is_int64_dtype",
        "is_integer",
        "is_integer_dtype",
        "is_interval",
        "is_interval_dtype",
        "is_iterator",
        "is_list_like",
        "is_named_tuple",
        "is_number",
        "is_numeric_dtype",
        "is_object_dtype",
        "is_period_dtype",
        "is_re",
        "is_re_compilable",
        "is_scalar",
        "is_signed_integer_dtype",
        "is_sparse",
        "is_string_dtype",
        "is_timedelta64_dtype",
        "is_timedelta64_ns_dtype",
        "is_unsigned_integer_dtype",
        "pandas_dtype",
        "infer_dtype",
        "union_categoricals",
        "CategoricalDtype",
        "DatetimeTZDtype",
        "IntervalDtype",
        "PeriodDtype",
    ]
    # names expected in pandas.api.interchange
    allowed_api_interchange = ["from_dataframe", "DataFrame"]
    # names expected in pandas.api.indexers
    allowed_api_indexers = [
        "check_array_indexer",
        "BaseIndexer",
        "FixedForwardWindowIndexer",
        "VariableOffsetWindowIndexer",
    ]
    # names expected in pandas.api.extensions
    allowed_api_extensions = [
        "no_default",
        "ExtensionDtype",
        "register_extension_dtype",
        "register_dataframe_accessor",
        "register_index_accessor",
        "register_series_accessor",
        "take",
        "ExtensionArray",
        "ExtensionScalarOpsMixin",
    ]

    def test_api(self):
        """``pandas.api`` exposes exactly ``allowed_api_dirs``."""
        self.check(api, self.allowed_api_dirs)

    def test_api_typing(self):
        """``pandas.api.typing`` exposes exactly ``allowed_typing``."""
        self.check(api_typing, self.allowed_typing)

    def test_api_types(self):
        """``pandas.api.types`` exposes exactly ``allowed_api_types``."""
        self.check(api_types, self.allowed_api_types)

    def test_api_interchange(self):
        """``pandas.api.interchange`` exposes exactly ``allowed_api_interchange``."""
        self.check(api_interchange, self.allowed_api_interchange)

    def test_api_indexers(self):
        """``pandas.api.indexers`` exposes exactly ``allowed_api_indexers``."""
        self.check(api_indexers, self.allowed_api_indexers)

    def test_api_extensions(self):
        """``pandas.api.extensions`` exposes exactly ``allowed_api_extensions``."""
        self.check(api_extensions, self.allowed_api_extensions)
class TestTesting(Base):
    """Pin the names exposed by ``pandas.testing`` and probe ``pd.util``."""

    # the assertion helpers expected in pandas.testing
    funcs = [
        "assert_frame_equal",
        "assert_series_equal",
        "assert_index_equal",
        "assert_extension_array_equal",
    ]

    def test_testing(self):
        """``pandas.testing`` exposes exactly the names in ``funcs``."""
        from pandas import testing

        self.check(namespace=testing, expected=self.funcs)

    def test_util_in_top_level(self):
        """An unknown attribute on ``pd.util`` raises AttributeError naming it."""
        with pytest.raises(AttributeError, match="foo"):
            getattr(pd.util, "foo")
def test_pandas_array_alias():
msg = "PandasArray has been renamed NumpyExtensionArray"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = pd.arrays.PandasArray
assert res is pd.arrays.NumpyExtensionArray

View File

@ -0,0 +1,62 @@
from __future__ import annotations
import pandas._testing as tm
from pandas.api import types
from pandas.tests.api.test_api import Base
class TestTypes(Base):
    """Pin the names exposed by ``pandas.api.types``."""

    # introspection helpers expected in pandas.api.types
    allowed = [
        "is_any_real_numeric_dtype",
        "is_bool",
        "is_bool_dtype",
        "is_categorical_dtype",
        "is_complex",
        "is_complex_dtype",
        "is_datetime64_any_dtype",
        "is_datetime64_dtype",
        "is_datetime64_ns_dtype",
        "is_datetime64tz_dtype",
        "is_dtype_equal",
        "is_float",
        "is_float_dtype",
        "is_int64_dtype",
        "is_integer",
        "is_integer_dtype",
        "is_number",
        "is_numeric_dtype",
        "is_object_dtype",
        "is_scalar",
        "is_sparse",
        "is_string_dtype",
        "is_signed_integer_dtype",
        "is_timedelta64_dtype",
        "is_timedelta64_ns_dtype",
        "is_unsigned_integer_dtype",
        "is_period_dtype",
        "is_interval",
        "is_interval_dtype",
        "is_re",
        "is_re_compilable",
        "is_dict_like",
        "is_iterator",
        "is_file_like",
        "is_list_like",
        "is_hashable",
        "is_array_like",
        "is_named_tuple",
        "pandas_dtype",
        "union_categoricals",
        "infer_dtype",
        "is_extension_array_dtype",
    ]
    # names expected to warn when called; currently none
    deprecated: list[str] = []
    # dtype classes also expected in the namespace
    dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"]

    def test_types(self):
        """The namespace holds exactly ``allowed + dtypes + deprecated``."""
        self.check(types, self.allowed + self.dtypes + self.deprecated)

    def test_deprecated_from_api_types(self):
        """Calling each deprecated helper with ``1`` emits a FutureWarning."""
        # ``deprecated`` is empty, so this loop body does not currently run
        for t in self.deprecated:
            with tm.assert_produces_warning(FutureWarning):
                getattr(types, t)(1)

View File

@ -0,0 +1,7 @@
from pandas.core.groupby.base import transformation_kernels
# Series and DataFrame have no ``cumcount`` method, so that kernel is dropped
# from both lists; the remaining kernel names are kept in sorted order.
series_transform_kernels = sorted(
    kernel for kernel in transformation_kernels if kernel != "cumcount"
)
frame_transform_kernels = sorted(
    kernel for kernel in transformation_kernels if kernel != "cumcount"
)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,113 @@
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p25
import pandas as pd
import pandas._testing as tm
def test_agg_relabel():
    """Keyword aggregation on a DataFrame labels the result rows (GH 26513)."""
    frame = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})

    # one column, one function -> a single labelled row
    single = frame.agg(foo=("B", "sum"))
    tm.assert_frame_equal(single, pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])))

    # same column, two different functions -> one row per label
    double = frame.agg(foo=("B", "sum"), bar=("B", "min"))
    tm.assert_frame_equal(
        double, pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))
    )
def test_agg_relabel_multi_columns_multi_methods():
    """Keyword aggregation across several columns and functions (GH 26513)."""
    frame = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
    nan = np.nan

    # each label fills only the column it aggregated; other cells are NaN
    labels = pd.Index(["foo", "bar", "cat", "dat", "f", "g"])
    wanted = pd.DataFrame(
        {
            "A": [6.0, nan, 1.0, nan, 2.0, nan],
            "B": [nan, 2.5, nan, 4.0, nan, nan],
            "C": [nan, nan, nan, nan, nan, 3.0],
        },
        index=labels,
    )

    observed = frame.agg(
        foo=("A", "sum"),
        bar=("B", "mean"),
        cat=("A", "min"),
        dat=("B", "max"),
        f=("A", "max"),
        g=("C", "min"),
    )
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.xfail(np_version_gte1p25, reason="name of min now equals name of np.min")
def test_agg_relabel_partial_functions():
    """
    Keyword aggregation with callables instead of strings (GH 26513).

    Passing ``np.mean``, ``min`` and similar callables is expected to emit a
    FutureWarning whose text contains "using Series.<name>".
    """
    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})

    # Alternation group rather than a character class: "[mean|min]" would
    # match any single one of those letters, not the method names.
    msg = r"using Series\.(?:mean|min)"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
    expected = pd.DataFrame(
        {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
    )
    tm.assert_frame_equal(result, expected)

    msg = r"using Series\.(?:mean|min|max|sum)"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.agg(
            foo=("A", min),
            bar=("A", np.min),
            cat=("B", max),
            dat=("C", "min"),
            f=("B", np.sum),
            kk=("B", lambda x: min(x)),
        )
    expected = pd.DataFrame(
        {
            "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan],
            "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0],
            "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan],
        },
        index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
    )
    tm.assert_frame_equal(result, expected)
def test_agg_namedtuple():
    """``pd.NamedAgg`` works as the value in keyword aggregation (GH 26513)."""
    frame = pd.DataFrame({"A": [0, 1], "B": [1, 2]})

    # every label aggregates column "B"; positional and keyword forms mixed
    one_column = frame.agg(
        foo=pd.NamedAgg("B", "sum"),
        bar=pd.NamedAgg("B", "min"),
        cat=pd.NamedAgg(column="B", aggfunc="count"),
        fft=pd.NamedAgg("B", aggfunc="max"),
    )
    tm.assert_frame_equal(
        one_column,
        pd.DataFrame({"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])),
    )

    # labels spread over two columns leave NaN where a column was not used
    two_columns = frame.agg(
        foo=pd.NamedAgg("A", "min"),
        bar=pd.NamedAgg(column="B", aggfunc="max"),
        cat=pd.NamedAgg(column="A", aggfunc="max"),
    )
    wanted = pd.DataFrame(
        {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
        index=pd.Index(["foo", "bar", "cat"]),
    )
    tm.assert_frame_equal(two_columns, wanted)
def test_reconstruct_func():
    """
    ``pd.core.apply.reconstruct_func`` stays importable from this path.

    GH 28472: other libraries (e.g. dask) use this function, so it must not
    be moved.
    """
    observed = pd.core.apply.reconstruct_func("min")
    tm.assert_equal(observed, (False, "min", None, None))

View File

@ -0,0 +1,264 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
MultiIndex,
Series,
)
import pandas._testing as tm
from pandas.tests.apply.common import frame_transform_kernels
from pandas.tests.frame.common import zip_frames
def unpack_obj(obj, klass, axis):
    """
    Return the object a ``frame_or_series``-parametrized test should use.

    ``obj`` is returned unchanged when ``klass`` is ``DataFrame``. Otherwise
    its ``"A"`` column is returned, and the test is skipped unless ``axis``
    is 0.
    """
    if klass is DataFrame:
        return obj

    column = obj["A"]
    if axis != 0:
        pytest.skip(f"Test is only for DataFrame with axis={axis}")
    return column
def test_transform_ufunc(axis, float_frame, frame_or_series):
    """``transform`` with a ufunc matches calling the ufunc directly (GH 35964)."""
    target = unpack_obj(float_frame, frame_or_series, axis)

    # reference value, computed with numpy floating-point warnings silenced
    with np.errstate(all="ignore"):
        wanted = np.sqrt(target)

    observed = target.transform(np.sqrt, axis=axis)
    tm.assert_equal(observed, wanted)
@pytest.mark.parametrize(
    "ops, names",
    [
        ([np.sqrt], ["sqrt"]),
        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
        (np.array([np.sqrt]), ["sqrt"]),
        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
    ],
)
def test_transform_listlike(axis, float_frame, ops, names):
    """A list-like of ufuncs adds a function-name level to the labels (GH 35964)."""
    along_index = axis in {0, "index"}
    other_axis = 1 if along_index else 0

    # apply each op to the whole frame and zip the results together
    with np.errstate(all="ignore"):
        per_op_frames = [op(float_frame) for op in ops]
        wanted = zip_frames(per_op_frames, axis=other_axis)

    if along_index:
        wanted.columns = MultiIndex.from_product([float_frame.columns, names])
    else:
        wanted.index = MultiIndex.from_product([float_frame.index, names])

    observed = float_frame.transform(ops, axis=axis)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize("ops", [[], np.array([])])
def test_transform_empty_listlike(float_frame, ops, frame_or_series):
    """An empty list-like of functions is rejected with a ValueError."""
    target = unpack_obj(float_frame, frame_or_series, 0)

    expected_msg = "No transform functions were provided"
    with pytest.raises(ValueError, match=expected_msg):
        target.transform(ops)
def test_transform_listlike_func_with_args():
    """Extra args and kwargs reach every function in a list-like (GH 50624)."""
    frame = DataFrame({"x": [1, 2, 3]})

    # the function names become the second level of the result columns
    def foo1(x, a=1, c=0):
        return x + a + c

    def foo2(x, b=2, c=0):
        return x + b + c

    # ``b`` is not a parameter of foo1, so passing it must fail
    bad_kwarg = r"foo1\(\) got an unexpected keyword argument 'b'"
    with pytest.raises(TypeError, match=bad_kwarg):
        frame.transform([foo1, foo2], 0, 3, b=3, c=4)

    observed = frame.transform([foo1, foo2], 0, 3, c=4)
    wanted_columns = MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")])
    wanted = DataFrame([[8, 8], [9, 9], [10, 10]], columns=wanted_columns)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize("box", [dict, Series])
def test_transform_dictlike(axis, float_frame, box):
    """A dict-like spec transforms only the label it names (GH 35964)."""
    if axis not in (0, "index"):
        # row-wise: pick the first row label
        label = float_frame.index[0]
        wanted = float_frame.iloc[[0]].transform(np.abs)
    else:
        # column-wise: pick the first column label
        label = float_frame.columns[0]
        wanted = float_frame[[label]].transform(np.abs)

    spec = box({label: np.abs})
    observed = float_frame.transform(spec, axis=axis)
    tm.assert_frame_equal(observed, wanted)
def test_transform_dictlike_mixed():
    """Dict values may mix lists and single functions (GH 40018)."""
    frame = DataFrame({"a": [1, 2], "b": [1, 4], "c": [1, 4]})
    observed = frame.transform({"b": ["sqrt", "abs"], "c": "sqrt"})

    # columns: (b, sqrt), (b, abs), (c, sqrt)
    wanted_columns = MultiIndex(
        [("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]
    )
    wanted = DataFrame([[1.0, 1, 1.0], [2.0, 4, 2.0]], columns=wanted_columns)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize(
    "ops",
    [
        {},
        {"A": []},
        {"A": [], "B": "cumsum"},
        {"A": "cumsum", "B": []},
        {"A": [], "B": ["cumsum"]},
        {"A": ["cumsum"], "B": []},
    ],
)
def test_transform_empty_dictlike(float_frame, ops, frame_or_series):
    """A dict spec that is empty or holds an empty list raises ValueError."""
    target = unpack_obj(float_frame, frame_or_series, 0)

    expected_msg = "No transform functions were provided"
    with pytest.raises(ValueError, match=expected_msg):
        target.transform(ops)
@pytest.mark.parametrize("use_apply", [True, False])
def test_transform_udf(axis, float_frame, use_apply, frame_or_series):
    """``transform`` works with a UDF on either of its call paths (GH 35964)."""
    target = unpack_obj(float_frame, frame_or_series, axis)

    # transform either hands the UDF the entire object or goes through apply
    def func(x):
        received_whole = isinstance(x, frame_or_series)
        if received_whole == use_apply:
            # reject this path so transform has to fall back to the other
            raise ValueError
        return x + 1

    observed = target.transform(func, axis=axis)
    tm.assert_equal(observed, target + 1)
# kernels excluded from the "raises" parametrizations below
wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"]
# every remaining frame kernel, in the order of frame_transform_kernels
frame_kernels_raise = [
    kernel for kernel in frame_transform_kernels if kernel not in wont_fail
]
@pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1])
def test_transform_bad_dtype(op, frame_or_series, request):
    """Transforming data that holds ``object`` classes raises TypeError (GH 35964)."""
    if op == "ngroup":
        request.applymarker(
            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
        )

    # a frame of ``object`` classes, which fails on most transforms
    bad = tm.get_obj(DataFrame({"A": 3 * [object]}), frame_or_series)

    expected_msg = "|".join(
        [
            "not supported between instances of 'type' and 'type'",
            "unsupported operand type",
        ]
    )
    # the same op as a bare callable, a list, a dict, and a dict of lists
    for spec in (op, [op], {"A": op}, {"A": [op]}):
        with pytest.raises(TypeError, match=expected_msg):
            bad.transform(spec)
@pytest.mark.parametrize("op", frame_kernels_raise)
def test_transform_failure_typeerror(request, op):
    """A TypeError from one column propagates for list and dict specs (GH 35964)."""
    if op == "ngroup":
        request.applymarker(
            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
        )

    # column "A" holds ``object`` classes, which makes most kernels fail
    frame = DataFrame({"A": 3 * [object], "B": [1, 2, 3]})

    expected_msg = "|".join(
        [
            "not supported between instances of 'type' and 'type'",
            "unsupported operand type",
        ]
    )
    specs = (
        [op],
        {"A": op, "B": op},
        {"A": [op], "B": [op]},
        {"A": [op, "shift"], "B": [op]},
    )
    for spec in specs:
        with pytest.raises(TypeError, match=expected_msg):
            frame.transform(spec)
def test_transform_failure_valueerror():
    """A ValueError inside a UDF is reported as "Transform function failed" (GH 40211)."""

    def op(x):
        # raises when the total is below 10, otherwise returns x unchanged
        if np.sum(np.sum(x)) < 10:
            raise ValueError
        return x

    frame = DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]})
    expected_msg = "Transform function failed"

    specs = (
        [op],
        {"A": op, "B": op},
        {"A": [op], "B": [op]},
        {"A": [op, "shift"], "B": [op]},
    )
    for spec in specs:
        with pytest.raises(ValueError, match=expected_msg):
            frame.transform(spec)
@pytest.mark.parametrize("use_apply", [True, False])
def test_transform_passes_args(use_apply, frame_or_series):
    """Positional and keyword arguments are forwarded to the UDF (GH 35964)."""
    # transform either hands the UDF the entire object or goes through apply
    expected_args = [1, 2]
    expected_kwargs = {"c": 3}

    def f(x, a, b, c):
        received_whole = isinstance(x, frame_or_series)
        if received_whole == use_apply:
            # reject this path so transform has to fall back to the other
            raise ValueError
        assert [a, b] == expected_args
        assert c == expected_kwargs["c"]
        return x

    target = frame_or_series([1])
    target.transform(f, 0, *expected_args, **expected_kwargs)
def test_transform_empty_dataframe():
    """Transforming an empty frame or column returns an equal empty object."""
    # https://github.com/pandas-dev/pandas/issues/39636
    empty = DataFrame([], columns=["col1", "col2"])

    frame_result = empty.transform(lambda x: x + 10)
    tm.assert_frame_equal(frame_result, empty)

    column_result = empty["col1"].transform(lambda x: x + 10)
    tm.assert_series_equal(column_result, empty["col1"])

View File

@ -0,0 +1,361 @@
# Tests specifically aimed at detecting bad arguments.
# This file is organized by reason for exception.
# 1. always invalid argument values
# 2. missing column(s)
# 3. incompatible ops/dtype/args/kwargs
# 4. invalid result shape/type
# If your test does not fit into one of these categories, add to this list.
from itertools import chain
import re
import numpy as np
import pytest
from pandas.errors import SpecificationError
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
@pytest.mark.parametrize("result_type", ["foo", 1])
def test_result_type_error(result_type):
    """An invalid ``result_type`` makes ``DataFrame.apply`` raise ValueError."""
    values = np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1
    frame = DataFrame(values, columns=["A", "B", "C"])

    expected_msg = (
        "invalid value for result_type, must be one of "
        "{None, 'reduce', 'broadcast', 'expand'}"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type)
def test_apply_invalid_axis_value():
    """``DataFrame.apply`` with axis 2 raises a ValueError naming the axis."""
    frame = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])

    expected_msg = "No axis named 2 for object type DataFrame"
    with pytest.raises(ValueError, match=expected_msg):
        frame.apply(lambda x: x, 2)
def test_agg_raises():
    """``DataFrame.agg`` with no arguments raises TypeError (GH 26513)."""
    frame = DataFrame({"A": [0, 1], "B": [1, 2]})

    expected_msg = "Must provide"
    with pytest.raises(TypeError, match=expected_msg):
        frame.agg()
def test_map_with_invalid_na_action_raises():
    """``Series.map`` rejects an unknown ``na_action`` when given a callable."""
    # https://github.com/pandas-dev/pandas/issues/32815
    ser = Series([1, 2, 3])

    expected_msg = "na_action must either be 'ignore' or None"
    with pytest.raises(ValueError, match=expected_msg):
        ser.map(lambda x: x, na_action="____")
@pytest.mark.parametrize("input_na_action", ["____", True])
def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action):
    """``Series.map`` rejects an unknown ``na_action`` when given a dict."""
    # https://github.com/pandas-dev/pandas/issues/46588
    ser = Series([1, 2, 3])

    # the error message echoes the rejected value
    expected_msg = (
        f"na_action must either be 'ignore' or None, {input_na_action} was passed"
    )
    with pytest.raises(ValueError, match=expected_msg):
        ser.map({1: 2}, na_action=input_na_action)
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
def test_nested_renamer(frame_or_series, method, func):
    """A dict-of-dict spec raises SpecificationError (GH 35964)."""
    target = frame_or_series({"A": [1]})

    expected_msg = "nested renamer is not supported"
    with pytest.raises(SpecificationError, match=expected_msg):
        getattr(target, method)(func)
@pytest.mark.parametrize(
    "renamer",
    [{"foo": ["min", "max"]}, {"foo": ["min", "max"], "bar": ["sum", "mean"]}],
)
def test_series_nested_renamer(renamer):
    """``Series.agg`` with a dict of lists raises SpecificationError."""
    ser = Series(range(6), dtype="int64", name="series")

    expected_msg = "nested renamer is not supported"
    with pytest.raises(SpecificationError, match=expected_msg):
        ser.agg(renamer)
def test_apply_dict_depr():
    """Aggregating a column with a dict of lists raises SpecificationError."""
    values = np.random.default_rng(2).standard_normal((10, 3))
    frame = DataFrame(
        values,
        columns=["A", "B", "C"],
        index=date_range("1/1/2000", periods=10),
    )

    expected_msg = "nested renamer is not supported"
    with pytest.raises(SpecificationError, match=expected_msg):
        frame.A.agg({"foo": ["sum", "mean"]})
@pytest.mark.parametrize("method", ["agg", "transform"])
def test_dict_nested_renaming_depr(method):
    """A nested renaming dict on a DataFrame raises SpecificationError."""
    frame = DataFrame({"A": range(5), "B": 5})
    nested_spec = {"A": {"foo": "min"}, "B": {"bar": "max"}}

    expected_msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=expected_msg):
        getattr(frame, method)(nested_spec)
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
def test_missing_column(method, func):
    """A dict spec naming a missing column raises KeyError (GH 40004)."""
    frame = DataFrame({"A": [1]})

    # escaped because the message contains regex metacharacters
    expected_msg = re.escape("Column(s) ['B'] do not exist")
    with pytest.raises(KeyError, match=expected_msg):
        getattr(frame, method)(func)
def test_transform_mixed_column_name_dtypes():
    """Missing keys of mixed types are all listed in the KeyError (GH39025)."""
    frame = DataFrame({"a": ["1"]})
    spec = {"a": int, 1: str, "b": int}

    expected_msg = r"Column\(s\) \[1, 'b'\] do not exist"
    with pytest.raises(KeyError, match=expected_msg):
        frame.transform(spec)
@pytest.mark.parametrize(
    # ``args`` is always a tuple, matching the other cases here
    "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", (1,))]
)
def test_apply_str_axis_1_raises(how, args):
    """
    String ops that do not support ``axis=1`` raise ValueError (GH 39211).

    Parameters
    ----------
    how : str
        Name of the DataFrame method passed to ``apply``.
    args : tuple
        Positional arguments forwarded through ``apply``.
    """
    df = DataFrame({"a": [1, 2], "b": [3, 4]})
    msg = f"Operation {how} does not support axis=1"
    with pytest.raises(ValueError, match=msg):
        df.apply(how, axis=1, args=args)
def test_transform_axis_1_raises():
    """``Series.transform`` with ``axis=1`` raises ValueError (GH 35964)."""
    ser = Series([1])

    expected_msg = "No axis named 1 for object type Series"
    with pytest.raises(ValueError, match=expected_msg):
        ser.transform("sum", axis=1)
def test_apply_modify_traceback():
    """An AttributeError raised inside a row-wise UDF propagates unchanged."""
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = "one one one two one one one two two two one".split()
    col_c = (
        "dull dull shiny dull dull shiny shiny dull shiny shiny shiny".split()
    )
    data = DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    # a NaN in the string column is what trips ``startswith`` below
    data.loc[4, "C"] = np.nan

    def transform(row):
        if row["C"].startswith("shin") and row["A"] == "foo":
            row["D"] = 7
        return row

    expected_msg = "'float' object has no attribute 'startswith'"
    with pytest.raises(AttributeError, match=expected_msg):
        data.apply(transform, axis=1)
@pytest.mark.parametrize(
    "df, func, expected",
    tm.get_cython_table_params(
        DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]
    ),
)
def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
    """``cumprod`` on a frame of strings raises through ``agg`` (GH 21224)."""
    if using_infer_string:
        # with inferred strings, the pyarrow error type is accepted as well
        import pyarrow as pa

        expected = (expected, pa.lib.ArrowNotImplementedError)

    expected_msg = "can't multiply sequence by non-int of type 'str'|has no kernel"
    # only non-string funcs are expected to warn
    warning_type = FutureWarning if not isinstance(func, str) else None

    with pytest.raises(expected, match=expected_msg):
        with tm.assert_produces_warning(
            warning_type, match="using DataFrame.cumprod"
        ):
            df.agg(func, axis=axis)
@pytest.mark.parametrize(
    "series, func, expected",
    chain(
        tm.get_cython_table_params(
            Series("a b c".split()),
            [
                ("mean", TypeError),  # mean raises TypeError
                ("prod", TypeError),
                ("std", TypeError),
                ("var", TypeError),
                ("median", TypeError),
                ("cumprod", TypeError),
            ],
        )
    ),
)
def test_agg_cython_table_raises_series(series, func, expected, using_infer_string):
    """Numeric reductions on a string Series raise through ``agg`` (GH21224)."""
    is_median = func is np.nanmedian or func is np.median or func == "median"
    if is_median:
        # the median variants report a different message
        expected_msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
    else:
        expected_msg = (
            r"[Cc]ould not convert|can't multiply sequence by non-int of type"
        )

    if using_infer_string:
        # with inferred strings, the pyarrow error type is accepted as well
        import pyarrow as pa

        expected = (expected, pa.lib.ArrowNotImplementedError)

    expected_msg += "|does not support|has no kernel"
    # only non-string funcs are expected to warn
    warning_type = FutureWarning if not isinstance(func, str) else None

    with pytest.raises(expected, match=expected_msg):
        # e.g. Series('a b'.split()).cumprod() will raise
        with tm.assert_produces_warning(
            warning_type, match="is currently using Series.*"
        ):
            series.agg(func)
def test_agg_none_to_type():
    """A TypeError from ``int(None)`` inside an agg UDF propagates (GH 40543)."""
    frame = DataFrame({"a": [None]})

    expected_msg = re.escape("int() argument must be a string")
    with pytest.raises(TypeError, match=expected_msg):
        frame.agg({"a": lambda x: int(x.iloc[0])})
def test_transform_none_to_type():
    """A TypeError from ``int(None)`` inside a transform UDF propagates (GH#34377)."""
    frame = DataFrame({"a": [None]})

    expected_msg = "argument must be a"
    with pytest.raises(TypeError, match=expected_msg):
        frame.transform({"a": lambda x: int(x.iloc[0])})
@pytest.mark.parametrize(
    "func",
    [
        lambda x: np.array([1, 2]).reshape(-1, 2),
        lambda x: [1, 2],
        lambda x: Series([1, 2]),
    ],
)
def test_apply_broadcast_error(func):
    """``result_type="broadcast"`` rejects these UDF results with ValueError."""
    values = np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1
    frame = DataFrame(values, columns=["A", "B", "C"])

    # either message is accepted
    expected_msg = "too many dims to broadcast|cannot broadcast result"
    with pytest.raises(ValueError, match=expected_msg):
        frame.apply(func, axis=1, result_type="broadcast")
def test_transform_and_agg_err_agg(axis, float_frame):
    """``DataFrame.agg`` cannot mix a reducer with a transformer."""
    mixed_funcs = ["max", "sqrt"]

    expected_msg = "cannot combine transform and aggregation operations"
    with pytest.raises(ValueError, match=expected_msg):
        with np.errstate(all="ignore"):
            float_frame.agg(mixed_funcs, axis=axis)
@pytest.mark.filterwarnings("ignore::FutureWarning")  # GH53325
@pytest.mark.parametrize(
    "func, msg",
    [
        (["sqrt", "max"], "cannot combine transform and aggregation"),
        (
            {"foo": np.sqrt, "bar": "sum"},
            "cannot perform both aggregation and transformation",
        ),
    ],
)
def test_transform_and_agg_err_series(string_series, func, msg):
    """``Series.agg`` cannot mix a transformer with an aggregator."""
    raises_ctx = pytest.raises(ValueError, match=msg)
    with raises_ctx:
        with np.errstate(all="ignore"):
            string_series.agg(func)
@pytest.mark.parametrize("func", [["max", "min"], ["max", "sqrt"]])
def test_transform_wont_agg_frame(axis, float_frame, func):
    """``DataFrame.transform`` rejects lists containing a reducer (GH 35964)."""
    expected_msg = "Function did not transform"
    with pytest.raises(ValueError, match=expected_msg):
        float_frame.transform(func, axis=axis)
@pytest.mark.parametrize("func", [["min", "max"], ["sqrt", "max"]])
def test_transform_wont_agg_series(string_series, func):
    """``Series.transform`` rejects lists containing an aggregator (GH 35964)."""
    expected_msg = "Function did not transform"
    with pytest.raises(ValueError, match=expected_msg):
        string_series.transform(func)
@pytest.mark.parametrize(
    "op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}]
)
def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper):
    """``transform`` rejects a reduction in every spec form (GH 35964)."""
    # wrap the reduction as bare / list / dict / dict-of-list
    spec = op_wrapper(all_reductions)
    target = tm.get_obj(DataFrame({"A": [1, 2, 3]}), frame_or_series)

    expected_msg = "Function did not transform"
    with pytest.raises(ValueError, match=expected_msg):
        target.transform(spec)

View File

@ -0,0 +1,118 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
# Module-wide pytest marks applied to every test in this file.
# NOTE(review): presumably skips the module when numba is not installed and
# marks the tests as single-CPU — confirm against td.skip_if_no.
pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu]
@pytest.fixture(params=[0, 1])
def apply_axis(request):
    """Fixture yielding each axis value (0, then 1) to pass to ``apply``."""
    axis_value = request.param
    return axis_value
def test_numba_vs_python_noop(float_frame, apply_axis):
    """An identity UDF gives the same frame under both engines."""
    identity = lambda x: x
    via_python = float_frame.apply(identity, engine="python", axis=apply_axis)
    via_numba = float_frame.apply(identity, engine="numba", axis=apply_axis)
    tm.assert_frame_equal(via_numba, via_python)
def test_numba_vs_python_string_index():
    """Both engines agree on a frame with pyarrow-backed string labels (GH#56189)."""
    pytest.importorskip("pyarrow")

    string_dtype = "string[pyarrow_numpy]"
    frame = DataFrame(
        1,
        index=Index(["a", "b"], dtype=string_dtype),
        columns=Index(["x", "y"], dtype=string_dtype),
    )
    identity = lambda x: x

    via_numba = frame.apply(identity, engine="numba", axis=0)
    via_python = frame.apply(identity, engine="python", axis=0)
    # index and column types are deliberately left out of the comparison
    tm.assert_frame_equal(
        via_numba, via_python, check_column_type=False, check_index_type=False
    )
def test_numba_vs_python_indexing():
    """Label lookups inside the UDF agree between engines on both axes."""
    frame = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
        index=Index(["A", "B", "C"]),
    )

    # row-wise: each row is indexed by a column label
    row_func = lambda x: x["c"]
    via_numba = frame.apply(row_func, engine="numba", axis=1)
    via_python = frame.apply(row_func, engine="python", axis=1)
    tm.assert_series_equal(via_numba, via_python)

    # column-wise: each column is indexed by a row label
    col_func = lambda x: x["A"]
    via_numba = frame.apply(col_func, engine="numba", axis=0)
    via_python = frame.apply(col_func, engine="python", axis=0)
    tm.assert_series_equal(via_numba, via_python)
@pytest.mark.parametrize(
    "reduction",
    [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()],
)
def test_numba_vs_python_reductions(reduction, apply_axis):
    """Each reduction gives the same Series under both engines."""
    frame = DataFrame(np.ones((4, 4), dtype=np.float64))
    via_numba = frame.apply(reduction, engine="numba", axis=apply_axis)
    via_python = frame.apply(reduction, engine="python", axis=apply_axis)
    tm.assert_series_equal(via_numba, via_python)
@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]])
def test_numba_numeric_colnames(colnames):
    """Numeric column names can be indexed inside the UDF under both engines."""
    values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64)
    frame = DataFrame(values, columns=colnames)

    first_col = colnames[0]
    f = lambda x: x[first_col]  # look up the first column by its numeric name

    via_numba = frame.apply(f, engine="numba", axis=1)
    via_python = frame.apply(f, engine="python", axis=1)
    tm.assert_series_equal(via_numba, via_python)
def test_numba_parallel_unsupported(float_frame):
    """Requesting ``parallel=True`` with the numba engine raises NotImplementedError."""
    identity = lambda x: x
    expected_msg = "Parallel apply is not supported when raw=False and engine='numba'"
    with pytest.raises(NotImplementedError, match=expected_msg):
        float_frame.apply(identity, engine="numba", engine_kwargs={"parallel": True})
def test_numba_nonunique_unsupported(apply_axis):
    """A duplicated index label makes the numba engine raise NotImplementedError."""
    identity = lambda x: x
    frame = DataFrame({"a": [1, 2]}, index=Index(["a", "a"]))

    expected_msg = "The index/columns must be unique when raw=False and engine='numba'"
    with pytest.raises(NotImplementedError, match=expected_msg):
        frame.apply(identity, engine="numba", axis=apply_axis)
def test_numba_unsupported_dtypes(apply_axis):
    """String and extension-array columns are rejected by the numba engine."""
    identity = lambda x: x
    frame = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]})
    frame["c"] = frame["c"].astype("double[pyarrow]")

    # string column "b" is reported as non-numeric
    non_numeric_msg = "Column b must have a numeric dtype. Found 'object|string' instead"
    with pytest.raises(ValueError, match=non_numeric_msg):
        frame.apply(identity, engine="numba", axis=apply_axis)

    # pyarrow-backed column "c" alone is reported as an extension array
    extension_msg = (
        "Column c is backed by an extension array, "
        "which is not supported by the numba engine."
    )
    only_c = frame["c"].to_frame()
    with pytest.raises(ValueError, match=extension_msg):
        only_c.apply(identity, engine="numba", axis=apply_axis)

View File

@ -0,0 +1,701 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
concat,
date_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.tests.apply.common import series_transform_kernels
@pytest.fixture(params=[False, "compat"])
def by_row(request):
    """Fixture yielding each ``by_row`` value (``False``, then ``"compat"``)."""
    mode = request.param
    return mode
def test_series_map_box_timedelta(by_row):
# GH#11349
ser = Series(timedelta_range("1 day 1 s", periods=3, freq="h"))
def f(x):
return x.total_seconds() if by_row else x.dt.total_seconds()
result = ser.apply(f, by_row=by_row)
expected = ser.map(lambda x: x.total_seconds())
tm.assert_series_equal(result, expected)
expected = Series([86401.0, 90001.0, 93601.0])
tm.assert_series_equal(result, expected)
def test_apply(datetime_series, by_row):
result = datetime_series.apply(np.sqrt, by_row=by_row)
with np.errstate(all="ignore"):
expected = np.sqrt(datetime_series)
tm.assert_series_equal(result, expected)
# element-wise apply (ufunc)
result = datetime_series.apply(np.exp, by_row=by_row)
expected = np.exp(datetime_series)
tm.assert_series_equal(result, expected)
# empty series
s = Series(dtype=object, name="foo", index=Index([], name="bar"))
rs = s.apply(lambda x: x, by_row=by_row)
tm.assert_series_equal(s, rs)
# check all metadata (GH 9322)
assert s is not rs
assert s.index is rs.index
assert s.dtype == rs.dtype
assert s.name == rs.name
# index but no data
s = Series(index=[1, 2, 3], dtype=np.float64)
rs = s.apply(lambda x: x, by_row=by_row)
tm.assert_series_equal(s, rs)
def test_apply_map_same_length_inference_bug():
s = Series([1, 2])
def f(x):
return (x, x + 1)
result = s.apply(f, by_row="compat")
expected = s.map(f)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("convert_dtype", [True, False])
def test_apply_convert_dtype_deprecated(convert_dtype):
    """Passing ``convert_dtype`` to Series.apply emits a FutureWarning."""

    def func(x):
        if x > 0:
            return x
        return np.nan

    rng = np.random.default_rng(2)
    ser = Series(rng.standard_normal(10))
    with tm.assert_produces_warning(FutureWarning):
        ser.apply(func, convert_dtype=convert_dtype, by_row="compat")
def test_apply_args():
    """Positional ``args`` are forwarded to the applied function."""
    ser = Series(["foo,bar"])
    result = ser.apply(str.split, args=(",",))
    first = result[0]
    assert first == ["foo", "bar"]
    assert isinstance(first, list)
@pytest.mark.parametrize(
    "args, kwargs, increment",
    [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)],
)
def test_agg_args(args, kwargs, increment):
    # GH 43357
    def f(x, a=0, b=0, c=0):
        return x + a + 10 * b + 100 * c

    ser = Series([1, 2])
    expected = ser + increment
    msg = (
        "in Series.agg cannot aggregate and has been deprecated. "
        "Use Series.transform to keep behavior unchanged."
    )
    # The leading 0 is the axis; remaining positionals/keywords reach ``f``.
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = ser.agg(f, 0, *args, **kwargs)
    tm.assert_series_equal(result, expected)
def test_agg_mapping_func_deprecated():
# GH 53325
s = Series([1, 2, 3])
def foo1(x, a=1, c=0):
return x + a + c
def foo2(x, b=2, c=0):
return x + b + c
msg = "using .+ in Series.agg cannot aggregate and"
with tm.assert_produces_warning(FutureWarning, match=msg):
s.agg(foo1, 0, 3, c=4)
with tm.assert_produces_warning(FutureWarning, match=msg):
s.agg([foo1, foo2], 0, 3, c=4)
with tm.assert_produces_warning(FutureWarning, match=msg):
s.agg({"a": foo1, "b": foo2}, 0, 3, c=4)
def test_series_apply_map_box_timestamps(by_row):
# GH#2689, GH#2627
ser = Series(date_range("1/1/2000", periods=10))
def func(x):
return (x.hour, x.day, x.month)
if not by_row:
msg = "Series' object has no attribute 'hour'"
with pytest.raises(AttributeError, match=msg):
ser.apply(func, by_row=by_row)
return
result = ser.apply(func, by_row=by_row)
expected = ser.map(func)
tm.assert_series_equal(result, expected)
def test_apply_box_dt64():
# ufunc will not be boxed. Same test cases as the test_map_box
vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
ser = Series(vals, dtype="M8[ns]")
assert ser.dtype == "datetime64[ns]"
# boxed value must be Timestamp instance
res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
tm.assert_series_equal(res, exp)
def test_apply_box_dt64tz():
vals = [
pd.Timestamp("2011-01-01", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
]
ser = Series(vals, dtype="M8[ns, US/Eastern]")
assert ser.dtype == "datetime64[ns, US/Eastern]"
res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
tm.assert_series_equal(res, exp)
def test_apply_box_td64():
# timedelta
vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
ser = Series(vals)
assert ser.dtype == "timedelta64[ns]"
res = ser.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat")
exp = Series(["Timedelta_1", "Timedelta_2"])
tm.assert_series_equal(res, exp)
def test_apply_box_period():
# period
vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
ser = Series(vals)
assert ser.dtype == "Period[M]"
res = ser.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat")
exp = Series(["Period_M", "Period_M"])
tm.assert_series_equal(res, exp)
def test_apply_datetimetz(by_row):
values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo")
s = Series(values, name="XX")
result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row)
exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize(
"Asia/Tokyo"
)
exp = Series(exp_values, name="XX")
tm.assert_series_equal(result, exp)
result = s.apply(lambda x: x.hour if by_row else x.dt.hour, by_row=by_row)
exp = Series(list(range(24)) + [0], name="XX", dtype="int64" if by_row else "int32")
tm.assert_series_equal(result, exp)
# not vectorized
def f(x):
return str(x.tz) if by_row else str(x.dt.tz)
result = s.apply(f, by_row=by_row)
if by_row:
exp = Series(["Asia/Tokyo"] * 25, name="XX")
tm.assert_series_equal(result, exp)
else:
assert result == "Asia/Tokyo"
def test_apply_categorical(by_row, using_infer_string):
values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
ser = Series(values, name="XX", index=list("abcdefg"))
if not by_row:
msg = "Series' object has no attribute 'lower"
with pytest.raises(AttributeError, match=msg):
ser.apply(lambda x: x.lower(), by_row=by_row)
assert ser.apply(lambda x: "A", by_row=by_row) == "A"
return
result = ser.apply(lambda x: x.lower(), by_row=by_row)
# should be categorical dtype when the number of categories are
# the same
values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
exp = Series(values, name="XX", index=list("abcdefg"))
tm.assert_series_equal(result, exp)
tm.assert_categorical_equal(result.values, exp.values)
result = ser.apply(lambda x: "A")
exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
tm.assert_series_equal(result, exp)
assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]"
@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])
def test_apply_categorical_with_nan_values(series, by_row):
    # GH 20714 bug fixed in: GH 24275
    ser = Series(series, dtype="category")

    def prefix(x):
        return x.split("-")[0]

    if by_row:
        result = ser.apply(prefix, by_row=by_row).astype(object)
        expected = Series(["1", "1", np.nan], dtype="category").astype(object)
        tm.assert_series_equal(result, expected)
    else:
        # the whole Series is passed and has no ``split`` method
        msg = "'Series' object has no attribute 'split'"
        with pytest.raises(AttributeError, match=msg):
            ser.apply(prefix, by_row=by_row)
def test_apply_empty_integer_series_with_datetime_index(by_row):
# GH 21245
s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int)
result = s.apply(lambda x: x, by_row=by_row)
tm.assert_series_equal(result, s)
def test_apply_dataframe_iloc():
    """Values looked up positionally from a uint64 frame keep the uint64 dtype."""
    numbers = DataFrame(np.uint64([1, 2, 3, 4, 5]), columns=["Numbers"])
    positions = DataFrame([2, 3, 2, 1, 2], columns=["Indices"])

    def retrieve(targetRow, targetDF):
        return targetDF["Numbers"].iloc[targetRow]

    expected = Series([3, 4, 3, 2, 3], name="Indices", dtype="uint64")
    result = positions["Indices"].apply(retrieve, args=(numbers,))
    tm.assert_series_equal(result, expected)
def test_transform(string_series, by_row):
# transforming functions
with np.errstate(all="ignore"):
f_sqrt = np.sqrt(string_series)
f_abs = np.abs(string_series)
# ufunc
result = string_series.apply(np.sqrt, by_row=by_row)
expected = f_sqrt.copy()
tm.assert_series_equal(result, expected)
# list-like
result = string_series.apply([np.sqrt], by_row=by_row)
expected = f_sqrt.to_frame().copy()
expected.columns = ["sqrt"]
tm.assert_frame_equal(result, expected)
result = string_series.apply(["sqrt"], by_row=by_row)
tm.assert_frame_equal(result, expected)
# multiple items in list
# these are in the order as if we are applying both functions per
# series and then concatting
expected = concat([f_sqrt, f_abs], axis=1)
expected.columns = ["sqrt", "absolute"]
result = string_series.apply([np.sqrt, np.abs], by_row=by_row)
tm.assert_frame_equal(result, expected)
# dict, provide renaming
expected = concat([f_sqrt, f_abs], axis=1)
expected.columns = ["foo", "bar"]
expected = expected.unstack().rename("series")
result = string_series.apply({"foo": np.sqrt, "bar": np.abs}, by_row=by_row)
tm.assert_series_equal(result.reindex_like(expected), expected)
@pytest.mark.parametrize("op", series_transform_kernels)
def test_transform_partial_failure(op, request):
    # GH 35964
    if op in ("ffill", "bfill", "pad", "backfill", "shift"):
        request.applymarker(
            pytest.mark.xfail(reason=f"{op} is successful on any dtype")
        )

    # Using object makes most transform kernels fail
    ser = Series(3 * [object])

    if op in ("fillna", "ngroup"):
        error, msg = ValueError, "Transform function failed"
    else:
        error = TypeError
        patterns = [
            "not supported between instances of 'type' and 'type'",
            "unsupported operand type",
        ]
        msg = "|".join(patterns)

    # Mix the failing kernel with "shift" in every list/dict layout.
    layouts = (
        [op, "shift"],
        {"A": op, "B": "shift"},
        {"A": [op], "B": ["shift"]},
        {"A": [op, "shift"], "B": [op]},
    )
    for funcs in layouts:
        with pytest.raises(error, match=msg):
            ser.transform(funcs)
def test_transform_partial_failure_valueerror():
    # GH 40211
    def noop(x):
        return x

    def raising_op(_):
        raise ValueError

    ser = Series(3 * [object])
    msg = "Transform function failed"

    # One raising function in any list/dict layout fails the whole transform.
    layouts = (
        [noop, raising_op],
        {"A": raising_op, "B": noop},
        {"A": [raising_op], "B": [noop]},
        {"A": [noop, raising_op], "B": [noop]},
    )
    for funcs in layouts:
        with pytest.raises(ValueError, match=msg):
            ser.transform(funcs)
def test_demo():
    # demonstration tests
    ser = Series(range(6), dtype="int64", name="series")

    # list of reducers -> Series indexed by the reducer names
    by_list = ser.agg(["min", "max"])
    tm.assert_series_equal(
        by_list, Series([0, 5], index=["min", "max"], name="series")
    )

    # dict of reducers -> Series indexed by the dict keys
    by_dict = ser.agg({"foo": "min"})
    tm.assert_series_equal(by_dict, Series([0], index=["foo"], name="series"))
@pytest.mark.parametrize("func", [str, lambda x: str(x)])
def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row):
    # test that we are evaluating row-by-row first if by_row="compat"
    # else vectorized evaluation
    result = string_series.apply(func, by_row=by_row)

    if not by_row:
        # the function saw the whole Series at once
        assert result == str(string_series)
        return

    tm.assert_series_equal(result, string_series.map(func))
def test_agg_evaluate_lambdas(string_series):
# GH53325
# in the future, the result will be a Series class.
with tm.assert_produces_warning(FutureWarning):
result = string_series.agg(lambda x: type(x))
assert isinstance(result, Series) and len(result) == len(string_series)
with tm.assert_produces_warning(FutureWarning):
result = string_series.agg(type)
assert isinstance(result, Series) and len(result) == len(string_series)
@pytest.mark.parametrize("op_name", ["agg", "apply"])
def test_with_nested_series(datetime_series, op_name):
    # GH 2316
    # .agg with a reducer and a transform, what to do
    def expand(x):
        return Series([x, x**2], index=["x", "x^2"])

    msg = "cannot aggregate"
    expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2})

    # only ``agg`` warns; ``apply`` must stay silent
    warning = FutureWarning if op_name == "agg" else None
    method = getattr(datetime_series, op_name)
    with tm.assert_produces_warning(warning, match=msg):
        # GH52123
        result = method(lambda x: expand(x))
    tm.assert_frame_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = datetime_series.agg(lambda x: expand(x))
    tm.assert_frame_equal(result, expected)
def test_replicate_describe(string_series):
# this also tests a result set that is all scalars
expected = string_series.describe()
result = string_series.apply(
{
"count": "count",
"mean": "mean",
"std": "std",
"min": "min",
"25%": lambda x: x.quantile(0.25),
"50%": "median",
"75%": lambda x: x.quantile(0.75),
"max": "max",
},
)
tm.assert_series_equal(result, expected)
def test_reduce(string_series):
# reductions with named functions
result = string_series.agg(["sum", "mean"])
expected = Series(
[string_series.sum(), string_series.mean()],
["sum", "mean"],
name=string_series.name,
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "how, kwds",
    [("agg", {}), ("apply", {"by_row": "compat"}), ("apply", {"by_row": False})],
)
def test_non_callable_aggregates(how, kwds):
    # test agg using non-callable series attributes
    # GH 39116 - expand to apply
    ser = Series([1, 2, None])
    method = getattr(ser, how)

    # Calling agg w/ just a string arg same as calling s.arg
    assert method("size", **kwds) == ser.size

    # test when mixed w/ callable reducers
    expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5})
    from_list = method(["size", "count", "mean"], **kwds)
    tm.assert_series_equal(from_list, expected)

    from_dict = method({"size": "size", "count": "count", "mean": "mean"}, **kwds)
    tm.assert_series_equal(from_dict, expected)
def test_series_apply_no_suffix_index(by_row):
# GH36189
s = Series([4] * 3)
result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], by_row=by_row)
expected = Series([12, 12, 12], index=["sum", "<lambda>", "<lambda>"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "dti,exp",
    [
        (
            Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
            DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
        ),
        (
            Series(
                np.arange(10, dtype=np.float64),
                index=date_range("2020-01-01", periods=10),
                name="ts",
            ),
            DataFrame(np.repeat([[1, 2]], 10, axis=0), dtype="int64"),
        ),
    ],
)
@pytest.mark.parametrize("aware", [True, False])
def test_apply_series_on_date_time_index_aware_series(dti, exp, aware):
    # GH 25959
    # Calling apply on a localized time series should not cause an error
    source = dti.tz_localize("UTC") if aware else dti
    index = source.index
    result = Series(index).apply(lambda x: Series([1, 2]))
    tm.assert_frame_equal(result, exp)
@pytest.mark.parametrize(
    "by_row, expected", [("compat", Series(np.ones(10), dtype="int64")), (False, 1)]
)
def test_apply_scalar_on_date_time_index_aware_series(by_row, expected):
    # GH 25959
    # Calling apply on a localized time series should not cause an error
    aware_index = date_range("2020-01-01", periods=10, tz="UTC")
    series = Series(np.arange(10, dtype=np.float64), index=aware_index)
    index_as_series = Series(series.index)
    result = index_as_series.apply(lambda x: 1, by_row=by_row)
    tm.assert_equal(result, expected)
def test_apply_to_timedelta(by_row):
list_of_valid_strings = ["00:00:01", "00:00:02"]
a = pd.to_timedelta(list_of_valid_strings)
b = Series(list_of_valid_strings).apply(pd.to_timedelta, by_row=by_row)
tm.assert_series_equal(Series(a), b)
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
a = pd.to_timedelta(list_of_strings)
ser = Series(list_of_strings)
b = ser.apply(pd.to_timedelta, by_row=by_row)
tm.assert_series_equal(Series(a), b)
@pytest.mark.parametrize(
    "ops, names",
    [
        ([np.sum], ["sum"]),
        ([np.sum, np.mean], ["sum", "mean"]),
        (np.array([np.sum]), ["sum"]),
        (np.array([np.sum, np.mean]), ["sum", "mean"]),
    ],
)
@pytest.mark.parametrize(
    "how, kwargs",
    [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]],
)
def test_apply_listlike_reducer(string_series, ops, names, how, kwargs):
    """List-like reducers yield one scalar per function, labelled by name."""
    # GH 39140
    expected = Series({name: op(string_series) for name, op in zip(names, ops)})
    expected.name = "series"

    # Only ``agg`` is expected to warn for these callables.
    warn = FutureWarning if how == "agg" else None
    # Use a group for the alternation: square brackets would form a character
    # class that matches any single one of the listed letters or "|".
    alternatives = "|".join(names)
    msg = rf"using Series\.({alternatives})"
    with tm.assert_produces_warning(warn, match=msg):
        result = getattr(string_series, how)(ops, **kwargs)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "ops",
    [
        {"A": np.sum},
        {"A": np.sum, "B": np.mean},
        Series({"A": np.sum}),
        Series({"A": np.sum, "B": np.mean}),
    ],
)
@pytest.mark.parametrize(
    "how, kwargs",
    [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]],
)
def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row):
    """Dict-like reducers yield one scalar per key."""
    # GH 39140
    # NOTE(review): ``by_row`` is requested but never read in this body; the
    # mode actually used comes from ``kwargs``.
    expected = Series({name: op(string_series) for name, op in ops.items()})
    expected.name = string_series.name

    # Only ``agg`` is expected to warn for these callables.
    warn = FutureWarning if how == "agg" else None
    # Use a group for the alternation: square brackets would form a character
    # class that matches any single one of the listed letters or "|".
    msg = r"using Series\.(sum|mean)"
    with tm.assert_produces_warning(warn, match=msg):
        result = getattr(string_series, how)(ops, **kwargs)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "ops, names",
    [
        ([np.sqrt], ["sqrt"]),
        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
        (np.array([np.sqrt]), ["sqrt"]),
        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
    ],
)
def test_apply_listlike_transformer(string_series, ops, names, by_row):
    # GH 39140
    # reference: apply each op directly and place the results side by side
    with np.errstate(all="ignore"):
        columns = [op(string_series) for op in ops]
        expected = concat(columns, axis=1)
        expected.columns = names
        result = string_series.apply(ops, by_row=by_row)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "ops, expected",
    [
        ([lambda x: x], DataFrame({"<lambda>": [1, 2, 3]})),
        ([lambda x: x.sum()], Series([6], index=["<lambda>"])),
    ],
)
def test_apply_listlike_lambda(ops, expected, by_row):
    # GH53400
    # a transforming lambda gives a frame, a reducing lambda gives a Series
    applied = Series([1, 2, 3]).apply(ops, by_row=by_row)
    tm.assert_equal(applied, expected)
@pytest.mark.parametrize(
    "ops",
    [
        {"A": np.sqrt},
        {"A": np.sqrt, "B": np.exp},
        Series({"A": np.sqrt}),
        Series({"A": np.sqrt, "B": np.exp}),
    ],
)
def test_apply_dictlike_transformer(string_series, ops, by_row):
    # GH 39140
    # reference: apply each op directly and stack the pieces under their keys
    with np.errstate(all="ignore"):
        pieces = {name: op(string_series) for name, op in ops.items()}
        expected = concat(pieces)
        expected.name = string_series.name
        result = string_series.apply(ops, by_row=by_row)
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "ops, expected",
    [
        (
            {"a": lambda x: x},
            Series([1, 2, 3], index=MultiIndex.from_arrays([["a"] * 3, range(3)])),
        ),
        ({"a": lambda x: x.sum()}, Series([6], index=["a"])),
    ],
)
def test_apply_dictlike_lambda(ops, by_row, expected):
    # GH53400
    # a transforming lambda gains a key level, a reducing lambda is keyed directly
    applied = Series([1, 2, 3]).apply(ops, by_row=by_row)
    tm.assert_equal(applied, expected)
def test_apply_retains_column_name(by_row):
# GH 16380
df = DataFrame({"x": range(3)}, Index(range(3), name="x"))
result = df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y")))
expected = DataFrame(
[[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]],
columns=Index(range(3), name="y"),
index=Index(range(3), name="x"),
)
tm.assert_frame_equal(result, expected)
def test_apply_type():
    # GH 46719
    labels = ["a", "b", "c"]
    ser = Series([3, "string", float], index=labels)
    expected = Series([int, str, type], index=labels)
    tm.assert_series_equal(ser.apply(type), expected)
def test_series_apply_unpack_nested_data():
    # GH#55189
    nested = Series([[1, 2, 3], [4, 5, 6, 7]])
    # the shorter list is padded with NaN, which makes every column float
    expected = DataFrame(
        {0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}
    )
    result = nested.apply(lambda x: Series(x))
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,39 @@
import pandas as pd
import pandas._testing as tm
def test_relabel_no_duplicated_method():
# this is to test there is no duplicated method used in agg
df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
result = df["A"].agg(foo="sum")
expected = df["A"].agg({"foo": "sum"})
tm.assert_series_equal(result, expected)
result = df["B"].agg(foo="min", bar="max")
expected = df["B"].agg({"foo": "min", "bar": "max"})
tm.assert_series_equal(result, expected)
msg = "using Series.[sum|min|max]"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df["B"].agg(foo=sum, bar=min, cat="max")
msg = "using Series.[sum|min|max]"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"})
tm.assert_series_equal(result, expected)
def test_relabel_duplicated_method():
# this is to test with nested renaming, duplicated method can be used
# if they are assigned with different new names
df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
result = df["A"].agg(foo="sum", bar="sum")
expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
tm.assert_series_equal(result, expected)
msg = "using Series.min"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df["B"].agg(foo=min, bar="min")
expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,84 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "args, kwargs, increment",
    [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)],
)
def test_agg_args(args, kwargs, increment):
    # GH 43357
    def f(x, a=0, b=0, c=0):
        return x + a + 10 * b + 100 * c

    ser = Series([1, 2])
    expected = ser + increment
    # The leading 0 is the axis; remaining positionals/keywords reach ``f``.
    result = ser.transform(f, 0, *args, **kwargs)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "ops, names",
    [
        ([np.sqrt], ["sqrt"]),
        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
        (np.array([np.sqrt]), ["sqrt"]),
        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
    ],
)
def test_transform_listlike(string_series, ops, names):
    # GH 35964
    # reference: apply each op directly and place the results side by side
    with np.errstate(all="ignore"):
        columns = [op(string_series) for op in ops]
        expected = concat(columns, axis=1)
        expected.columns = names
        result = string_series.transform(ops)
        tm.assert_frame_equal(result, expected)
def test_transform_listlike_func_with_args():
    # GH 50624
    def foo1(x, a=1, c=0):
        return x + a + c

    def foo2(x, b=2, c=0):
        return x + b + c

    ser = Series([1, 2, 3])
    funcs = [foo1, foo2]

    # ``b`` is not accepted by foo1, so forwarding it to every function fails
    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
    with pytest.raises(TypeError, match=msg):
        ser.transform(funcs, 0, 3, b=3, c=4)

    # positional 3 and c=4 are accepted by both functions
    expected = DataFrame({"foo1": [8, 9, 10], "foo2": [8, 9, 10]})
    result = ser.transform(funcs, 0, 3, c=4)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [dict, Series])
def test_transform_dictlike(string_series, box):
    # GH 35964
    # reference: both ufuncs applied directly, columns labelled by the keys
    with np.errstate(all="ignore"):
        pieces = [np.sqrt(string_series), np.abs(string_series)]
        expected = concat(pieces, axis=1)
        expected.columns = ["foo", "bar"]
        funcs = box({"foo": np.sqrt, "bar": np.abs})
        result = string_series.transform(funcs)
        tm.assert_frame_equal(result, expected)
def test_transform_dictlike_mixed():
    # GH 40018 - mix of lists and non-lists in values of a dictionary
    ser = Series([1, 4])
    # columns: ("b", "sqrt"), ("b", "abs"), ("c", "sqrt")
    columns = MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)])
    expected = DataFrame([[1.0, 1, 1.0], [2.0, 4, 2.0]], columns=columns)
    result = ser.transform({"b": ["sqrt", "abs"], "c": "sqrt"})
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,326 @@
from itertools import chain
import operator
import numpy as np
import pytest
from pandas.core.dtypes.common import is_number
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.apply.common import (
frame_transform_kernels,
series_transform_kernels,
)
@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"])
@pytest.mark.parametrize(
    "args,kwds",
    [
        pytest.param([], {}, id="no_args_or_kwds"),
        pytest.param([1], {}, id="axis_from_args"),
        pytest.param([], {"axis": 1}, id="axis_from_kwds"),
        pytest.param([], {"numeric_only": True}, id="optional_kwds"),
        pytest.param([1, True], {"numeric_only": True}, id="args_and_kwds"),
    ],
)
@pytest.mark.parametrize("how", ["agg", "apply"])
def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how):
    """A reducer given by name matches calling the frame method directly."""
    has_extra_positional = len(args) > 1
    if has_extra_positional and how == "agg":
        marker = pytest.mark.xfail(
            raises=TypeError,
            reason="agg/apply signature mismatch - agg passes 2nd "
            "argument to func",
        )
        request.applymarker(marker)

    via_name = getattr(float_frame, how)(func, *args, **kwds)
    direct = getattr(float_frame, func)(*args, **kwds)
    tm.assert_series_equal(via_name, direct)
@pytest.mark.parametrize("arg", ["sum", "mean", "min", "max", "std"])
def test_with_string_args(datetime_series, arg):
    """Series.apply with a reducer name equals calling that method."""
    direct = getattr(datetime_series, arg)()
    via_name = datetime_series.apply(arg)
    assert via_name == direct
@pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
@pytest.mark.parametrize("how", ["agg", "apply"])
def test_apply_np_reducer(op, how):
    # GH 39116
    frame = DataFrame({"a": [1, 2], "b": [3, 4]})
    result = getattr(frame, how)(op)

    # pandas ddof defaults to 1, numpy to 0
    kwargs = {}
    if op in ("std", "var"):
        kwargs["ddof"] = 1
    reduced = getattr(np, op)(frame, axis=0, **kwargs)
    expected = Series(reduced, index=frame.columns)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"]
)
@pytest.mark.parametrize("how", ["transform", "apply"])
def test_apply_np_transformer(float_frame, op, how):
    # GH 39116

    # float_frame will _usually_ have negative values, which will
    # trigger the warning here, but let's put one in just to be sure
    float_frame.iloc[0, 0] = -1.0
    warn = RuntimeWarning if op in ["log", "sqrt"] else None
    method = getattr(float_frame, how)
    with tm.assert_produces_warning(warn, check_stacklevel=False):
        # float_frame fixture is defined in conftest.py, so we don't check the
        # stacklevel as otherwise the test would fail.
        result = method(op)
    numpy_func = getattr(np, op)
    expected = numpy_func(float_frame)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "series, func, expected",
    chain(
        tm.get_cython_table_params(
            Series(dtype=np.float64),
            [
                ("sum", 0),
                ("max", np.nan),
                ("min", np.nan),
                ("all", True),
                ("any", False),
                ("mean", np.nan),
                ("prod", 1),
                ("std", np.nan),
                ("var", np.nan),
                ("median", np.nan),
            ],
        ),
        tm.get_cython_table_params(
            Series([np.nan, 1, 2, 3]),
            [
                ("sum", 6),
                ("max", 3),
                ("min", 1),
                ("all", True),
                ("any", True),
                ("mean", 2),
                ("prod", 6),
                ("std", 1),
                ("var", 1),
                ("median", 2),
            ],
        ),
        tm.get_cython_table_params(
            Series("a b c".split()),
            [
                ("sum", "abc"),
                ("max", "c"),
                ("min", "a"),
                ("all", True),
                ("any", True),
            ],
        ),
    ),
)
def test_agg_cython_table_series(series, func, expected):
    # GH21224
    # test reducing functions in
    # pandas.core.base.SelectionMixin._cython_table
    # string aliases are silent; any other ``func`` is expected to warn
    warn = FutureWarning if not isinstance(func, str) else None
    with tm.assert_produces_warning(warn, match="is currently using Series.*"):
        result = series.agg(func)

    if not is_number(expected):
        assert result == expected
    else:
        # numeric results are compared approximately, with NaN equal to NaN
        assert np.isclose(result, expected, equal_nan=True)
@pytest.mark.parametrize(
    "series, func, expected",
    chain(
        tm.get_cython_table_params(
            Series(dtype=np.float64),
            [
                ("cumprod", Series([], dtype=np.float64)),
                ("cumsum", Series([], dtype=np.float64)),
            ],
        ),
        tm.get_cython_table_params(
            Series([np.nan, 1, 2, 3]),
            [
                ("cumprod", Series([np.nan, 1, 2, 6])),
                ("cumsum", Series([np.nan, 1, 3, 6])),
            ],
        ),
        tm.get_cython_table_params(
            Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))]
        ),
    ),
)
def test_agg_cython_table_transform_series(series, func, expected):
    # GH21224
    # test transforming functions in
    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
    # string aliases are silent; any other ``func`` is expected to warn
    warn = FutureWarning if not isinstance(func, str) else None
    with tm.assert_produces_warning(warn, match="is currently using Series.*"):
        transformed = series.agg(func)
    tm.assert_series_equal(transformed, expected)
@pytest.mark.parametrize(
    "df, func, expected",
    chain(
        tm.get_cython_table_params(
            DataFrame(),
            [
                ("sum", Series(dtype="float64")),
                ("max", Series(dtype="float64")),
                ("min", Series(dtype="float64")),
                ("all", Series(dtype=bool)),
                ("any", Series(dtype=bool)),
                ("mean", Series(dtype="float64")),
                ("prod", Series(dtype="float64")),
                ("std", Series(dtype="float64")),
                ("var", Series(dtype="float64")),
                ("median", Series(dtype="float64")),
            ],
        ),
        tm.get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]),
            [
                ("sum", Series([1.0, 3])),
                ("max", Series([1.0, 2])),
                ("min", Series([1.0, 1])),
                ("all", Series([True, True])),
                ("any", Series([True, True])),
                ("mean", Series([1, 1.5])),
                ("prod", Series([1.0, 2])),
                ("std", Series([np.nan, 0.707107])),
                ("var", Series([np.nan, 0.5])),
                ("median", Series([1, 1.5])),
            ],
        ),
    ),
)
def test_agg_cython_table_frame(df, func, expected, axis):
    # GH 21224
    # test reducing functions in
    # pandas.core.base.SelectionMixin._cython_table
    # string aliases are silent; any other ``func`` is expected to warn
    warn = FutureWarning if not isinstance(func, str) else None
    with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"):
        # GH#53425
        reduced = df.agg(func, axis=axis)
    tm.assert_series_equal(reduced, expected)
@pytest.mark.parametrize(
    "df, func, expected",
    chain(
        tm.get_cython_table_params(
            DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
        ),
        tm.get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]),
            [
                ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
                ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
            ],
        ),
    ),
)
def test_agg_cython_table_transform_frame(df, func, expected, axis):
    # GH 21224
    # test transforming functions in
    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
    column_wise = axis in ("columns", 1)
    if column_wise:
        # operating blockwise doesn't let us preserve dtypes
        expected = expected.astype("float64")

    # string aliases are silent; any other ``func`` is expected to warn
    warn = FutureWarning if not isinstance(func, str) else None
    with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"):
        # GH#53425
        transformed = df.agg(func, axis=axis)
    tm.assert_frame_equal(transformed, expected)
@pytest.mark.parametrize("op", series_transform_kernels)
def test_transform_groupby_kernel_series(request, string_series, op):
    # GH 35964
    if op == "ngroup":
        marker = pytest.mark.xfail(
            raises=ValueError, reason="ngroup not valid for NDFrame"
        )
        request.applymarker(marker)

    is_fillna = op == "fillna"
    args = [0.0] if is_fillna else []
    warn = FutureWarning if is_fillna else None
    msg = "SeriesGroupBy.fillna is deprecated"

    # a single group covering every row serves as the groupby reference
    single_group = np.ones(string_series.shape[0])
    with tm.assert_produces_warning(warn, match=msg):
        expected = string_series.groupby(single_group).transform(op, *args)

    result = string_series.transform(op, 0, *args)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("op", frame_transform_kernels)
def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
    if op == "ngroup":
        marker = pytest.mark.xfail(
            raises=ValueError, reason="ngroup not valid for NDFrame"
        )
        request.applymarker(marker)

    # GH 35964
    is_fillna = op == "fillna"
    args = [0.0] if is_fillna else []
    warn = FutureWarning if is_fillna else None
    op_msg = "DataFrameGroupBy.fillna is deprecated"

    along_index = axis in (0, "index")
    if along_index:
        msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    else:
        msg = "DataFrame.groupby with axis=1 is deprecated"

    def check_matches_single_group():
        # one group spanning the whole grouped axis is the groupby reference
        length = float_frame.shape[0] if along_index else float_frame.shape[1]
        ones = np.ones(length)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            gb = float_frame.groupby(ones, axis=axis)
        with tm.assert_produces_warning(warn, match=op_msg):
            expected = gb.transform(op, *args)
        result = float_frame.transform(op, axis, *args)
        tm.assert_frame_equal(result, expected)

    check_matches_single_group()

    # same thing, but ensuring we have multiple blocks
    assert "E" not in float_frame.columns
    float_frame["E"] = float_frame["A"].copy()
    assert len(float_frame._mgr.arrays) > 1

    check_matches_single_group()
@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
def test_transform_method_name(method):
    # GH 19760
    df = DataFrame({"A": [-1, 2]})
    # calling the method by name is the reference for transform(name)
    call_by_name = operator.methodcaller(method)
    expected = call_by_name(df)
    result = df.transform(method)
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,155 @@
"""
Assertion helpers for arithmetic tests.
"""
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
array,
)
import pandas._testing as tm
from pandas.core.arrays import (
BooleanArray,
NumpyExtensionArray,
)
def assert_cannot_add(left, right, msg="cannot add"):
    """
    Assert that adding `left` and `right` raises TypeError in either order.

    Parameters
    ----------
    left : object
    right : object
    msg : str, default "cannot add"
        Pattern the TypeError message must match.
    """
    for first, second in ((left, right), (right, left)):
        with pytest.raises(TypeError, match=msg):
            first + second
def assert_invalid_addsub_type(left, right, msg=None):
    """
    Assert that `left` and `right` raise TypeError for ``+`` and ``-``.

    Both operand orders are checked for each operator.

    Parameters
    ----------
    left : object
    right : object
    msg : str or None, default None
        Pattern the TypeError message must match; None skips the match.
    """
    operand_orders = ((left, right), (right, left))

    for first, second in operand_orders:
        with pytest.raises(TypeError, match=msg):
            first + second

    for first, second in operand_orders:
        with pytest.raises(TypeError, match=msg):
            first - second
def get_upcast_box(left, right, is_cmp: bool = False):
    """
    Pick the box to wrap 'expected' in for an arithmetic or comparison op.

    Parameters
    ----------
    left : Any
    right : Any
    is_cmp : bool, default False
        Whether the operation is a comparison method.

    Returns
    -------
    callable
        One of DataFrame, Series, Index, np.array or tm.to_array.
    """
    operands = (left, right)

    if any(isinstance(obj, DataFrame) for obj in operands):
        return DataFrame

    if any(isinstance(obj, Series) for obj in operands):
        # Index does not defer for comparisons
        return np.array if is_cmp and isinstance(left, Index) else Series

    if any(isinstance(obj, Index) for obj in operands):
        return np.array if is_cmp else Index

    return tm.to_array
def assert_invalid_comparison(left, right, box):
    """
    Assert that comparisons between mismatched types behave correctly.

    ``==`` is expected to be all-False and ``!=`` all-True in both operand
    orders, while the four ordering comparisons are expected to raise
    TypeError in both operand orders.

    Parameters
    ----------
    left : np.ndarray, ExtensionArray, Index, or Series
    right : object
    box : {pd.DataFrame, pd.Series, pd.Index, pd.array, tm.to_array}
    """
    # Not for tznaive-tzaware comparison

    # Note: not quite the same as how we do this for tm.box_expected
    xbox = np.array if box in [Index, array] else box

    def xbox2(x):
        # Eventually we'd like this to be tighter, but for now we'll
        #  just exclude NumpyExtensionArray[bool]
        if isinstance(x, NumpyExtensionArray):
            return x._ndarray
        if isinstance(x, BooleanArray):
            # NB: we are assuming no pd.NAs for now
            return x.astype(bool)
        return x

    # rev_box: box to use for reversed comparisons
    if isinstance(right, Index) and isinstance(left, Series):
        rev_box = np.array
    else:
        rev_box = xbox

    forward_eq = xbox2(left == right)
    expected = xbox(np.zeros(forward_eq.shape, dtype=np.bool_))
    tm.assert_equal(forward_eq, expected)

    reversed_eq = xbox2(right == left)
    tm.assert_equal(reversed_eq, rev_box(expected))

    forward_ne = xbox2(left != right)
    tm.assert_equal(forward_ne, ~expected)

    reversed_ne = xbox2(right != left)
    tm.assert_equal(reversed_ne, rev_box(~expected))

    msg = "|".join(
        [
            "Invalid comparison between",
            "Cannot compare type",
            "not supported between",
            "invalid type promotion",
            (
                # GH#36706 npdev 1.20.0 2020-09-28
                r"The DTypes <class 'numpy.dtype\[datetime64\]'> and "
                r"<class 'numpy.dtype\[int64\]'> do not have a common DType. "
                "For example they cannot be stored in a single array unless the "
                "dtype is `object`."
            ),
        ]
    )

    ordering_ops = (
        lambda a, b: a < b,
        lambda a, b: a <= b,
        lambda a, b: a > b,
        lambda a, b: a >= b,
    )
    for first, second in ((left, right), (right, left)):
        for compare in ordering_ops:
            with pytest.raises(TypeError, match=msg):
                compare(first, second)

View File

@ -0,0 +1,139 @@
import numpy as np
import pytest
import pandas as pd
from pandas import Index
@pytest.fixture(
    params=[
        1,
        np.array(1, dtype=np.int64),
    ]
)
def one(request):
    """
    Several variants of integer value 1. The zero-dim integer array
    behaves like an integer.

    This fixture can be used to check that datetimelike indexes handle
    addition and subtraction of integers and zero-dimensional arrays
    of integers.

    Examples
    --------
    >>> dti = pd.date_range('2016-01-01', periods=2, freq='h')
    >>> dti
    DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'],
    dtype='datetime64[ns]', freq='h')
    >>> dti + one
    DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'],
    dtype='datetime64[ns]', freq='h')
    """
    return request.param
# Parameter values for the ``zero`` fixture, assembled in this order:
# length-5 vectors of 0 for every box/dtype pair, length-5 vectors of -0.0,
# zero-dimensional arrays of 0 and of -0.0, and finally plain Python scalars.
zeros = (
    [
        box_cls([0] * 5, dtype=dtype)
        for box_cls in [Index, np.array, pd.array]
        for dtype in [np.int64, np.uint64, np.float64]
    ]
    + [box_cls([-0.0] * 5, dtype=np.float64) for box_cls in [Index, np.array]]
    + [np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]
    + [np.array(-0.0, dtype=np.float64)]
    + [0, 0.0, -0.0]
)
@pytest.fixture(params=zeros)
def zero(request):
    """
    Several types of scalar zeros and length 5 vectors of zeros.

    This fixture can be used to check that numeric-dtype indexes handle
    division by any zero numeric-dtype.

    Uses vector of length 5 for broadcasting with `numeric_idx` fixture,
    which creates numeric-dtype vectors also of length 5.

    Examples
    --------
    >>> arr = RangeIndex(5)
    >>> arr / zero
    Index([nan, inf, inf, inf, inf], dtype='float64')
    """
    return request.param
# ------------------------------------------------------------------
# Scalar Fixtures
@pytest.fixture(
    params=[
        pd.Timedelta("10m7s").to_pytimedelta(),
        pd.Timedelta("10m7s"),
        pd.Timedelta("10m7s").to_timedelta64(),
    ],
    ids=lambda x: type(x).__name__,
)
def scalar_td(request):
    """
    Timedelta scalars representing 10 minutes and 7 seconds.

    The same duration is supplied as the result of ``to_pytimedelta()``,
    as a ``pd.Timedelta`` and as the result of ``to_timedelta64()``; test
    ids are the type name of each parameter.
    """
    return request.param
@pytest.fixture(
    params=[
        pd.offsets.Day(3),
        pd.offsets.Hour(72),
        pd.Timedelta(days=3).to_pytimedelta(),
        pd.Timedelta("72:00:00"),
        np.timedelta64(3, "D"),
        np.timedelta64(72, "h"),
    ],
    ids=lambda x: type(x).__name__,
)
def three_days(request):
    """
    Timedelta-like and DateOffset objects that each represent a 3-day
    timedelta.

    Test ids are the type name of each parameter.
    """
    return request.param
@pytest.fixture(
    params=[
        pd.offsets.Hour(2),
        pd.offsets.Minute(120),
        pd.Timedelta(hours=2).to_pytimedelta(),
        pd.Timedelta(seconds=2 * 3600),
        np.timedelta64(2, "h"),
        np.timedelta64(120, "m"),
    ],
    ids=lambda x: type(x).__name__,
)
def two_hours(request):
    """
    Timedelta-like and DateOffset objects that each represent a 2-hour
    timedelta.

    Test ids are the type name of each parameter.
    """
    return request.param
# DateOffset instances appended to the parameters of ``not_daily`` below.
_common_mismatch = [
    pd.offsets.YearBegin(2),
    pd.offsets.MonthBegin(1),
    pd.offsets.Minute(),
]


@pytest.fixture(
    params=[
        np.timedelta64(4, "h"),
        pd.Timedelta(hours=23).to_pytimedelta(),
        pd.Timedelta("23:00:00"),
        *_common_mismatch,
    ]
)
def not_daily(request):
    """
    Several timedelta-like and DateOffset instances that are _not_
    compatible with Daily frequencies.
    """
    return request.param

View File

@ -0,0 +1,39 @@
import operator
import numpy as np
import pytest
import pandas._testing as tm
from pandas.core.ops.array_ops import (
comparison_op,
na_logical_op,
)
def test_na_logical_op_2d():
    """
    ``na_logical_op`` handles a 2D int/object pair that plain ``|`` rejects.
    """
    int_values = np.arange(8).reshape(4, 2)
    obj_values = int_values.astype(object)
    obj_values[0, 0] = np.nan

    # Check that we fall back to the vec_binop branch
    with pytest.raises(TypeError, match="unsupported operand type"):
        int_values | obj_values

    result = na_logical_op(int_values, obj_values, operator.or_)
    tm.assert_numpy_array_equal(result, obj_values)
def test_object_comparison_2d():
    """
    ``comparison_op`` on 2D object arrays, including a read-only operand.
    """
    grid = np.arange(9).reshape(3, 3).astype(object)
    transposed = grid.T
    on_diagonal = np.eye(3).astype(bool)

    equal = comparison_op(grid, transposed, operator.eq)
    tm.assert_numpy_array_equal(equal, on_diagonal)

    # Ensure that cython doesn't raise on non-writeable arg, which
    #  we can get from np.broadcast_to
    transposed.flags.writeable = False
    not_equal = comparison_op(grid, transposed, operator.ne)
    tm.assert_numpy_array_equal(not_equal, ~on_diagonal)

View File

@ -0,0 +1,25 @@
import numpy as np
from pandas import (
Categorical,
Series,
)
import pandas._testing as tm
class TestCategoricalComparisons:
    """Equality comparisons on Series that hold categorical data."""

    def test_categorical_nan_equality(self):
        """A categorical Series equals itself except at the NaN position."""
        ser = Series(Categorical(["a", "b", "c", np.nan]))
        result = ser == ser
        tm.assert_series_equal(result, Series([True, True, True, False]))

    def test_categorical_tuple_equality(self):
        """Comparing against a tuple gives the same mask before and after
        casting the Series to category."""
        # GH 18050
        ser = Series([(0, 0), (0, 1), (0, 0), (1, 0), (1, 1)])
        expected = Series([True, False, True, False, False])

        for candidate in (ser, ser.astype("category")):
            result = candidate == (0, 0)
            tm.assert_series_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,306 @@
import operator
import numpy as np
import pytest
from pandas.core.dtypes.common import is_list_like
import pandas as pd
from pandas import (
Categorical,
Index,
Interval,
IntervalIndex,
Period,
Series,
Timedelta,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
BooleanArray,
IntervalArray,
)
from pandas.tests.arithmetic.common import get_upcast_box
@pytest.fixture(
    params=[
        (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])),
        (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])),
        (
            timedelta_range("0 days", periods=3).insert(3, pd.NaT),
            timedelta_range("1 day", periods=3).insert(3, pd.NaT),
        ),
        (
            date_range("20170101", periods=3).insert(3, pd.NaT),
            date_range("20170102", periods=3).insert(3, pd.NaT),
        ),
        (
            date_range("20170101", periods=3, tz="US/Eastern").insert(3, pd.NaT),
            date_range("20170102", periods=3, tz="US/Eastern").insert(3, pd.NaT),
        ),
    ],
    ids=lambda x: str(x[0].dtype),
)
def left_right_dtypes(request):
    """
    Fixture yielding a (left, right) pair of endpoint indexes for building
    an IntervalArray from various dtypes.

    Test ids are the dtype of the left-hand index.
    """
    return request.param
@pytest.fixture
def interval_array(left_right_dtypes):
    """
    Fixture to generate an IntervalArray of various dtypes containing NA if possible
    """
    return IntervalArray.from_arrays(*left_right_dtypes)
def create_categorical_intervals(left, right, closed="right"):
    """Build a Categorical of intervals from left and right endpoints."""
    intervals = IntervalIndex.from_arrays(left, right, closed)
    return Categorical(intervals)
def create_series_intervals(left, right, closed="right"):
    """Build a Series backed by an IntervalArray from left and right endpoints."""
    intervals = IntervalArray.from_arrays(left, right, closed)
    return Series(intervals)
def create_series_categorical_intervals(left, right, closed="right"):
    """Build a Series holding a Categorical of intervals from the endpoints."""
    intervals = IntervalIndex.from_arrays(left, right, closed)
    categorical = Categorical(intervals)
    return Series(categorical)
class TestComparison:
    """
    Tests for ``==`` and ``!=`` between interval containers and other objects.
    """

    @pytest.fixture(params=[operator.eq, operator.ne])
    def op(self, request):
        """
        Fixture for the comparison operator under test (``eq`` or ``ne``).
        """
        return request.param

    @pytest.fixture(
        params=[
            IntervalArray.from_arrays,
            IntervalIndex.from_arrays,
            create_categorical_intervals,
            create_series_intervals,
            create_series_categorical_intervals,
        ],
        ids=[
            "IntervalArray",
            "IntervalIndex",
            "Categorical[Interval]",
            "Series[Interval]",
            "Series[Categorical[Interval]]",
        ],
    )
    def interval_constructor(self, request):
        """
        Fixture for all pandas native interval constructors.
        To be used as the LHS of IntervalArray comparisons.
        """
        return request.param

    def elementwise_comparison(self, op, interval_array, other):
        """
        Helper that applies `op` pairwise to `interval_array` and `other`.

        A non-list-like `other` is repeated once per element of
        `interval_array`. The result is an ndarray, or a Series carrying
        `other`'s index when `other` is a Series.
        """
        if not is_list_like(other):
            other = [other] * len(interval_array)
        compared = np.array(
            [op(elem, item) for elem, item in zip(interval_array, other)]
        )
        if not isinstance(other, Series):
            return compared
        return Series(compared, index=other.index)

    def test_compare_scalar_interval(self, op, interval_array):
        """Comparison with an Interval scalar matches the elementwise result."""
        # matches first interval
        first = interval_array[0]
        result = op(interval_array, first)
        expected = self.elementwise_comparison(op, interval_array, first)
        tm.assert_numpy_array_equal(result, expected)

        # matches on a single endpoint but not both
        partial = Interval(interval_array.left[0], interval_array.right[1])
        result = op(interval_array, partial)
        expected = self.elementwise_comparison(op, interval_array, partial)
        tm.assert_numpy_array_equal(result, expected)

    def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed):
        """Interval scalar comparison across every pairing of `closed` values."""
        arr = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed)
        scalar = Interval(0, 1, closed=other_closed)

        result = op(arr, scalar)
        expected = self.elementwise_comparison(op, arr, scalar)
        tm.assert_numpy_array_equal(result, expected)

    def test_compare_scalar_na(self, op, interval_array, nulls_fixture, box_with_array):
        """Comparison of a boxed IntervalArray with a null scalar, both orders."""
        box = box_with_array
        obj = tm.box_expected(interval_array, box)
        result = op(obj, nulls_fixture)

        is_pd_na = nulls_fixture is pd.NA
        if is_pd_na:
            # GH#31882
            all_true = np.ones(interval_array.shape, dtype=bool)
            expected = BooleanArray(all_true, all_true)
        else:
            expected = self.elementwise_comparison(op, interval_array, nulls_fixture)

        if box is not Index or not is_pd_na:
            # don't cast expected from BooleanArray to ndarray[object]
            xbox = get_upcast_box(obj, nulls_fixture, True)
            expected = tm.box_expected(expected, xbox)

        tm.assert_equal(result, expected)

        reversed_result = op(nulls_fixture, obj)
        tm.assert_equal(reversed_result, expected)

    @pytest.mark.parametrize(
        "other",
        [
            0,
            1.0,
            True,
            "foo",
            Timestamp("2017-01-01"),
            Timestamp("2017-01-01", tz="US/Eastern"),
            Timedelta("0 days"),
            Period("2017-01-01", "D"),
        ],
    )
    def test_compare_scalar_other(self, op, interval_array, other):
        """Comparison with non-interval scalars matches the elementwise result."""
        expected = self.elementwise_comparison(op, interval_array, other)
        result = op(interval_array, other)
        tm.assert_numpy_array_equal(result, expected)

    def test_compare_list_like_interval(self, op, interval_array, interval_constructor):
        """Comparison with list-likes of intervals from each native constructor."""
        endpoint_pairs = [
            # same endpoints
            (interval_array.left, interval_array.right),
            # different endpoints
            (interval_array.left[::-1], interval_array.right[::-1]),
            # all nan endpoints
            ([np.nan] * 4, [np.nan] * 4),
        ]
        for left, right in endpoint_pairs:
            other = interval_constructor(left, right)
            result = op(interval_array, other)
            expected = self.elementwise_comparison(op, interval_array, other)
            tm.assert_equal(result, expected)

    def test_compare_list_like_interval_mixed_closed(
        self, op, interval_constructor, closed, other_closed
    ):
        """List-like interval comparison across every pairing of `closed` values."""
        arr = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed)
        other = interval_constructor(range(2), range(1, 3), closed=other_closed)

        result = op(arr, other)
        expected = self.elementwise_comparison(op, arr, other)
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "other",
        [
            (
                Interval(0, 1),
                Interval(Timedelta("1 day"), Timedelta("2 days")),
                Interval(4, 5, "both"),
                Interval(10, 20, "neither"),
            ),
            (0, 1.5, Timestamp("20170103"), np.nan),
            (
                Timestamp("20170102", tz="US/Eastern"),
                Timedelta("2 days"),
                "baz",
                pd.NaT,
            ),
        ],
    )
    def test_compare_list_like_object(self, op, interval_array, other):
        """Comparison with tuples of mixed objects matches the elementwise result."""
        expected = self.elementwise_comparison(op, interval_array, other)
        result = op(interval_array, other)
        tm.assert_numpy_array_equal(result, expected)

    def test_compare_list_like_nan(self, op, interval_array, nulls_fixture):
        """Comparison with a list of four null values."""
        nulls = [nulls_fixture] * 4
        result = op(interval_array, nulls)
        expected = self.elementwise_comparison(op, interval_array, nulls)

        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "other",
        [
            np.arange(4, dtype="int64"),
            np.arange(4, dtype="float64"),
            date_range("2017-01-01", periods=4),
            date_range("2017-01-01", periods=4, tz="US/Eastern"),
            timedelta_range("0 days", periods=4),
            period_range("2017-01-01", periods=4, freq="D"),
            Categorical(list("abab")),
            Categorical(date_range("2017-01-01", periods=4)),
            pd.array(list("abcd")),
            pd.array(["foo", 3.14, None, object()], dtype=object),
        ],
        ids=lambda x: str(x.dtype),
    )
    def test_compare_list_like_other(self, op, interval_array, other):
        """Comparison with non-interval array-likes of length four."""
        expected = self.elementwise_comparison(op, interval_array, other)
        result = op(interval_array, other)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("length", [1, 3, 5])
    @pytest.mark.parametrize("other_constructor", [IntervalArray, list])
    def test_compare_length_mismatch_errors(self, op, other_constructor, length):
        """Comparing against a list-like of another length raises ValueError."""
        arr = IntervalArray.from_arrays(range(4), range(1, 5))
        mismatched = other_constructor([Interval(0, 1)] * length)
        with pytest.raises(ValueError, match="Lengths must match to compare"):
            op(arr, mismatched)

    @pytest.mark.parametrize(
        "constructor, expected_type, assert_func",
        [
            (IntervalIndex, np.array, tm.assert_numpy_array_equal),
            (Series, Series, tm.assert_series_equal),
        ],
    )
    def test_index_series_compat(self, op, constructor, expected_type, assert_func):
        """IntervalIndex and Series comparisons agree with the elementwise result."""
        # IntervalIndex/Series that rely on IntervalArray for comparisons
        breaks = range(4)
        index = constructor(IntervalIndex.from_breaks(breaks))

        others = [
            # scalar comparisons
            index[0],
            breaks[0],
            # list-like comparisons
            IntervalArray.from_breaks(breaks),
            [index[0], breaks[0], "foo"],
        ]
        for other in others:
            result = op(index, other)
            expected = expected_type(self.elementwise_comparison(op, index, other))
            assert_func(result, expected)

    @pytest.mark.parametrize("scalars", ["a", False, 1, 1.0, None])
    def test_comparison_operations(self, scalars):
        """``==`` between an interval Series and a non-interval scalar is all False."""
        # GH #28981
        ser = Series([Interval(0, 1), Interval(1, 2)], dtype="interval")
        result = ser == scalars
        tm.assert_series_equal(result, Series([False, False]))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,420 @@
# Arithmetic tests for DataFrame/Series/Index/Array classes that should
# behave identically.
# Specifically for object dtype
import datetime
from decimal import Decimal
import operator
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Series,
Timestamp,
option_context,
)
import pandas._testing as tm
from pandas.core import ops
# ------------------------------------------------------------------
# Comparisons
class TestObjectComparisons:
    """Comparison tests for object-dtype (and inferred string) Series."""

    def test_comparison_object_numeric_nas(self, comparison_op):
        """
        Comparing object-dtype numeric Series containing NaN gives the same
        result as comparing their float casts.
        """
        ser = Series(np.random.default_rng(2).standard_normal(10), dtype=object)
        shifted = ser.shift(2)

        func = comparison_op

        result = func(ser, shifted)
        expected = func(ser.astype(float), shifted.astype(float))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
    )
    def test_object_comparisons(self, infer_string):
        """
        ``==``, ``<`` and ``!=`` against a string scalar on a Series that
        contains a NaN, with the ``future.infer_string`` option off and on.
        """
        with option_context("future.infer_string", infer_string):
            ser = Series(["a", "b", np.nan, "c", "a"])

            result = ser == "a"
            expected = Series([True, False, False, False, True])
            tm.assert_series_equal(result, expected)

            result = ser < "a"
            expected = Series([False, False, False, False, False])
            tm.assert_series_equal(result, expected)

            result = ser != "a"
            # ``!=`` must be the logical negation of ``==``; ``~`` is the
            # boolean inversion operator (unary minus is an arithmetic op).
            expected = ~(ser == "a")
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [None, object])
    def test_more_na_comparisons(self, dtype):
        """
        ``==`` and ``!=`` between two Series holding NaN at the same
        position, and between a Series and the ``np.nan`` scalar.
        """
        left = Series(["a", np.nan, "c"], dtype=dtype)
        right = Series(["a", np.nan, "d"], dtype=dtype)

        result = left == right
        expected = Series([True, False, False])
        tm.assert_series_equal(result, expected)

        result = left != right
        expected = Series([False, True, True])
        tm.assert_series_equal(result, expected)

        result = left == np.nan
        expected = Series([False, False, False])
        tm.assert_series_equal(result, expected)

        result = left != np.nan
        expected = Series([True, True, True])
        tm.assert_series_equal(result, expected)
# ------------------------------------------------------------------
# Arithmetic
class TestArithmetic:
    """Arithmetic tests for object-dtype Series, Index and boxed variants."""

    def test_add_period_to_array_of_offset(self):
        """Adding a Period to an object Index of offsets works in both orders."""
        # GH#50162
        per = pd.Period("2012-1-1", freq="D")
        periods = pd.period_range("2012-1-1", periods=10, freq="D")
        offsets = per - periods

        expected = pd.Index([off + per for off in offsets], dtype=object)

        tm.assert_index_equal(offsets + per, expected)
        tm.assert_index_equal(per + offsets, expected)

    # TODO: parametrize
    def test_pow_ops_object(self):
        """Series ``**`` on object dtype matches ``**`` on the underlying values."""
        # GH#22922
        # pow is weird with masking & 1, so testing here
        a = Series([1, np.nan, 1, np.nan], dtype=object)
        b = Series([1, np.nan, np.nan, 1], dtype=object)

        for base, exponent in ((a, b), (b, a)):
            result = base**exponent
            expected = Series(base.values**exponent.values, dtype=object)
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("op", [operator.add, ops.radd])
    @pytest.mark.parametrize("other", ["category", "Int64"])
    def test_add_extension_scalar(self, other, box_with_array, op):
        """Strings that name extension dtypes are added as ordinary scalars."""
        # GH#22378
        # Check that scalars satisfying is_extension_array_dtype(obj)
        # do not incorrectly try to dispatch to an ExtensionArray operation

        letters = Series(["a", "b", "c"])
        expected = Series([op(item, other) for item in letters])

        boxed = tm.box_expected(letters, box_with_array)
        expected = tm.box_expected(expected, box_with_array)

        result = op(boxed, other)
        tm.assert_equal(result, expected)

    def test_objarr_add_str(self, box_with_array):
        """``obj + str`` appends to strings and leaves NaN in place."""
        boxed = tm.box_expected(Series(["x", np.nan, "x"]), box_with_array)
        expected = tm.box_expected(Series(["xa", np.nan, "xa"]), box_with_array)

        result = boxed + "a"
        tm.assert_equal(result, expected)

    def test_objarr_radd_str(self, box_with_array):
        """``str + obj`` prepends to strings and leaves NaN in place."""
        boxed = tm.box_expected(Series(["x", np.nan, "x"]), box_with_array)
        expected = tm.box_expected(Series(["ax", np.nan, "ax"]), box_with_array)

        result = "a" + boxed
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "data",
        [
            [1, 2, 3],
            [1.1, 2.2, 3.3],
            [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT],
            ["x", "y", 1],
        ],
    )
    @pytest.mark.parametrize("dtype", [None, object])
    def test_objarr_radd_str_invalid(self, dtype, data, box_with_array):
        """``str + obj`` raises TypeError when the data is not all strings."""
        boxed = tm.box_expected(Series(data, dtype=dtype), box_with_array)

        msg = "|".join(
            [
                "can only concatenate str",
                "did not contain a loop with signature matching types",
                "unsupported operand type",
                "must be str",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            "foo_" + boxed

    @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub])
    def test_objarr_add_invalid(self, op, box_with_array):
        """Adding or subtracting an integer and a string container raises."""
        # invalid ops
        box = box_with_array

        obj_ser = Series(list("abc"), dtype=object, name="objects")
        obj_ser = tm.box_expected(obj_ser, box)

        msg = "|".join(
            [
                "can only concatenate str",
                "unsupported operand type",
                "must be str",
                "has no kernel",
            ]
        )
        for integer in (1, np.array(1, dtype=np.int64)):
            with pytest.raises(Exception, match=msg):
                op(obj_ser, integer)

    # TODO: Moved from tests.series.test_operators; needs cleanup
    def test_operators_na_handling(self):
        """String concatenation with a scalar keeps NaN entries as NaN."""
        ser = Series(["foo", "bar", "baz", np.nan])

        prefixed = "prefix_" + ser
        tm.assert_series_equal(
            prefixed, Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan])
        )

        suffixed = ser + "_suffix"
        tm.assert_series_equal(
            suffixed, Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan])
        )

    # TODO: parametrize over box
    @pytest.mark.parametrize("dtype", [None, object])
    def test_series_with_dtype_radd_timedelta(self, dtype):
        """Adding a Timedelta to a Series of Timedeltas, in both orders."""
        # note this test is _not_ aimed at timedelta64-dtyped Series
        # as of 2.0 we retain object dtype when ser.dtype == object
        ser = Series(
            [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")],
            dtype=dtype,
        )
        expected = Series(
            [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")],
            dtype=dtype,
        )

        tm.assert_series_equal(pd.Timedelta("3 days") + ser, expected)
        tm.assert_series_equal(ser + pd.Timedelta("3 days"), expected)

    # TODO: cleanup & parametrize over box
    def test_mixed_timezone_series_ops_object(self):
        """Arithmetic on object Series of mixed-timezone Timestamps and Timedeltas."""
        # GH#13043
        stamps = Series(
            [
                Timestamp("2015-01-01", tz="US/Eastern"),
                Timestamp("2015-01-01", tz="Asia/Tokyo"),
            ],
            name="xxx",
        )
        assert stamps.dtype == object

        next_day = Series(
            [
                Timestamp("2015-01-02", tz="US/Eastern"),
                Timestamp("2015-01-02", tz="Asia/Tokyo"),
            ],
            name="xxx",
        )
        tm.assert_series_equal(stamps + pd.Timedelta("1 days"), next_day)
        tm.assert_series_equal(pd.Timedelta("1 days") + stamps, next_day)

        # object series & object series
        later_stamps = Series(
            [
                Timestamp("2015-01-03", tz="US/Eastern"),
                Timestamp("2015-01-05", tz="Asia/Tokyo"),
            ],
            name="xxx",
        )
        assert later_stamps.dtype == object
        gaps = Series(
            [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object
        )
        tm.assert_series_equal(later_stamps - stamps, gaps)
        tm.assert_series_equal(stamps - later_stamps, -gaps)

        durations = Series(
            [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")],
            name="xxx",
            dtype=object,
        )
        assert durations.dtype == object

        longer = Series(
            [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")],
            name="xxx",
            dtype=object,
        )
        tm.assert_series_equal(durations + pd.Timedelta("00:30:00"), longer)
        tm.assert_series_equal(pd.Timedelta("00:30:00") + durations, longer)

    # TODO: cleanup & parametrize over box
    def test_iadd_preserves_name(self):
        """In-place add and subtract on a Series index keep the index name."""
        # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name
        ser = Series([1, 2, 3])
        ser.index.name = "foo"

        ser.index += 1
        assert ser.index.name == "foo"

        ser.index -= 1
        assert ser.index.name == "foo"

    def test_add_string(self):
        """Adding a string to a string Index yields the suffixed labels."""
        # from bug report
        labels = pd.Index(["a", "b", "c"])
        suffixed = labels + "foo"
        assert "a" not in suffixed
        assert "afoo" in suffixed

    def test_iadd_string(self):
        """In-place string addition on an Index yields the suffixed labels."""
        labels = pd.Index(["a", "b", "c"])
        # doesn't fail test unless there is a check before `+=`
        assert "a" in labels

        labels += "_x"
        assert "a_x" in labels

    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work")
    def test_add(self):
        """String Index addition with an Index, a list and a scalar."""
        digits = pd.Index(list(map(str, range(10))))
        doubled = pd.Index(digits.values * 2)
        tm.assert_index_equal(digits + digits, doubled)
        tm.assert_index_equal(digits + digits.tolist(), doubled)
        tm.assert_index_equal(digits.tolist() + digits, doubled)

        # test add and radd
        letters = pd.Index(list("abc"))
        tm.assert_index_equal(letters + "1", pd.Index(["a1", "b1", "c1"]))
        tm.assert_index_equal("1" + letters, pd.Index(["1a", "1b", "1c"]))

    def test_sub_fail(self, using_infer_string):
        """Subtraction involving a string Index raises."""
        index = pd.Index([str(i) for i in range(10)])

        if using_infer_string:
            import pyarrow as pa

            err = pa.lib.ArrowNotImplementedError
            msg = "has no kernel"
        else:
            err = TypeError
            msg = "unsupported operand type|Cannot broadcast"

        failing_subtractions = (
            lambda: index - "a",
            lambda: index - index,
            lambda: index - index.tolist(),
            lambda: index.tolist() - index,
        )
        for subtract in failing_subtractions:
            with pytest.raises(err, match=msg):
                subtract()

    def test_sub_object(self):
        """Subtracting from an Index of Decimals; strings raise TypeError."""
        # GH#19369
        index = pd.Index([Decimal(1), Decimal(2)])
        expected = pd.Index([Decimal(0), Decimal(1)])

        tm.assert_index_equal(index - Decimal(1), expected)
        tm.assert_index_equal(index - pd.Index([Decimal(1), Decimal(1)]), expected)

        msg = "unsupported operand type"
        with pytest.raises(TypeError, match=msg):
            index - "foo"

        with pytest.raises(TypeError, match=msg):
            index - np.array([2, "foo"], dtype=object)

    def test_rsub_object(self, fixed_now_ts):
        """Subtracting an Index of Decimals from other objects."""
        # GH#19369
        index = pd.Index([Decimal(1), Decimal(2)])
        expected = pd.Index([Decimal(1), Decimal(0)])

        tm.assert_index_equal(Decimal(2) - index, expected)
        tm.assert_index_equal(np.array([Decimal(2), Decimal(2)]) - index, expected)

        msg = "unsupported operand type"
        with pytest.raises(TypeError, match=msg):
            "foo" - index

        with pytest.raises(TypeError, match=msg):
            np.array([True, fixed_now_ts]) - index
class MyIndex(pd.Index):
    """Simple Index subclass that counts calls to its addition operators."""

    # Number of times ``__add__`` (directly or through ``__radd__``) has run
    # on this instance.
    _calls: int

    @classmethod
    def _simple_new(cls, values, name=None, dtype=None):
        """Create an instance around `values` with a zeroed call counter."""
        obj = object.__new__(cls)
        obj._data = values
        obj._name = name
        obj._calls = 0
        obj._reset_identity()

        return obj

    def __add__(self, other):
        """Record the call and return a new instance over the same data."""
        self._calls += 1
        return self._simple_new(self._data)

    def __radd__(self, other):
        """Reflected addition, handled exactly like ``__add__``."""
        return self.__add__(other)
@pytest.mark.parametrize(
    "other",
    [
        [datetime.timedelta(1), datetime.timedelta(2)],
        [datetime.datetime(2000, 1, 1), datetime.datetime(2000, 1, 2)],
        [pd.Period("2000"), pd.Period("2001")],
        ["a", "b"],
    ],
    ids=["timedelta", "datetime", "period", "object"],
)
def test_index_ops_defer_to_unknown_subclasses(other):
    """
    ``Index + MyIndex`` returns a MyIndex and runs MyIndex's addition once.
    """
    # https://github.com/pandas-dev/pandas/issues/31109
    dates = np.array(
        [datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)], dtype=object
    )
    subclass_index = MyIndex._simple_new(dates)

    result = pd.Index(other) + subclass_index

    assert isinstance(result, MyIndex)
    assert subclass_index._calls == 1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,139 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.fixture
def data():
    """Fixture returning boolean array with valid and missing values."""
    values = (
        [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False]
    )
    return pd.array(values, dtype="boolean")
@pytest.fixture
def left_array():
    """Fixture returning boolean array with valid and missing values."""
    # three True, then three False, then three missing
    values = [True] * 3 + [False] * 3 + [None] * 3
    return pd.array(values, dtype="boolean")
@pytest.fixture
def right_array():
    """Fixture returning boolean array with valid and missing values."""
    # the True / False / missing triple repeated three times
    values = [True, False, None] * 3
    return pd.array(values, dtype="boolean")
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
    "opname, exp",
    [
        ("add", [True, True, None, True, False, None, None, None, None]),
        ("mul", [True, False, None, False, False, None, None, None, None]),
    ],
    ids=["add", "mul"],
)
def test_add_mul(left_array, right_array, opname, exp):
    """Addition and multiplication of two boolean arrays give a boolean array."""
    operation = getattr(operator, opname)
    expected = pd.array(exp, dtype="boolean")

    result = operation(left_array, right_array)
    tm.assert_extension_array_equal(result, expected)
def test_sub(left_array, right_array):
    """Subtracting two boolean arrays raises TypeError."""
    msg = (
        r"numpy boolean subtract, the `-` operator, is (?:deprecated|not supported), "
        r"use the bitwise_xor, the `\^` operator, or the logical_xor function instead\."
    )
    with pytest.raises(TypeError, match=msg):
        operator.sub(left_array, right_array)
def test_div(left_array, right_array):
    """Dividing boolean arrays raises NotImplementedError, as bool Series do."""
    msg = "operator '.*' not implemented for bool dtypes"

    # check that we are matching the non-masked Series behavior
    plain_left = pd.Series(left_array._data)
    plain_right = pd.Series(right_array._data)
    with pytest.raises(NotImplementedError, match=msg):
        plain_left / plain_right

    with pytest.raises(NotImplementedError, match=msg):
        left_array / right_array
@pytest.mark.parametrize(
    "opname",
    [
        "floordiv",
        "mod",
        "pow",
    ],
)
def test_op_int8(left_array, right_array, opname):
    """
    ``mod`` on boolean arrays matches the Int8 result; ``floordiv`` and
    ``pow`` raise NotImplementedError.
    """
    operation = getattr(operator, opname)

    if opname == "mod":
        result = operation(left_array, right_array)
        expected = operation(left_array.astype("Int8"), right_array.astype("Int8"))
        tm.assert_extension_array_equal(result, expected)
        return

    msg = "operator '.*' not implemented for bool dtypes"
    with pytest.raises(NotImplementedError, match=msg):
        operation(left_array, right_array)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
    """
    Arithmetic between a boolean Series and a string, a Timestamp or a
    Series of strings raises.
    """
    # invalid ops
    if not using_infer_string:
        err = TypeError
    else:
        import pyarrow as pa

        err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)

    op = all_arithmetic_operators
    ser = pd.Series(data)
    bound_op = getattr(ser, op)

    # invalid scalars
    string_msg = "|".join(
        [
            "did not contain a loop with signature matching types",
            "BooleanArray cannot perform the operation",
            "not supported for the input types, and the inputs could not be safely "
            "coerced to any supported types according to the casting rule ''safe''",
        ]
    )
    with pytest.raises(TypeError, match=string_msg):
        bound_op("foo")

    timestamp_msg = "|".join(
        [
            r"unsupported operand type\(s\) for",
            "Concatenation operation is not implemented for NumPy arrays",
            "has no kernel",
        ]
    )
    with pytest.raises(err, match=timestamp_msg):
        bound_op(pd.Timestamp("20180101"))

    # invalid array-likes
    if op in ("__mul__", "__rmul__"):
        # TODO(extension) numpy's mul with object array sees booleans as numbers
        return

    series_msg = "|".join(
        [
            r"unsupported operand type\(s\) for",
            "can only concatenate str",
            "not all arguments converted during string formatting",
            "has no kernel",
            "not implemented",
        ]
    )
    with pytest.raises(err, match=series_msg):
        bound_op(pd.Series("foo", index=ser.index))

View File

@ -0,0 +1,53 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
    """Cast a boolean array to NumPy dtypes, with and without missing values."""
    # with missing values
    masked = pd.array([True, False, None], dtype="boolean")

    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        masked.astype("int64")

    with pytest.raises(ValueError, match="cannot convert float NaN to"):
        masked.astype("bool")

    tm.assert_numpy_array_equal(
        masked.astype("float64"),
        np.array([1, 0, np.nan], dtype="float64"),
    )
    tm.assert_numpy_array_equal(
        masked.astype("str"),
        np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5"),
    )

    # no missing values
    complete = pd.array([True, False, True], dtype="boolean")
    tm.assert_numpy_array_equal(
        complete.astype("int64"),
        np.array([1, 0, 1], dtype="int64"),
    )
    tm.assert_numpy_array_equal(
        complete.astype("bool"),
        np.array([True, False, True], dtype="bool"),
    )
def test_astype_to_boolean_array():
    """Casting to the boolean dtype (by alias or dtype object) round-trips."""
    # astype to BooleanArray
    arr = pd.array([True, False, None], dtype="boolean")
    for target in ("boolean", pd.BooleanDtype()):
        tm.assert_extension_array_equal(arr.astype(target), arr)
def test_astype_to_integer_array():
    """Casting to ``Int64`` maps True/False to 1/0 and keeps the missing value."""
    # astype to IntegerArray
    source = pd.array([True, False, None], dtype="boolean")
    tm.assert_extension_array_equal(
        source.astype("Int64"),
        pd.array([1, 0, None], dtype="Int64"),
    )

View File

@ -0,0 +1,60 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.tests.arrays.masked_shared import ComparisonOps
@pytest.fixture
def data():
    """Fixture returning boolean array with valid and missing data"""
    # 4 True/False pairs, NaN, 44 pairs, NaN, then one final pair
    values = [True, False] * 4 + [np.nan]
    values += [True, False] * 44 + [np.nan]
    values += [True, False]
    return pd.array(values, dtype="boolean")
@pytest.fixture
def dtype():
    """Fixture returning BooleanDtype"""
    boolean_dtype = pd.BooleanDtype()
    return boolean_dtype
class TestComparisonOps(ComparisonOps):
    """Comparison-operator tests for the boolean array, built on ``ComparisonOps``."""

    def test_compare_scalar(self, data, comparison_op):
        """Compare the fixture data against the scalar ``True``."""
        self._compare_other(data, comparison_op, True)

    def test_compare_array(self, data, comparison_op):
        """Compare against an all-True boolean array, ndarray and Series."""
        length = len(data)

        other = pd.array([True] * length, dtype="boolean")
        self._compare_other(data, comparison_op, other)

        for box in (np.array, pd.Series):
            other = box([True] * length)
            self._compare_other(data, comparison_op, other)

    @pytest.mark.parametrize("other", [True, False, pd.NA])
    def test_scalar(self, other, comparison_op, dtype):
        """Run the shared scalar test with boolean-specific operands."""
        ComparisonOps.test_scalar(self, other, comparison_op, dtype)

    def test_array(self, comparison_op):
        """Element-wise comparison combines the data and ORs the masks."""
        left_values = [True] * 3 + [False] * 3 + [None] * 3
        right_values = [True, False, None] * 3
        a = pd.array(left_values, dtype="boolean")
        b = pd.array(right_values, dtype="boolean")

        result = comparison_op(a, b)
        expected = BooleanArray(comparison_op(a._data, b._data), a._mask | b._mask)
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = None
        tm.assert_extension_array_equal(a, pd.array(left_values, dtype="boolean"))
        tm.assert_extension_array_equal(b, pd.array(right_values, dtype="boolean"))

View File

@ -0,0 +1,325 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.arrays.boolean import coerce_to_array
def test_boolean_array_constructor():
    """``BooleanArray(values, mask)`` validates the type and shape of both inputs."""
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    tm.assert_extension_array_equal(
        BooleanArray(values, mask),
        pd.array([True, False, True, None], dtype="boolean"),
    )

    # anything other than a boolean ndarray is rejected for either argument
    type_error_cases = [
        ((values.tolist(), mask), "values should be boolean numpy array"),
        ((values, mask.tolist()), "mask should be boolean numpy array"),
        ((values.astype(int), mask), "values should be boolean numpy array"),
        ((values, None), "mask should be boolean numpy array"),
    ]
    for args, msg in type_error_cases:
        with pytest.raises(TypeError, match=msg):
            BooleanArray(*args)

    # values and mask must have the same shape
    shape_error_cases = [
        (values.reshape(1, -1), mask),
        (values, mask.reshape(1, -1)),
    ]
    for args in shape_error_cases:
        with pytest.raises(ValueError, match="values.shape must match mask.shape"):
            BooleanArray(*args)
def test_boolean_array_constructor_copy():
    """The constructor shares its inputs by default and copies with ``copy=True``."""
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    shared = BooleanArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    copied = BooleanArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
def test_to_boolean_array():
    """``pd.array(..., dtype="boolean")`` accepts lists, bool and object ndarrays."""
    expected = BooleanArray(
        np.array([True, False, True]), np.array([False, False, False])
    )
    complete_inputs = (
        [True, False, True],
        np.array([True, False, True]),
        np.array([True, False, True], dtype=object),
    )
    for values in complete_inputs:
        tm.assert_extension_array_equal(pd.array(values, dtype="boolean"), expected)

    # with missing values
    expected = BooleanArray(
        np.array([True, False, True]), np.array([False, False, True])
    )
    missing_inputs = (
        [True, False, None],
        np.array([True, False, None], dtype=object),
    )
    for values in missing_inputs:
        tm.assert_extension_array_equal(pd.array(values, dtype="boolean"), expected)
def test_to_boolean_array_all_none():
    """An all-``None`` input yields a fully masked boolean array."""
    all_true = np.array([True, True, True])
    expected = BooleanArray(all_true, np.array([True, True, True]))

    for values in ([None, None, None], np.array([None, None, None], dtype=object)):
        tm.assert_extension_array_equal(pd.array(values, dtype="boolean"), expected)
@pytest.mark.parametrize(
    "a, b",
    [
        ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
        ([True, np.nan], [True, None]),
        ([True, pd.NA], [True, None]),
        ([np.nan, np.nan], [None, None]),
        (np.array([np.nan, np.nan], dtype=float), [None, None]),
    ],
)
def test_to_boolean_array_missing_indicators(a, b):
    """``None``, ``np.nan`` and ``pd.NA`` all construct the same missing value."""
    tm.assert_extension_array_equal(
        pd.array(a, dtype="boolean"),
        pd.array(b, dtype="boolean"),
    )
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        ["1", "2"],
        # "foo",
        [1, 2],
        [1.0, 2.0],
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        np.array([1, 2]),
        np.array([1.0, 2.0]),
        [np.nan, {"a": 1}],
    ],
)
def test_to_boolean_array_error(values):
    """Inputs that are not bool-like are rejected with a ``TypeError``."""
    # error in converting existing arrays to BooleanArray
    with pytest.raises(TypeError, match="Need to pass bool-like value"):
        pd.array(values, dtype="boolean")
def test_to_boolean_array_from_integer_array():
    """ndarrays holding only 0/1 (optionally with ``None``) convert to boolean."""
    converted = pd.array(np.array([1, 0, 1, 0]), dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, False], dtype="boolean")
    )

    # with missing values
    converted = pd.array(np.array([1, 0, 1, None]), dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, None], dtype="boolean")
    )
def test_to_boolean_array_from_float_array():
    """Float ndarrays holding only 0.0/1.0 (optionally NaN) convert to boolean."""
    converted = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, False], dtype="boolean")
    )

    # with missing values
    converted = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, None], dtype="boolean")
    )
def test_to_boolean_array_integer_like():
    """Plain lists of 0/1 integers (optionally with ``None``) convert to boolean."""
    # integers of 0's and 1's
    converted = pd.array([1, 0, 1, 0], dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, False], dtype="boolean")
    )

    # with missing values
    converted = pd.array([1, 0, 1, None], dtype="boolean")
    tm.assert_extension_array_equal(
        converted, pd.array([True, False, True, None], dtype="boolean")
    )
def test_coerce_to_array():
    """Exercise ``coerce_to_array`` with plain, copied, mixed and mis-shaped input."""
    # TODO this is currently not public API
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    # without copy the resulting buffers are the very objects passed in
    uncopied = BooleanArray(*coerce_to_array(values, mask=mask))
    tm.assert_extension_array_equal(uncopied, BooleanArray(values, mask))
    assert uncopied._data is values
    assert uncopied._mask is mask

    copied = BooleanArray(*coerce_to_array(values, mask=mask, copy=True))
    tm.assert_extension_array_equal(copied, BooleanArray(values, mask))
    assert copied._data is not values
    assert copied._mask is not mask

    # mixed missing from values and mask
    values = [True, False, None, False]
    mask = np.array([False, False, False, True], dtype="bool")
    expected = BooleanArray(
        np.array([True, False, True, True]), np.array([False, False, True, True])
    )
    variants = [
        (values, mask),
        (np.array(values, dtype=object), mask),
        (values, mask.tolist()),
    ]
    for raw_values, raw_mask in variants:
        result = BooleanArray(*coerce_to_array(raw_values, mask=raw_mask))
        tm.assert_extension_array_equal(result, expected)

    # raise errors for wrong dimension
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")

    # passing 2D values is OK as long as no mask
    coerce_to_array(values.reshape(1, -1))

    shape_msg = "values.shape and mask.shape must match"
    with pytest.raises(ValueError, match=shape_msg):
        coerce_to_array(values.reshape(1, -1), mask=mask)

    with pytest.raises(ValueError, match=shape_msg):
        coerce_to_array(values, mask=mask.reshape(1, -1))
def test_coerce_to_array_from_boolean_array():
    """``coerce_to_array`` accepts a BooleanArray but refuses an extra mask."""
    # passing BooleanArray to coerce_to_array
    values = np.array([True, False, True, False], dtype="bool")
    mask = np.array([False, False, False, True], dtype="bool")
    arr = BooleanArray(values, mask)

    # no copy
    shared = BooleanArray(*coerce_to_array(arr))
    tm.assert_extension_array_equal(shared, arr)
    assert shared._data is arr._data
    assert shared._mask is arr._mask

    # here the copy is requested from the BooleanArray constructor
    copied = BooleanArray(*coerce_to_array(arr), copy=True)
    tm.assert_extension_array_equal(copied, arr)
    assert copied._data is not arr._data
    assert copied._mask is not arr._mask

    with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"):
        coerce_to_array(arr, mask=mask)
def test_coerce_to_numpy_array():
    """``np.array`` on a boolean array picks object or bool dtype as asserted below."""
    with_na = pd.array([True, False, None], dtype="boolean")
    without_na = pd.array([True, False, True], dtype="boolean")

    # with missing values -> object dtype
    tm.assert_numpy_array_equal(
        np.array(with_na),
        np.array([True, False, pd.NA], dtype="object"),
    )

    # with no missing values -> bool dtype
    tm.assert_numpy_array_equal(
        np.array(without_na),
        np.array([True, False, True], dtype="bool"),
    )

    # force bool dtype
    tm.assert_numpy_array_equal(
        np.array(without_na, dtype="bool"),
        np.array([True, False, True], dtype="bool"),
    )

    # with missing values will raise error
    with_na = pd.array([True, False, None], dtype="boolean")
    msg = (
        "cannot convert to 'bool'-dtype NumPy array with missing values. "
        "Specify an appropriate 'na_value' for this dtype."
    )
    with pytest.raises(ValueError, match=msg):
        np.array(with_na, dtype="bool")
def test_to_boolean_array_from_strings():
    """String spellings of true/false parse to booleans; NaN becomes missing."""
    strings = np.array(
        ["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object
    )
    result = BooleanArray._from_sequence_of_strings(strings, dtype="boolean")

    expected_values = np.array([True, False, True, True, False, False, False])
    expected_mask = np.array([False, False, False, False, False, False, True])
    tm.assert_extension_array_equal(
        result, BooleanArray(expected_values, expected_mask)
    )
def test_to_boolean_array_from_strings_invalid_string():
    """A string with no boolean interpretation raises ``ValueError``."""
    bad_strings = ["donkey"]
    with pytest.raises(ValueError, match="cannot be cast"):
        BooleanArray._from_sequence_of_strings(bad_strings, dtype="boolean")
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
    """``to_numpy`` dtype and ``na_value`` handling, for both Series and array."""
    con = pd.Series if box else pd.array

    def make(values):
        return con(values, dtype="boolean")

    # default: bool dtype without missing values, object dtype with them
    tm.assert_numpy_array_equal(
        make([True, False, True]).to_numpy(),
        np.array([True, False, True], dtype="bool"),
    )
    tm.assert_numpy_array_equal(
        make([True, False, None]).to_numpy(),
        np.array([True, False, pd.NA], dtype="object"),
    )
    tm.assert_numpy_array_equal(
        make([True, False, None]).to_numpy(dtype="str"),
        np.array([True, False, pd.NA], dtype=f"{tm.ENDIAN}U5"),
    )

    # no missing values -> can convert to bool, otherwise raises
    tm.assert_numpy_array_equal(
        make([True, False, True]).to_numpy(dtype="bool"),
        np.array([True, False, True], dtype="bool"),
    )
    arr = make([True, False, None])
    with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
        arr.to_numpy(dtype="bool")

    # specify dtype and na_value
    arr = make([True, False, None])
    tm.assert_numpy_array_equal(
        arr.to_numpy(dtype=object, na_value=None),
        np.array([True, False, None], dtype="object"),
    )
    tm.assert_numpy_array_equal(
        arr.to_numpy(dtype=bool, na_value=False),
        np.array([True, False, False], dtype="bool"),
    )
    tm.assert_numpy_array_equal(
        arr.to_numpy(dtype="int64", na_value=-99),
        np.array([1, 0, -99], dtype="int64"),
    )
    tm.assert_numpy_array_equal(
        arr.to_numpy(dtype="float64", na_value=np.nan),
        np.array([1, 0, np.nan], dtype="float64"),
    )

    # converting to int or float without specifying na_value raises
    with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
        arr.to_numpy(dtype="int64")
def test_to_numpy_copy():
    """Writes to the ``to_numpy`` result reach the source only without ``copy=True``."""
    # to_numpy can be zero-copy if no missing values
    source = pd.array([True, False, True], dtype="boolean")
    view = source.to_numpy(dtype=bool)
    view[0] = False
    tm.assert_extension_array_equal(
        source, pd.array([False, False, True], dtype="boolean")
    )

    # an explicit copy leaves the source untouched
    source = pd.array([True, False, True], dtype="boolean")
    detached = source.to_numpy(dtype=bool, copy=True)
    detached[0] = False
    tm.assert_extension_array_equal(
        source, pd.array([True, False, True], dtype="boolean")
    )

View File

@ -0,0 +1,126 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
def test_ufuncs_binary(ufunc):
    """Binary ufuncs act on the data and keep the boolean array's mask."""
    a = pd.array([True, False, None], dtype="boolean")

    def masked(raw, box=pd.array):
        # box the raw ufunc output and blank out the positions masked in ``a``
        boxed = box(raw, dtype="boolean")
        boxed[a._mask] = np.nan
        return boxed

    # two BooleanArrays
    result = ufunc(a, a)
    tm.assert_extension_array_equal(result, masked(ufunc(a._data, a._data)))

    s = pd.Series(a)
    result = ufunc(s, a)
    tm.assert_series_equal(result, masked(ufunc(a._data, a._data), box=pd.Series))

    # Boolean with numpy array
    arr = np.array([True, True, False])
    result = ufunc(a, arr)
    tm.assert_extension_array_equal(result, masked(ufunc(a._data, arr)))

    result = ufunc(arr, a)
    tm.assert_extension_array_equal(result, masked(ufunc(arr, a._data)))

    # BooleanArray with scalar
    result = ufunc(a, True)
    tm.assert_extension_array_equal(result, masked(ufunc(a._data, True)))

    result = ufunc(True, a)
    tm.assert_extension_array_equal(result, masked(ufunc(True, a._data)))

    # not handled types
    msg = r"operand type\(s\) all returned NotImplemented from __array_ufunc__"
    with pytest.raises(TypeError, match=msg):
        ufunc(a, "test")
@pytest.mark.parametrize("ufunc", [np.logical_not])
def test_ufuncs_unary(ufunc):
    """Unary ufuncs act on the data and keep the mask, for array and Series."""
    a = pd.array([True, False, None], dtype="boolean")

    array_result = ufunc(a)
    array_expected = pd.array(ufunc(a._data), dtype="boolean")
    array_expected[a._mask] = np.nan
    tm.assert_extension_array_equal(array_result, array_expected)

    series_result = ufunc(pd.Series(a))
    series_expected = pd.Series(ufunc(a._data), dtype="boolean")
    series_expected[a._mask] = np.nan
    tm.assert_series_equal(series_result, series_expected)
def test_ufunc_numeric():
    """``np.sqrt`` on a boolean array is expected to produce ``Float32``."""
    # np.sqrt on np.bool_ returns float16, which we upcast to Float32
    #  bc we do not have Float16
    source = pd.array([True, False, None], dtype="boolean")
    tm.assert_extension_array_equal(
        np.sqrt(source),
        pd.array([1, 0, None], dtype="Float32"),
    )
@pytest.mark.parametrize("values", [[True, False], [True, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` gives the data sum, or ``pd.NA`` when the last value is NA."""
    arr = pd.array(values, dtype="boolean")

    res = np.add.reduce(arr)
    expected = pd.NA if arr[-1] is pd.NA else arr._data.sum()
    tm.assert_almost_equal(res, expected)
def test_value_counts_na():
    """``value_counts`` keeps a boolean index and counts NA only with ``dropna=False``."""
    arr = pd.array([True, False, pd.NA], dtype="boolean")

    with_na = arr.value_counts(dropna=False)
    expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count")
    assert expected.index.dtype == arr.dtype
    tm.assert_series_equal(with_na, expected)

    without_na = arr.value_counts(dropna=True)
    expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count")
    assert expected.index.dtype == arr.dtype
    tm.assert_series_equal(without_na, expected)
def test_value_counts_with_normalize():
    """Normalized counts are ``Float64`` proportions over the non-missing values."""
    ser = pd.Series([True, False, pd.NA], dtype="boolean")
    result = ser.value_counts(normalize=True)

    counts = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion")
    expected = counts / 2
    assert expected.index.dtype == "boolean"
    tm.assert_series_equal(result, expected)
def test_diff():
    """First-order ``diff`` on boolean data, via the algorithm and via a Series."""
    values = pd.array(
        [True, True, False, False, True, None, True, None, False], dtype="boolean"
    )
    expected_array = pd.array(
        [None, False, True, False, True, None, None, None, None], dtype="boolean"
    )

    array_result = pd.core.algorithms.diff(values, 1)
    tm.assert_extension_array_equal(array_result, expected_array)

    series_result = pd.Series(values).diff()
    tm.assert_series_equal(series_result, pd.Series(expected_array))

View File

@ -0,0 +1,13 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
    """Assigning ``None``, ``np.nan`` or ``pd.NA`` marks the element as missing."""
    target = pd.array([True, False, None], dtype="boolean")
    target[1] = na
    tm.assert_extension_array_equal(
        target, pd.array([True, None, None], dtype="boolean")
    )

View File

@ -0,0 +1,254 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.ops.mask_ops import (
kleene_and,
kleene_or,
kleene_xor,
)
from pandas.tests.extension.base import BaseOpsUtil
class TestLogicalOps(BaseOpsUtil):
    """Tests for the logical operators (and/or/xor) on boolean arrays.

    The ``test_kleene_*`` methods spell out the expected result for every
    combination of True, False and missing operands.
    """

    def test_numpy_scalars_ok(self, all_logical_operators):
        """Python bools and ``np.bool_`` scalars give identical results."""
        a = pd.array([True, False, None], dtype="boolean")
        op = getattr(a, all_logical_operators)

        tm.assert_extension_array_equal(op(True), op(np.bool_(True)))
        tm.assert_extension_array_equal(op(False), op(np.bool_(False)))

    def get_op_from_name(self, op_name):
        """Map a dunder name such as ``__and__`` or ``__ror__`` to a callable."""
        short_opname = op_name.strip("_")
        # the operator module spells these ``and_`` / ``or_`` but plain ``xor``
        short_opname = short_opname if "xor" in short_opname else short_opname + "_"
        try:
            op = getattr(operator, short_opname)
        except AttributeError:
            # Assume it is the reverse operator
            rop = getattr(operator, short_opname[1:])
            op = lambda x, y: rop(y, x)

        return op

    def test_empty_ok(self, all_logical_operators):
        """Logical ops on an empty array return an empty array for any scalar."""
        a = pd.array([], dtype="boolean")
        op_name = all_logical_operators
        result = getattr(a, op_name)(True)
        tm.assert_extension_array_equal(a, result)

        result = getattr(a, op_name)(False)
        tm.assert_extension_array_equal(a, result)

        result = getattr(a, op_name)(pd.NA)
        tm.assert_extension_array_equal(a, result)

    @pytest.mark.parametrize(
        "other", ["a", pd.Timestamp(2017, 1, 1, 12), np.timedelta64(4)]
    )
    def test_eq_mismatched_type(self, other):
        """Comparing with an unrelated scalar is all-False (==) / all-True (!=)."""
        # GH-44499
        arr = pd.array([True, False])
        result = arr == other
        expected = pd.array([False, False])
        tm.assert_extension_array_equal(result, expected)

        result = arr != other
        expected = pd.array([True, True])
        tm.assert_extension_array_equal(result, expected)

    def test_logical_length_mismatch_raises(self, all_logical_operators):
        """Array-like operands of a different length raise ``ValueError``."""
        op_name = all_logical_operators
        a = pd.array([True, False, None], dtype="boolean")
        msg = "Lengths must match"

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)([True, False])

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)(np.array([True, False]))

        with pytest.raises(ValueError, match=msg):
            getattr(a, op_name)(pd.array([True, False], dtype="boolean"))

    def test_logical_nan_raises(self, all_logical_operators):
        """A float NaN operand is rejected with ``TypeError``."""
        op_name = all_logical_operators
        a = pd.array([True, False, None], dtype="boolean")
        msg = "Got float instead"

        with pytest.raises(TypeError, match=msg):
            getattr(a, op_name)(np.nan)

    @pytest.mark.parametrize("other", ["a", 1])
    def test_non_bool_or_na_other_raises(self, other, all_logical_operators):
        """Scalars that are neither bool nor NA raise, naming the offending type."""
        a = pd.array([True, False], dtype="boolean")
        with pytest.raises(TypeError, match=str(type(other).__name__)):
            getattr(a, all_logical_operators)(other)

    def test_kleene_or(self):
        """Full truth table for ``|`` between two boolean arrays."""
        # A clear test of behavior.
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a | b
        expected = pd.array(
            [True, True, True, True, False, None, True, None, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b | a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [True, None, None]),
            (True, [True, True, True]),
            (np.bool_(True), [True, True, True]),
            (False, [True, False, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_or_scalar(self, other, expected):
        """``|`` with a scalar operand, on either side."""
        a = pd.array([True, False, None], dtype="boolean")
        result = a | other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other | a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    def test_kleene_and(self):
        """Full truth table for ``&`` between two boolean arrays."""
        # A clear test of behavior.
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a & b
        expected = pd.array(
            [True, False, None, False, False, False, None, False, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b & a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [None, False, None]),
            (True, [True, False, None]),
            (False, [False, False, False]),
            (np.bool_(True), [True, False, None]),
            (np.bool_(False), [False, False, False]),
        ],
    )
    def test_kleene_and_scalar(self, other, expected):
        """``&`` with a scalar operand, on either side."""
        a = pd.array([True, False, None], dtype="boolean")
        result = a & other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other & a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    def test_kleene_xor(self):
        """Full truth table for ``^`` between two boolean arrays."""
        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        b = pd.array([True, False, None] * 3, dtype="boolean")
        result = a ^ b
        expected = pd.array(
            [False, True, None, True, False, None, None, None, None], dtype="boolean"
        )
        tm.assert_extension_array_equal(result, expected)

        result = b ^ a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        )
        tm.assert_extension_array_equal(
            b, pd.array([True, False, None] * 3, dtype="boolean")
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (pd.NA, [None, None, None]),
            (True, [False, True, None]),
            (np.bool_(True), [False, True, None]),
            # plain ``False`` added so xor covers the same scalars as or/and
            (False, [True, False, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_xor_scalar(self, other, expected):
        """``^`` with a scalar operand, on either side."""
        a = pd.array([True, False, None], dtype="boolean")
        result = a ^ other
        expected = pd.array(expected, dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        result = other ^ a
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        tm.assert_extension_array_equal(
            a, pd.array([True, False, None], dtype="boolean")
        )

    @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3])
    def test_no_masked_assumptions(self, other, all_logical_operators):
        """Results must not depend on the data stored under masked positions."""
        # The logical operations should not assume that masked values are False!
        a = pd.arrays.BooleanArray(
            np.array([True, True, True, False, False, False, True, False, True]),
            np.array([False] * 6 + [True, True, True]),
        )
        b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
        if isinstance(other, list):
            other = pd.array(other, dtype="boolean")

        result = getattr(a, all_logical_operators)(other)
        expected = getattr(b, all_logical_operators)(other)
        tm.assert_extension_array_equal(result, expected)

        # rewrite the data hidden behind the masks and check nothing changes
        if isinstance(other, BooleanArray):
            other._data[other._mask] = True
            a._data[a._mask] = False

            result = getattr(a, all_logical_operators)(other)
            expected = getattr(b, all_logical_operators)(other)
            tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("operation", [kleene_or, kleene_xor, kleene_and])
def test_error_both_scalar(operation):
    """The kleene helpers reject calls where both operands are scalars."""
    expected_msg = r"Either `left` or `right` need to be a np\.ndarray."
    left_mask = np.zeros(1)
    right_mask = np.zeros(1)
    with pytest.raises(TypeError, match=expected_msg):
        # masks need to be non-None, otherwise it ends up in an infinite recursion
        operation(True, True, left_mask, right_mask)

View File

@ -0,0 +1,27 @@
import pandas as pd
import pandas._testing as tm
class TestUnaryOps:
    """Unary operator behaviour (``~`` and ``abs``) for boolean data."""

    def test_invert(self):
        """``~`` flips True/False and keeps NA, for array, Series and DataFrame."""
        labels = ["a", "b", "c"]
        a = pd.array([True, False, None], dtype="boolean")
        inverted = pd.array([False, True, None], dtype="boolean")
        tm.assert_extension_array_equal(~a, inverted)

        inverted_ser = pd.Series(inverted, index=labels, name="name")
        ser_result = ~pd.Series(a, index=labels, name="name")
        tm.assert_series_equal(ser_result, inverted_ser)

        df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=labels)
        df_result = ~df
        inverted_df = pd.DataFrame(
            {"A": inverted_ser, "B": [False, True, True]}, index=labels
        )
        tm.assert_frame_equal(df_result, inverted_df)

    def test_abs(self):
        """``abs`` returns an array equal to its input."""
        # matching numpy behavior, abs is the identity function
        arr = pd.array([True, False, None], dtype="boolean")
        tm.assert_extension_array_equal(abs(arr), arr)

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
@pytest.fixture
def data():
    """Fixture returning boolean array, with valid and missing values."""
    # 4 True/False pairs, NaN, 44 pairs, NaN, then one final pair
    values = [True, False] * 4 + [np.nan]
    values += [True, False] * 44 + [np.nan]
    values += [True, False]
    return pd.array(values, dtype="boolean")
@pytest.mark.parametrize(
    "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
    [
        ([True, pd.NA], True, True, True, pd.NA),
        ([False, pd.NA], False, False, pd.NA, False),
        ([pd.NA], False, True, pd.NA, pd.NA),
        ([], False, True, False, True),
        # GH-33253: all True / all False values buggy with skipna=False
        ([True, True], True, True, True, True),
        ([False, False], False, False, False, False),
    ],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
    """``any``/``all`` on array and Series, with and without ``skipna``."""

    def as_numpy_scalar(flag):
        # the methods return numpy scalars
        return pd.NA if flag is pd.NA else np.bool_(flag)

    exp_any = as_numpy_scalar(exp_any)
    exp_all = as_numpy_scalar(exp_all)
    exp_any_noskip = as_numpy_scalar(exp_any_noskip)
    exp_all_noskip = as_numpy_scalar(exp_all_noskip)

    for con in (pd.array, pd.Series):
        a = con(values, dtype="boolean")
        assert a.any() is exp_any
        assert a.all() is exp_all
        assert a.any(skipna=False) is exp_any_noskip
        assert a.all(skipna=False) is exp_all_noskip

        assert np.any(a.any()) is exp_any
        assert np.all(a.all()) is exp_all
@pytest.mark.parametrize("dropna", [True, False])
def test_reductions_return_types(dropna, data, all_numeric_reductions):
    """Each numeric reduction on a boolean Series returns the expected scalar type."""
    op = all_numeric_reductions
    s = pd.Series(data)
    if dropna:
        s = s.dropna()

    if op in ("sum", "prod"):
        expected_type = np.int_
    elif op == "count":
        # Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
        expected_type = np.integer
    elif op in ("min", "max"):
        expected_type = np.bool_
    else:
        # "mean", "std", "var", "median", "kurt", "skew"
        expected_type = np.float64

    assert isinstance(getattr(s, op)(), expected_type)

View File

@ -0,0 +1,13 @@
import pandas as pd
def test_repr():
    """Check the repr of boolean data as a DataFrame, a Series and an array.

    pandas right-aligns values within a column, so the padding inside the
    expected strings is significant and must be reproduced exactly.
    """
    df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})

    # DataFrame: index and column are separated by two spaces at the widest value
    expected = "       A\n0   True\n1  False\n2   <NA>"
    assert repr(df) == expected

    # Series: four spaces separate the index from the widest value
    expected = "0     True\n1    False\n2     <NA>\nName: A, dtype: boolean"
    assert repr(df.A) == expected

    expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean"
    assert repr(df.A.array) == expected

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]])
def test_factorize(categories, ordered):
    """``factorize`` codes follow first appearance; uniques keep the dtype."""
    cat_kwargs = {"categories": categories, "ordered": ordered}
    cat = pd.Categorical(["b", "b", "a", "c", None], **cat_kwargs)

    codes, uniques = pd.factorize(cat)

    # the missing value is coded as -1
    expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
    expected_uniques = pd.Categorical(["b", "a", "c"], **cat_kwargs)

    tm.assert_numpy_array_equal(codes, expected_codes)
    tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort():
    """With ``sort=True`` the uniques come back sorted and the codes follow."""
    codes, uniques = pd.factorize(pd.Categorical(["b", "b", None, "a"]), sort=True)

    tm.assert_numpy_array_equal(codes, np.array([1, 1, -1, 0], dtype=np.intp))
    tm.assert_categorical_equal(uniques, pd.Categorical(["a", "b"]))
def test_factorized_sort_ordered():
    """For an ordered categorical, ``sort=True`` follows the category order."""
    category_order = ["c", "b", "a"]
    cat = pd.Categorical(
        ["b", "b", None, "a"], categories=category_order, ordered=True
    )

    codes, uniques = pd.factorize(cat, sort=True)

    tm.assert_numpy_array_equal(codes, np.array([0, 0, -1, 1], dtype=np.intp))
    tm.assert_categorical_equal(
        uniques,
        pd.Categorical(["b", "a"], categories=category_order, ordered=True),
    )
def test_isin_cats():
    """``Categorical.isin`` matches values, including NaN when it is listed."""
    # GH2003
    cat = pd.Categorical(["a", "b", np.nan])

    cases = (
        (["a", np.nan], [True, False, True]),
        (["a", "c"], [True, False, False]),
    )
    for candidates, flags in cases:
        result = cat.isin(candidates)
        expected = np.array(flags, dtype=bool)
        tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("value", [[""], [None, ""], [pd.NaT, ""]])
def test_isin_cats_corner_cases(value):
    """The empty-string category is found whenever ``""`` is in the candidates."""
    # GH36550
    result = pd.Categorical([""]).isin(value)
    tm.assert_numpy_array_equal(np.array([True], dtype=bool), result)
@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])])
def test_isin_empty(empty):
    """``isin`` with an empty candidate collection is False everywhere."""
    cat = pd.Categorical(["a", "b"])
    all_false = np.array([False, False], dtype=bool)

    tm.assert_numpy_array_equal(all_false, cat.isin(empty))
def test_diff():
    """``diff`` on categorical data raises for both Series and DataFrame."""
    expected_msg = "Convert to a suitable dtype"

    ser = pd.Series([1, 2, 3], dtype="category")
    with pytest.raises(TypeError, match=expected_msg):
        ser.diff()

    frame = ser.to_frame(name="A")
    with pytest.raises(TypeError, match=expected_msg):
        frame.diff()

View File

@ -0,0 +1,349 @@
import re
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
Index,
NaT,
Series,
date_range,
)
import pandas._testing as tm
from pandas.api.types import is_scalar
class TestCategoricalAnalytics:
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_not_ordered_raises(self, aggregation):
# unordered cats have no min/max
cat = Categorical(["a", "b", "c", "d"], ordered=False)
msg = f"Categorical is not ordered for operation {aggregation}"
agg_func = getattr(cat, aggregation)
with pytest.raises(TypeError, match=msg):
agg_func()
ufunc = np.minimum if aggregation == "min" else np.maximum
with pytest.raises(TypeError, match=msg):
ufunc.reduce(cat)
def test_min_max_ordered(self, index_or_series_or_array):
cat = Categorical(["a", "b", "c", "d"], ordered=True)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "a"
assert _max == "d"
assert np.minimum.reduce(obj) == "a"
assert np.maximum.reduce(obj) == "d"
# TODO: raises if we pass axis=0 (on Index and Categorical, not Series)
cat = Categorical(
["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "d"
assert _max == "a"
assert np.minimum.reduce(obj) == "d"
assert np.maximum.reduce(obj) == "a"
def test_min_max_reduce(self):
# GH52788
cat = Categorical(["a", "b", "c", "d"], ordered=True)
df = DataFrame(cat)
result_max = df.agg("max")
expected_max = Series(Categorical(["d"], dtype=cat.dtype))
tm.assert_series_equal(result_max, expected_max)
result_min = df.agg("min")
expected_min = Series(Categorical(["a"], dtype=cat.dtype))
tm.assert_series_equal(result_min, expected_min)
@pytest.mark.parametrize(
"categories,expected",
[
(list("ABC"), np.nan),
([1, 2, 3], np.nan),
pytest.param(
Series(date_range("2020-01-01", periods=3), dtype="category"),
NaT,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/29962"
),
),
],
)
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_ordered_empty(self, categories, expected, aggregation):
# GH 30227
cat = Categorical([], categories=categories, ordered=True)
agg_func = getattr(cat, aggregation)
result = agg_func()
assert result is expected
@pytest.mark.parametrize(
"values, categories",
[(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("function", ["min", "max"])
def test_min_max_with_nan(self, values, categories, function, skipna):
# GH 25303
cat = Categorical(values, categories=categories, ordered=True)
result = getattr(cat, function)(skipna=skipna)
if skipna is False:
assert result is np.nan
else:
expected = categories[0] if function == "min" else categories[2]
assert result == expected
@pytest.mark.parametrize("function", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_only_nan(self, function, skipna):
# https://github.com/pandas-dev/pandas/issues/33450
cat = Categorical([np.nan], categories=[1, 2], ordered=True)
result = getattr(cat, function)(skipna=skipna)
assert result is np.nan
@pytest.mark.parametrize("method", ["min", "max"])
def test_numeric_only_min_max_raises(self, method):
# GH 25303
cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
with pytest.raises(TypeError, match=".* got an unexpected keyword"):
getattr(cat, method)(numeric_only=True)
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_raises(self, method):
cat = Categorical(["a", "b", "c", "b"], ordered=False)
msg = (
f"Categorical is not ordered for operation {method}\n"
"you can use .as_ordered() to change the Categorical to an ordered one"
)
method = getattr(np, method)
with pytest.raises(TypeError, match=re.escape(msg)):
method(cat)
@pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
msg = (
f"the '{kwarg}' parameter is not supported in the pandas implementation "
f"of {method}"
)
if kwarg == "axis":
msg = r"`axis` must be fewer than the number of dimensions \(1\)"
kwargs = {kwarg: 42}
method = getattr(np, method)
with pytest.raises(ValueError, match=msg):
method(cat, **kwargs)
@pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
def test_numpy_min_max_axis_equals_none(self, method, expected):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
method = getattr(np, method)
result = method(cat, axis=None)
assert result == expected
@pytest.mark.parametrize(
"values,categories,exp_mode",
[
([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
],
)
def test_mode(self, values, categories, exp_mode):
cat = Categorical(values, categories=categories, ordered=True)
res = Series(cat).mode()._values
exp = Categorical(exp_mode, categories=categories, ordered=True)
tm.assert_categorical_equal(res, exp)
def test_searchsorted(self, ordered):
# https://github.com/pandas-dev/pandas/issues/8420
# https://github.com/pandas-dev/pandas/issues/14522
cat = Categorical(
["cheese", "milk", "apple", "bread", "bread"],
categories=["cheese", "milk", "apple", "bread"],
ordered=ordered,
)
ser = Series(cat)
# Searching for single item argument, side='left' (default)
res_cat = cat.searchsorted("apple")
assert res_cat == 2
assert is_scalar(res_cat)
res_ser = ser.searchsorted("apple")
assert res_ser == 2
assert is_scalar(res_ser)
# Searching for single item array, side='left' (default)
res_cat = cat.searchsorted(["bread"])
res_ser = ser.searchsorted(["bread"])
exp = np.array([3], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for several items array, side='right'
res_cat = cat.searchsorted(["apple", "bread"], side="right")
res_ser = ser.searchsorted(["apple", "bread"], side="right")
exp = np.array([3, 5], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for a single value that is not from the Categorical
with pytest.raises(TypeError, match="cucumber"):
cat.searchsorted("cucumber")
with pytest.raises(TypeError, match="cucumber"):
ser.searchsorted("cucumber")
# Searching for multiple values one of each is not from the Categorical
msg = (
"Cannot setitem on a Categorical with a new category, "
"set the categories first"
)
with pytest.raises(TypeError, match=msg):
cat.searchsorted(["bread", "cucumber"])
with pytest.raises(TypeError, match=msg):
ser.searchsorted(["bread", "cucumber"])
def test_unique(self, ordered):
# GH38140
dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
# categories are reordered based on value when ordered=False
cat = Categorical(["a", "b", "c"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, cat)
cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
# nan must be removed
cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
def test_unique_index_series(self, ordered):
# GH38140
dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
# Categorical.unique sorts categories by appearance order
# if ordered=False
exp = Categorical([3, 1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
c = Categorical([1, 1, 2, 2], dtype=dtype)
exp = Categorical([1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
def test_shift(self):
# GH 9416
cat = Categorical(["a", "b", "c", "d", "a"])
# shift forward
sp1 = cat.shift(1)
xp1 = Categorical([np.nan, "a", "b", "c", "d"])
tm.assert_categorical_equal(sp1, xp1)
tm.assert_categorical_equal(cat[:-1], sp1[1:])
# shift back
sn2 = cat.shift(-2)
xp2 = Categorical(
["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
)
tm.assert_categorical_equal(sn2, xp2)
tm.assert_categorical_equal(cat[2:], sn2[:-2])
# shift by zero
tm.assert_categorical_equal(cat, cat.shift(0))
def test_nbytes(self):
cat = Categorical([1, 2, 3])
exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
assert cat.nbytes == exp
def test_memory_usage(self):
cat = Categorical([1, 2, 3])
# .categories is an index, so we include the hashtable
assert 0 < cat.nbytes <= cat.memory_usage()
assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
cat = Categorical(["foo", "foo", "bar"])
assert cat.memory_usage(deep=True) > cat.nbytes
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
def test_map(self):
c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = c.map(lambda x: x.lower(), na_action=None)
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
result = c.map(lambda x: x.lower(), na_action=None)
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
tm.assert_categorical_equal(result, exp)
result = c.map(lambda x: 1, na_action=None)
# GH 12766: Return an index not an array
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
def test_validate_inplace_raises(self, value):
cat = Categorical(["A", "B", "B", "C", "A"])
msg = (
'For argument "inplace" expected type bool, '
f"received type {type(value).__name__}"
)
with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)
def test_quantile_empty(self):
# make sure we have correct itemsize on resulting codes
cat = Categorical(["A", "B"])
idx = Index([0.0, 0.5])
result = cat[:0]._quantile(idx, interpolation="linear")
assert result._codes.dtype == np.int8
expected = cat.take([-1, -1], allow_fill=True)
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,501 @@
import re
import numpy as np
import pytest
from pandas.compat import PY311
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Series,
StringDtype,
)
import pandas._testing as tm
from pandas.core.arrays.categorical import recode_for_categories
class TestCategoricalAPI:
def test_to_list_deprecated(self):
# GH#51254
cat1 = Categorical(list("acb"), ordered=False)
msg = "Categorical.to_list is deprecated and will be removed"
with tm.assert_produces_warning(FutureWarning, match=msg):
cat1.to_list()
def test_ordered_api(self):
# GH 9347
cat1 = Categorical(list("acb"), ordered=False)
tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
assert not cat1.ordered
cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
assert not cat2.ordered
cat3 = Categorical(list("acb"), ordered=True)
tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
assert cat3.ordered
cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
assert cat4.ordered
def test_set_ordered(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
cat2 = cat.as_unordered()
assert not cat2.ordered
cat2 = cat.as_ordered()
assert cat2.ordered
assert cat2.set_ordered(True).ordered
assert not cat2.set_ordered(False).ordered
# removed in 0.19.0
msg = (
"property 'ordered' of 'Categorical' object has no setter"
if PY311
else "can't set attribute"
)
with pytest.raises(AttributeError, match=msg):
cat.ordered = True
with pytest.raises(AttributeError, match=msg):
cat.ordered = False
def test_rename_categories(self):
cat = Categorical(["a", "b", "c", "a"])
# inplace=False: the old one must not be changed
res = cat.rename_categories([1, 2, 3])
tm.assert_numpy_array_equal(
res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(res.categories, Index([1, 2, 3]))
exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
exp_cat = Index(["a", "b", "c"])
tm.assert_index_equal(cat.categories, exp_cat)
# GH18862 (let rename_categories take callables)
result = cat.rename_categories(lambda x: x.upper())
expected = Categorical(["A", "B", "C", "A"])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
def test_rename_categories_wrong_length_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"])
msg = (
"new categories need to have the same number of items as the "
"old categories!"
)
with pytest.raises(ValueError, match=msg):
cat.rename_categories(new_categories)
def test_rename_categories_series(self):
# https://github.com/pandas-dev/pandas/issues/17981
c = Categorical(["a", "b"])
result = c.rename_categories(Series([0, 1], index=["a", "b"]))
expected = Categorical([0, 1])
tm.assert_categorical_equal(result, expected)
def test_rename_categories_dict(self):
# GH 17336
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
expected = Index([4, 3, 2, 1])
tm.assert_index_equal(res.categories, expected)
# Test for dicts of smaller length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "c": 3})
expected = Index([1, "b", 3, "d"])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with bigger length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
expected = Index([1, 2, 3, 4])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with no items from old categories
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"f": 1, "g": 3})
expected = Index(["a", "b", "c", "d"])
tm.assert_index_equal(res.categories, expected)
def test_reorder_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
)
res = cat.reorder_categories(["c", "b", "a"])
# cat must be the same as before
tm.assert_categorical_equal(cat, old)
# only res is changed
tm.assert_categorical_equal(res, new)
@pytest.mark.parametrize(
"new_categories",
[
["a"], # not all "old" included in "new"
["a", "b", "d"], # still not all "old" in "new"
["a", "b", "c", "d"], # all "old" included in "new", but too long
],
)
def test_reorder_categories_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
msg = "items in new_categories are not the same as in old categories"
with pytest.raises(ValueError, match=msg):
cat.reorder_categories(new_categories)
def test_add_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
)
res = cat.add_categories("d")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.add_categories(["d"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# GH 9927
cat = Categorical(list("abc"), ordered=True)
expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
# test with Series, np.array, index, list
res = cat.add_categories(Series(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(np.array(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(Index(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(["d", "e"])
tm.assert_categorical_equal(res, expected)
def test_add_categories_existing_raises(self):
# new is in old categories
cat = Categorical(["a", "b", "c", "d"], ordered=True)
msg = re.escape("new categories must not include old categories: {'d'}")
with pytest.raises(ValueError, match=msg):
cat.add_categories(["d"])
def test_add_categories_losing_dtype_information(self):
# GH#48812
cat = Categorical(Series([1, 2], dtype="Int64"))
ser = Series([4], dtype="Int64")
result = cat.add_categories(ser)
expected = Categorical(
Series([1, 2], dtype="Int64"), categories=Series([1, 2, 4], dtype="Int64")
)
tm.assert_categorical_equal(result, expected)
cat = Categorical(Series(["a", "b", "a"], dtype=StringDtype()))
ser = Series(["d"], dtype=StringDtype())
result = cat.add_categories(ser)
expected = Categorical(
Series(["a", "b", "a"], dtype=StringDtype()),
categories=Series(["a", "b", "d"], dtype=StringDtype()),
)
tm.assert_categorical_equal(result, expected)
def test_set_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
exp_categories = Index(["c", "b", "a"])
exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
cat = cat.set_categories(["c", "b", "a"])
res = cat.set_categories(["a", "b", "c"])
# cat must be the same as before
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
# only res is changed
exp_categories_back = Index(["a", "b", "c"])
tm.assert_index_equal(res.categories, exp_categories_back)
tm.assert_numpy_array_equal(res.__array__(), exp_values)
# not all "old" included in "new" -> all not included ones are now
# np.nan
cat = Categorical(["a", "b", "c", "a"], ordered=True)
res = cat.set_categories(["a"])
tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
# still not all "old" in "new"
res = cat.set_categories(["a", "b", "d"])
tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
# all "old" included in "new"
cat = cat.set_categories(["a", "b", "c", "d"])
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_index_equal(cat.categories, exp_categories)
# internals...
c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
# all "pointers" to '4' must be changed from 3 to 0,...
c = c.set_categories([4, 3, 2, 1])
# positions are changed
tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
# categories are now in new order
tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
# output is the same
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
assert c.min() == 4
assert c.max() == 1
# set_categories should set the ordering if specified
c2 = c.set_categories([4, 3, 2, 1], ordered=False)
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
# set_categories should pass thru the ordering
c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_set_categories_rename_less(self):
# GH 24675
cat = Categorical(["A", "B"])
result = cat.set_categories(["A"], rename=True)
expected = Categorical(["A", np.nan])
tm.assert_categorical_equal(result, expected)
def test_set_categories_private(self):
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"])
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
# fastpath
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"], fastpath=True)
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
def test_remove_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
res = cat.remove_categories("c")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.remove_categories(["c"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
@pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
def test_remove_categories_raises(self, removals):
cat = Categorical(["a", "b", "a"])
message = re.escape("removals must all be in old categories: {'c'}")
with pytest.raises(ValueError, match=message):
cat.remove_categories(removals)
def test_remove_unused_categories(self):
c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
exp_categories_all = Index(["a", "b", "c", "d", "e"])
exp_categories_dropped = Index(["a", "b", "c", "d"])
tm.assert_index_equal(c.categories, exp_categories_all)
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, exp_categories_dropped)
tm.assert_index_equal(c.categories, exp_categories_all)
# with NaN values (GH11599)
c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(res.codes, exp_codes)
tm.assert_index_equal(c.categories, exp_categories_all)
val = ["F", np.nan, "D", "B", "D", "F", np.nan]
cat = Categorical(values=val, categories=list("ABCDEFG"))
out = cat.remove_unused_categories()
tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(out.codes, exp_codes)
assert out.tolist() == val
alpha = list("abcdefghijklmnopqrstuvwxyz")
val = np.random.default_rng(2).choice(alpha[::2], 10000).astype("object")
val[np.random.default_rng(2).choice(len(val), 100)] = np.nan
cat = Categorical(values=val, categories=alpha)
out = cat.remove_unused_categories()
assert out.tolist() == val.tolist()
class TestCategoricalAPIWithFactor:
    """``Categorical.describe`` output for string, integer and NaN-bearing data."""

    def test_describe(self):
        """describe reports per-category counts and frequencies."""
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)

        # string type
        summary = factor.describe()
        assert factor.ordered
        wanted = DataFrame(
            {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]},
            index=CategoricalIndex(
                ["a", "b", "c"], name="categories", ordered=factor.ordered
            ),
        )
        tm.assert_frame_equal(summary, wanted)

        # check unused categories
        widened = factor.copy().set_categories(["a", "b", "c", "d"])
        summary = widened.describe()
        wanted = DataFrame(
            {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
            index=CategoricalIndex(
                list("abcd"), ordered=factor.ordered, name="categories"
            ),
        )
        tm.assert_frame_equal(summary, wanted)

        # check an integer one
        ints = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        summary = ints.describe()
        wanted = DataFrame(
            {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
            index=CategoricalIndex([1, 2, 3], ordered=ints.ordered, name="categories"),
        )
        tm.assert_frame_equal(summary, wanted)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        with_nan = Categorical([np.nan, 1, 2, 2])
        summary = with_nan.describe()
        # the missing value gets its own row, listed after the categories
        nan_index = CategoricalIndex(
            [1, 2, np.nan], categories=[1, 2], name="categories"
        )
        wanted = DataFrame(
            {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
            index=nan_index,
        )
        tm.assert_frame_equal(summary, wanted)
class TestPrivateCategoricalAPI:
    """Internals: read-only ``codes`` and the ``recode_for_categories`` helper."""

    def test_codes_immutable(self):
        """``codes`` is a read-only view while the Categorical stays writeable."""
        # Codes should be read only
        c = Categorical(["a", "b", "c", "a", np.nan])
        tm.assert_numpy_array_equal(
            c.codes, np.array([0, 1, 2, 0, -1], dtype="int8")
        )

        # Assignments to codes should raise
        if PY311:
            msg = "property 'codes' of 'Categorical' object has no setter"
        else:
            msg = "can't set attribute"
        with pytest.raises(AttributeError, match=msg):
            c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")

        # changes in the codes array should raise
        view = c.codes
        with pytest.raises(ValueError, match="assignment destination is read-only"):
            view[4] = 1

        # But even after getting the codes, the original array should still be
        # writeable!
        c[4] = "a"
        tm.assert_numpy_array_equal(c.codes, np.array([0, 1, 2, 0, 0], dtype="int8"))

        c._codes[4] = 2
        tm.assert_numpy_array_equal(c.codes, np.array([0, 1, 2, 0, 2], dtype="int8"))

    @pytest.mark.parametrize(
        "codes, old, new, expected",
        [
            ([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
            ([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
            ([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
            ([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
            ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
            ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
            ([-1, -1], [], ["a", "b"], [-1, -1]),
            ([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
        ],
    )
    def test_recode_to_categories(self, codes, old, new, expected):
        """Codes are remapped from the old to the new category positions."""
        code_array = np.asanyarray(codes, dtype=np.int8)
        wanted = np.asanyarray(expected, dtype=np.int8)

        result = recode_for_categories(code_array, Index(old), Index(new))
        tm.assert_numpy_array_equal(result, wanted)

    def test_recode_to_categories_large(self):
        """With 1000 categories the recoded result is int16."""
        N = 1000
        codes = np.arange(N)
        # reversing the categories reverses the codes
        reversed_codes = np.arange(N - 1, -1, -1, dtype=np.int16)

        result = recode_for_categories(codes, Index(codes), Index(reversed_codes))
        tm.assert_numpy_array_equal(result, reversed_codes)

View File

@ -0,0 +1,155 @@
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
DatetimeIndex,
Interval,
NaT,
Period,
Timestamp,
array,
to_datetime,
)
import pandas._testing as tm
class TestAstype:
@pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
@pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), NaT]])
def test_astype_nan_to_int(self, cls, values):
# GH#28406
obj = cls(values)
msg = "Cannot (cast|convert)"
with pytest.raises((ValueError, TypeError), match=msg):
obj.astype(int)
@pytest.mark.parametrize(
"expected",
[
array(["2019", "2020"], dtype="datetime64[ns, UTC]"),
array([0, 0], dtype="timedelta64[ns]"),
array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"),
array([Interval(0, 1), Interval(1, 2)], dtype="interval"),
array([1, np.nan], dtype="Int64"),
],
)
def test_astype_category_to_extension_dtype(self, expected):
# GH#28668
result = expected.astype("category").astype(expected.dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"dtype, expected",
[
(
"datetime64[ns]",
np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"),
),
(
"datetime64[ns, MET]",
DatetimeIndex([Timestamp("2015-01-01 00:00:00+0100", tz="MET")]).array,
),
],
)
def test_astype_to_datetime64(self, dtype, expected):
# GH#28448
result = Categorical(["2015-01-01"]).astype(dtype)
assert result == expected
def test_astype_str_int_categories_to_nullable_int(self):
# GH#39616
dtype = CategoricalDtype([str(i) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Int64")
expected = array(codes, dtype="Int64")
tm.assert_extension_array_equal(res, expected)
def test_astype_str_int_categories_to_nullable_float(self):
# GH#39616
dtype = CategoricalDtype([str(i / 2) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Float64")
expected = array(codes, dtype="Float64") / 2
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list("abbaaccc"), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)
msg = r"Cannot cast object|string dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)
# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(int)
expected = np.array(cat, dtype="int")
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(float)
expected = np.array(cat, dtype=float)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("cat_ordered", [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH#10696/GH#18593
data = list("abcaacbab")
cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)
# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)
# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype("category")
expected = cat
tm.assert_categorical_equal(result, expected)
def test_astype_object_datetime_categories(self):
# GH#40754
cat = Categorical(to_datetime(["2021-03-27", NaT]))
result = cat.astype(object)
expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_object_timestamp_categories(self):
# GH#18024
cat = Categorical([Timestamp("2014-01-01")])
result = cat.astype(object)
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_category_readonly_mask_values(self):
# GH#53658
arr = array([0, 1, 2], dtype="Int64")
arr._mask.flags["WRITEABLE"] = False
result = arr.astype("category")
expected = array([0, 1, 2], dtype="Int64").astype("category")
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,783 @@
from datetime import (
date,
datetime,
)
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm
class TestCategoricalConstructors:
def test_fastpath_deprecated(self):
codes = np.array([1, 2, 3])
dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False)
msg = "The 'fastpath' keyword in Categorical is deprecated"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
Categorical(codes, dtype=dtype, fastpath=True)
def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
# GH#49309 we should preserve orderedness in `res`
cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)
res = Categorical(cat, dtype="category")
assert res.dtype.ordered
def test_categorical_disallows_scalar(self):
# GH#38433
with pytest.raises(TypeError, match="Categorical input must be list-like"):
Categorical("A", categories=["A", "B"])
def test_categorical_1d_only(self):
# ndim > 1
msg = "> 1 ndim Categorical are not supported at this time"
with pytest.raises(NotImplementedError, match=msg):
Categorical(np.array([list("abcd")]))
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError
# This should be a boolean.
ordered = np.array([0, 1, 2])
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)
with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes(
[0, 0, 1], categories=["a", "b", "c"], ordered=ordered
)
def test_constructor_empty(self):
# GH 17248
c = Categorical([])
expected = Index([])
tm.assert_index_equal(c.categories, expected)
c = Categorical([], categories=[1, 2, 3])
expected = Index([1, 2, 3], dtype=np.int64)
tm.assert_index_equal(c.categories, expected)
def test_constructor_empty_boolean(self):
# see gh-22702
cat = Categorical([], categories=[True, False])
categories = sorted(cat.categories.tolist())
assert categories == [False, True]
def test_constructor_tuples(self):
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
result = Categorical(values)
expected = Index([(1,), (1, 2)], tupleize_cols=False)
tm.assert_index_equal(result.categories, expected)
assert result.ordered is False
def test_constructor_tuples_datetimes(self):
# numpy will auto reshape when all of the tuples are the
# same len, so add an extra one with 2 items and slice it off
values = np.array(
[
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
("a", "b"),
],
dtype=object,
)[:-1]
result = Categorical(values)
expected = Index(
[(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
tupleize_cols=False,
)
tm.assert_index_equal(result.categories, expected)
def test_constructor_unsortable(self):
# it works!
arr = np.array([1, 2, 3, datetime.now()], dtype="O")
factor = Categorical(arr, ordered=False)
assert not factor.ordered
# this however will raise as cannot be sorted
msg = (
"'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument."
)
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)
def test_constructor_interval(self):
result = Categorical(
[Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
)
ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
exp = Categorical(ii, ordered=True)
tm.assert_categorical_equal(result, exp)
tm.assert_index_equal(result.categories, ii)
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
c1 = Categorical(exp_arr)
tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["c", "b", "a"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
# categories must be unique
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])
# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
assert not c1.ordered
# Categorical as input
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
# Series of dtype category
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
# Series
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(Series(["a", "b", "c", "a"]))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)
# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
# https://github.com/pandas-dev/pandas/issues/3678
cat = Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
# this should result in floats
cat = Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)
cat = Categorical([np.nan, 1.0, 2.0, 3.0])
assert is_float_dtype(cat.categories)
# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...
# vals = np.asarray(cat[cat.notna()])
# assert is_integer_dtype(vals)
# corner cases
cat = Categorical([1])
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
cat = Categorical(["a"])
assert len(cat.categories) == 1
assert cat.categories[0] == "a"
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
# the next one are from the old docs
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])
# this is a legitimate constructor
with tm.assert_produces_warning(None):
Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)
def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)
def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(["a", "b"], categories="a")
def test_constructor_with_null(self):
# Cannot have NaN in categories
msg = "Categorical categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical(
DatetimeIndex(["nat", "20160101"]),
categories=[NaT, Timestamp("20160101")],
)
def test_constructor_with_index(self):
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(ci.values, Categorical(ci))
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(
ci.values, Categorical(ci.astype(object), categories=ci.categories)
)
def test_constructor_with_generator(self):
# This was raising an Error in isna(single_val).any() because isna
# returned a scalar for a generator
exp = Categorical([0, 1, 2])
cat = Categorical(x for x in [0, 1, 2])
tm.assert_categorical_equal(cat, exp)
cat = Categorical(range(3))
tm.assert_categorical_equal(cat, exp)
MultiIndex.from_product([range(5), ["a", "b", "c"]])
# check that categories accept generators and sequences
cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical([0, 1, 2], categories=range(3))
tm.assert_categorical_equal(cat, exp)
def test_constructor_with_rangeindex(self):
# RangeIndex is preserved in Categories
rng = Index(range(3))
cat = Categorical(rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
cat = Categorical([1, 2, 0], categories=rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
@pytest.mark.parametrize(
"dtl",
[
date_range("1995-01-01 00:00:00", periods=5, freq="s"),
date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
timedelta_range("1 day", periods=5, freq="s"),
],
)
def test_constructor_with_datetimelike(self, dtl):
# see gh-12077
# constructor with a datetimelike and NaT
s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
# with NaT
s2 = s.copy()
s2.iloc[-1] = NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
tm.assert_numpy_array_equal(c.codes, exp)
result = repr(c)
assert "NaT" in result
def test_constructor_from_index_series_datetimetz(self):
idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_date_objects(self):
# we dont cast date objects to timestamps, matching Index constructor
v = date.today()
cat = Categorical([v, v])
assert cat.categories.dtype == object
assert type(cat.categories[0]) is date
def test_constructor_from_index_series_timedelta(self):
idx = timedelta_range("1 days", freq="D", periods=3)
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_period(self):
idx = period_range("2015-01-01", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
@pytest.mark.parametrize(
"values",
[
np.array([1.0, 1.2, 1.8, np.nan]),
np.array([1, 2, 3], dtype="int64"),
["a", "b", "c", np.nan],
[pd.Period("2014-01"), pd.Period("2014-02"), NaT],
[Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
[
Timestamp("2014-01-01", tz="US/Eastern"),
Timestamp("2014-01-02", tz="US/Eastern"),
NaT,
],
],
)
def test_constructor_invariant(self, values):
# GH 14190
c = Categorical(values)
c2 = Categorical(c)
tm.assert_categorical_equal(c, c2)
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_with_dtype(self, ordered):
categories = ["b", "a", "c"]
dtype = CategoricalDtype(categories, ordered=ordered)
result = Categorical(["a", "b", "a", "c"], dtype=dtype)
expected = Categorical(
["a", "b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
assert result.ordered is ordered
def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(["a", "b"], ordered=True)
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=True, dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=False, dtype=dtype)
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
assert all(isinstance(x, np.str_) for x in cat.categories)
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_with_unknown_dtype(self):
dtype = CategoricalDtype(None, ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use values.categories, not dtype.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "d"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_constructor_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
def test_construction_with_null(self, klass, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/31927
values = klass(["a", nulls_fixture, "b"])
result = Categorical(values)
dtype = CategoricalDtype(["a", "b"])
codes = [0, -1, 1]
expected = Categorical.from_codes(codes=codes, dtype=dtype)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate):
# GH#39649
cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
codes = np.random.default_rng(2).integers(5, size=3)
dtype = CategoricalDtype(cats)
arr = Categorical.from_codes(codes, dtype=dtype, validate=validate)
assert arr.categories.dtype == cats.dtype
tm.assert_index_equal(arr.categories, Index(cats))
def test_from_codes_empty(self):
cat = ["a", "b", "c"]
result = Categorical.from_codes([], categories=cat)
expected = Categorical([], categories=cat)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_validate(self, validate):
# GH53122
dtype = CategoricalDtype(["a", "b"])
if validate:
with pytest.raises(ValueError, match="codes need to be between "):
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
else:
# passes, though has incorrect codes, but that's the user responsibility
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
def test_from_codes_too_few_categories(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)
def test_from_codes_non_int_codes(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)
def test_from_codes_non_unique_categories(self):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
def test_from_codes_nan_cat_included(self):
with pytest.raises(ValueError, match="Categorical categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
def test_from_codes_too_negative(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)
def test_from_codes(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_categorical_categories(self, klass):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_non_unique_categorical_categories(self, klass):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], klass(["a", "b", "a"]))
def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=["a", "b", "c"])
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
@pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
def test_from_codes_with_float(self, codes):
# GH21767
# float codes should raise even if values are equal to integers
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_dtype_raises(self):
msg = "Cannot specify"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
)
def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])
def test_from_codes_with_nullable_int(self):
codes = pd.array([0, 1], dtype="Int64")
categories = ["a", "b"]
result = Categorical.from_codes(codes, categories=categories)
expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)
tm.assert_categorical_equal(result, expected)
def test_from_codes_with_nullable_int_na_raises(self):
codes = pd.array([0, None], dtype="Int64")
categories = ["a", "b"]
msg = "codes cannot contain NA values"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, categories=categories)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories(self, dtype):
cats = ["a", "b"]
codes = np.array([0, 0, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories_sorts(self, dtype):
cats = ["b", "a"]
codes = np.array([0, 1, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("ordered", [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)
def test_constructor_imaginary(self):
values = [1, 2, 3 + 1j]
c1 = Categorical(values)
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))
def test_constructor_string_and_tuples(self):
# GH 21416
c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
expected_index = Index([("a", "b"), ("b", "a"), "c"])
assert c.categories.equals(expected_index)
def test_interval(self):
idx = pd.interval_range(0, 10, periods=10)
cat = Categorical(idx, categories=idx)
expected_codes = np.arange(10, dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# infer categories
cat = Categorical(idx)
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values
cat = Categorical(list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values, categories
cat = Categorical(list(idx), categories=list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# shuffled
values = idx.take([1, 2, 0])
cat = Categorical(values, categories=idx)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
tm.assert_index_equal(cat.categories, idx)
# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# overlapping
idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
cat = Categorical(idx, categories=idx)
expected_codes = np.array([0, 1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence(
[nulls_fixture] * 2, dtype=pd.StringDtype()
)
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
tm.assert_categorical_equal(result, expected)
def test_from_sequence_copy(self):
cat = Categorical(np.arange(5).repeat(2))
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False)
# more generally, we'd be OK with a view
assert result._codes is cat._codes
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True)
assert not tm.shares_memory(result, cat)
def test_constructor_datetime64_non_nano(self):
categories = np.arange(10).view("M8[D]")
values = categories[::2].copy()
cat = Categorical(values, categories=categories)
assert (cat == values).all()
def test_constructor_preserves_freq(self):
# GH33830 freq retention in categorical
dti = date_range("2016-01-01", periods=5)
expected = dti.freq
cat = Categorical(dti)
result = cat.categories.freq
assert expected == result

View File

@ -0,0 +1,139 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import (
Categorical,
CategoricalIndex,
Index,
IntervalIndex,
Series,
Timestamp,
)
import pandas._testing as tm
class TestCategoricalDtypes:
    """Tests for dtype-related behaviour of ``Categorical``.

    Covers dtype comparison, ``_set_dtype``, the integer width chosen for
    the codes, and the Python types produced when iterating.
    """

    def test_categories_match_up_to_permutation(self):
        """Same categories and orderedness match, regardless of category order."""
        # test dtype comparisons between cats

        c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
        c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
        c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
        assert c1._categories_match_up_to_permutation(c1)
        assert c2._categories_match_up_to_permutation(c2)
        assert c3._categories_match_up_to_permutation(c3)
        assert c1._categories_match_up_to_permutation(c2)
        assert not c1._categories_match_up_to_permutation(c3)
        assert not c1._categories_match_up_to_permutation(Index(list("aabca")))
        assert not c1._categories_match_up_to_permutation(c1.astype(object))
        assert c1._categories_match_up_to_permutation(CategoricalIndex(c1))
        assert c1._categories_match_up_to_permutation(
            CategoricalIndex(c1, categories=list("cab"))
        )
        assert not c1._categories_match_up_to_permutation(
            CategoricalIndex(c1, ordered=True)
        )

        # GH 16659
        s1 = Series(c1)
        s2 = Series(c2)
        s3 = Series(c3)
        assert c1._categories_match_up_to_permutation(s1)
        assert c2._categories_match_up_to_permutation(s2)
        assert c3._categories_match_up_to_permutation(s3)
        assert c1._categories_match_up_to_permutation(s2)
        assert not c1._categories_match_up_to_permutation(s3)
        assert not c1._categories_match_up_to_permutation(s1.astype(object))

    def test_set_dtype_same(self):
        """Setting an equal dtype returns an equal Categorical."""
        c = Categorical(["a", "b", "c"])
        result = c._set_dtype(CategoricalDtype(["a", "b", "c"]))
        tm.assert_categorical_equal(result, c)

    def test_set_dtype_new_categories(self):
        """Appending a category leaves the codes untouched."""
        c = Categorical(["a", "b", "c"])
        result = c._set_dtype(CategoricalDtype(list("abcd")))
        tm.assert_numpy_array_equal(result.codes, c.codes)
        tm.assert_index_equal(result.dtype.categories, Index(list("abcd")))

    @pytest.mark.parametrize(
        "values, categories, new_categories",
        [
            # No NaNs, same cats, same order
            (["a", "b", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["a", "b", "a"], ["a", "b"], ["b", "a"]),
            # Unsorted values, same cats, same order
            (["b", "a", "a"], ["a", "b"], ["a", "b"]),
            # Unsorted values, same cats, different order
            (["b", "a", "a"], ["a", "b"], ["b", "a"]),
            # NaNs ("c" is not among the categories)
            (["a", "b", "c"], ["a", "b"], ["a", "b"]),
            (["a", "b", "c"], ["a", "b"], ["b", "a"]),
            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
            # The original repeated the previous row here, so the
            # "unsorted values + reordered categories" case was never run.
            (["b", "a", "c"], ["a", "b"], ["b", "a"]),
            # Introduce NaNs
            (["a", "b", "c"], ["a", "b"], ["a"]),
            (["a", "b", "c"], ["a", "b"], ["b"]),
            (["b", "a", "c"], ["a", "b"], ["a"]),
            # Likewise a repeated row in the original; now keeps only "b".
            (["b", "a", "c"], ["a", "b"], ["b"]),
            # No overlap
            (["a", "b", "c"], ["a", "b"], ["d", "e"]),
        ],
    )
    @pytest.mark.parametrize("ordered", [True, False])
    def test_set_dtype_many(self, values, categories, new_categories, ordered):
        """``_set_dtype`` matches constructing directly with the new categories.

        Parameters
        ----------
        values : list of str
            Raw values for the Categorical.
        categories : list of str
            Categories of the starting Categorical.
        new_categories : list of str
            Categories of the target dtype.
        ordered : bool
            Orderedness of the target dtype.
        """
        c = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = c._set_dtype(expected.dtype)
        tm.assert_categorical_equal(result, expected)

    def test_set_dtype_no_overlap(self):
        """With no values among old or new categories, the result is all missing."""
        c = Categorical(["a", "b", "c"], ["d", "e"])
        result = c._set_dtype(CategoricalDtype(["a", "b"]))
        expected = Categorical([None, None, None], categories=["a", "b"])
        tm.assert_categorical_equal(result, expected)

    def test_codes_dtypes(self):
        """The codes use the smallest integer dtype that fits the categories."""
        # GH 8453
        result = Categorical(["foo", "bar", "baz"])
        assert result.codes.dtype == "int8"

        result = Categorical([f"foo{i:05d}" for i in range(400)])
        assert result.codes.dtype == "int16"

        result = Categorical([f"foo{i:05d}" for i in range(40000)])
        assert result.codes.dtype == "int32"

        # adding cats
        result = Categorical(["foo", "bar", "baz"])
        assert result.codes.dtype == "int8"
        result = result.add_categories([f"foo{i:05d}" for i in range(400)])
        assert result.codes.dtype == "int16"

        # removing cats
        result = result.remove_categories([f"foo{i:05d}" for i in range(300)])
        assert result.codes.dtype == "int8"

    def test_iter_python_types(self):
        """Iteration and ``tolist`` yield Python ``int`` objects."""
        # GH-19909
        cat = Categorical([1, 2])
        assert isinstance(next(iter(cat)), int)
        assert isinstance(cat.tolist()[0], int)

    def test_iter_python_types_datetime(self):
        """Iteration and ``tolist`` yield ``Timestamp`` objects."""
        cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")])
        assert isinstance(next(iter(cat)), Timestamp)
        assert isinstance(cat.tolist()[0], Timestamp)

    def test_interval_index_category(self):
        """A ``uint64`` ``IntervalIndex`` keeps its subtype in the categories."""
        # GH 38316
        index = IntervalIndex.from_breaks(np.arange(3, dtype="uint64"))

        result = CategoricalIndex(index).dtype.categories
        expected = IntervalIndex.from_arrays(
            [0, 1], [1, 2], dtype="interval[uint64, right]"
        )
        tm.assert_index_equal(result, expected)

View File

@ -0,0 +1,388 @@
import math
import numpy as np
import pytest
from pandas import (
NA,
Categorical,
CategoricalIndex,
Index,
Interval,
IntervalIndex,
NaT,
PeriodIndex,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
import pandas.core.common as com
class TestCategoricalIndexingWithFactor:
    """Tests for ``__getitem__`` and ``__setitem__`` on ``Categorical``."""

    def test_getitem(self):
        """Scalar, list and boolean-mask indexing return the expected values."""
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
        assert factor[0] == "a"
        assert factor[-1] == "c"

        subf = factor[[0, 1, 2]]
        tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8))

        subf = factor[np.asarray(factor) == "c"]
        tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8))

    def test_setitem(self):
        """Positional and boolean-mask assignment update the values."""
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
        # int/positional
        c = factor.copy()
        c[0] = "b"
        assert c[0] == "b"
        c[-1] = "a"
        assert c[-1] == "a"

        # boolean
        c = factor.copy()
        indexer = np.zeros(len(c), dtype="bool")
        indexer[0] = True
        indexer[-1] = True
        c[indexer] = "c"
        expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True)

        tm.assert_categorical_equal(c, expected)

    @pytest.mark.parametrize(
        "other",
        [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
    )
    def test_setitem_same_but_unordered(self, other):
        """Unordered categoricals with the same category set can be assigned."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"])
        mask = np.array([True, False])
        target[mask] = other[mask]
        expected = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_categorical_equal(target, expected)

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"], categories=["b", "a", "c"]),
            Categorical(["b", "a"], categories=["a", "b", "c"]),
            Categorical(["a", "a"], categories=["a"]),
            Categorical(["b", "b"], categories=["b"]),
        ],
    )
    def test_setitem_different_unordered_raises(self, other):
        """Assigning from a different category set raises ``TypeError``."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"])
        mask = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[mask] = other[mask]

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"]),
            Categorical(["b", "a"], categories=["b", "a"], ordered=True),
            Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
        ],
    )
    def test_setitem_same_ordered_raises(self, other):
        """An ordered target rejects sources whose dtype is not identical."""
        # GH-24142
        target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
        mask = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[mask] = other[mask]

    def test_setitem_tuple(self):
        """A tuple-valued category can be assigned as a single element."""
        # GH#20439
        cat = Categorical([(0, 1), (0, 2), (0, 1)])

        # This should not raise
        cat[1] = cat[0]
        assert cat[1] == (0, 1)

    def test_setitem_listlike(self):
        """An ``int64`` array indexer beyond the ``int8`` range is handled."""
        # GH#9469
        # properly coerce the input indexers

        cat = Categorical(
            np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
        ).add_categories([-1000])
        indexer = np.array([100000]).astype(np.int64)
        cat[indexer] = -1000

        # we are asserting the code result here
        # which maps to the -1000 category
        result = cat.codes[np.array([100000]).astype(np.int64)]
        tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))
class TestCategoricalIndexing:
    """Indexing, ``PeriodIndex`` construction and ``Series.where`` checks."""

    def test_getitem_slice(self):
        """A scalar lookup returns the value; a slice keeps all categories."""
        cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
        assert cat[3] == "d"

        window = cat[3:5]
        expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
        tm.assert_categorical_equal(window, expected)

    def test_getitem_listlike(self):
        """Indexing the categorical and indexing its codes agree."""
        # GH 9469
        # properly coerce the input indexers
        raw = np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
        c = Categorical(raw)
        result = c.codes[np.array([100000]).astype(np.int64)]
        expected = c[np.array([100000]).astype(np.int64)].codes
        tm.assert_numpy_array_equal(result, expected)

    def test_periodindex(self):
        """Categoricals built from a ``PeriodIndex`` get sorted period categories."""
        monthly = PeriodIndex(
            ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"],
            freq="M",
        )
        cat1 = Categorical(monthly)
        str(cat1)
        codes1 = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
        categories1 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
        tm.assert_numpy_array_equal(cat1._codes, codes1)
        tm.assert_index_equal(cat1.categories, categories1)

        shuffled = PeriodIndex(
            ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"],
            freq="M",
        )
        cat2 = Categorical(shuffled, ordered=True)
        str(cat2)
        codes2 = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
        categories2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
        tm.assert_numpy_array_equal(cat2._codes, codes2)
        tm.assert_index_equal(cat2.categories, categories2)

        descending = PeriodIndex(
            [
                "2013-12",
                "2013-11",
                "2013-10",
                "2013-09",
                "2013-08",
                "2013-07",
                "2013-05",
            ],
            freq="M",
        )
        cat3 = Categorical(descending, ordered=True)
        codes3 = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
        categories3 = PeriodIndex(
            [
                "2013-05",
                "2013-07",
                "2013-08",
                "2013-09",
                "2013-10",
                "2013-11",
                "2013-12",
            ],
            freq="M",
        )
        tm.assert_numpy_array_equal(cat3._codes, codes3)
        tm.assert_index_equal(cat3.categories, categories3)

    @pytest.mark.parametrize(
        "null_val",
        [None, np.nan, NaT, NA, math.nan, "NaT", "nat", "NAT", "nan", "NaN", "NAN"],
    )
    def test_periodindex_on_null_types(self, null_val):
        """Every parametrized null spelling ends up as ``NaT`` in a ``PeriodIndex``."""
        # GH 46673
        result = PeriodIndex(["2022-04-06", "2022-04-07", null_val], freq="D")
        expected = PeriodIndex(["2022-04-06", "2022-04-07", "NaT"], dtype="period[D]")
        assert result[2] is NaT
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
    def test_categories_assignments_wrong_length_raises(self, new_categories):
        """``rename_categories`` rejects a list of the wrong length."""
        cat = Categorical(["a", "b", "c", "a"])
        msg = (
            "new categories need to have the same number of items "
            "as the old categories!"
        )
        with pytest.raises(ValueError, match=msg):
            cat.rename_categories(new_categories)

    # Combinations of sorted/unique:
    @pytest.mark.parametrize(
        "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
    )
    # Combinations of missing/unique
    @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
    @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
    @pytest.mark.parametrize("dtype", [None, "category", "key"])
    def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
        """A categorical key and the plain list it was built from index identically."""
        # GH 21448
        key = key_class(key_values, categories=range(1, 5))
        # "key" is a placeholder meaning "use the key's own categorical dtype"
        index_dtype = key.dtype if dtype == "key" else dtype

        # Test for flat index and CategoricalIndex with same/different cats:
        idx = Index(idx_values, dtype=index_dtype)
        expected, exp_miss = idx.get_indexer_non_unique(key_values)
        result, res_miss = idx.get_indexer_non_unique(key)

        tm.assert_numpy_array_equal(expected, result)
        tm.assert_numpy_array_equal(exp_miss, res_miss)

        exp_unique = idx.unique().get_indexer(key_values)
        res_unique = idx.unique().get_indexer(key)
        tm.assert_numpy_array_equal(res_unique, exp_unique)

    def test_where_unobserved_nan(self):
        """Masked-out positions become missing; categories are kept."""
        ser = Series(Categorical(["a", "b"]))
        partly_masked = ser.where([True, False])
        expected = Series(Categorical(["a", None], categories=["a", "b"]))
        tm.assert_series_equal(partly_masked, expected)

        # all NA
        ser = Series(Categorical(["a", "b"]))
        fully_masked = ser.where([False, False])
        expected = Series(Categorical([None, None], categories=["a", "b"]))
        tm.assert_series_equal(fully_masked, expected)

    def test_where_unobserved_categories(self):
        """``other`` may be an existing category; the category list is unchanged."""
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        result = ser.where([True, True, False], other="b")
        expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
        tm.assert_series_equal(result, expected)

    def test_where_other_categorical(self):
        """``other`` may be a categorical listing the same categories in another order."""
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
        result = ser.where([True, False, True], other)
        expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
        tm.assert_series_equal(result, expected)

    def test_where_new_category_raises(self):
        """Filling with a value that is not a category raises ``TypeError``."""
        ser = Series(Categorical(["a", "b", "c"]))
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            ser.where([True, False, True], "d")

    def test_where_ordered_differs_rasies(self):
        """Ordered categoricals whose category order differs cannot be combined."""
        ser = Series(
            Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
        )
        other = Categorical(
            ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
        )
        with pytest.raises(TypeError, match="without identical categories"):
            ser.where([True, False, True], other)
class TestContains:
    """``in`` / ``__contains__`` behaviour of ``Categorical``."""

    def test_contains(self):
        """Membership is tested against values, not against integer codes."""
        # GH#21508
        cat = Categorical(list("aabbca"), categories=list("cab"))

        assert "b" in cat
        assert "z" not in cat
        assert np.nan not in cat
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in cat

        # assert codes NOT in index
        assert 0 not in cat
        assert 1 not in cat

        # once a missing value is present, NaN is reported as contained
        with_missing = Categorical(list("aabbca") + [np.nan], categories=list("cab"))
        assert np.nan in with_missing

    @pytest.mark.parametrize(
        "item, expected",
        [
            (Interval(0, 1), True),
            (1.5, True),
            (Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership checks against a categorical of intervals."""
        # GH#23705
        cat = Categorical(IntervalIndex.from_breaks(range(3)))
        found = item in cat
        assert found is expected

    def test_contains_list(self):
        """A list on the left-hand side of ``in`` raises ``TypeError``."""
        # GH#21729
        cat = Categorical([1, 2, 3])

        assert "a" not in cat

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in cat
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
    """A boolean ``Categorical``/``CategoricalIndex`` selects like its object form."""
    ser = Series(range(3))
    mask = Categorical([True, False, True])
    if index:
        mask = CategoricalIndex(mask)

    assert com.is_bool_indexer(mask)
    selected = ser[mask]
    expected = ser[mask.astype("object")]
    tm.assert_series_equal(selected, expected)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_na_treated_as_false(index):
    """A missing entry in a boolean categorical mask selects like ``False``."""
    # https://github.com/pandas-dev/pandas/issues/31503
    ser = Series(range(3))
    mask = Categorical([True, False, None])
    if index:
        mask = CategoricalIndex(mask)

    selected = ser[mask]
    expected = ser[mask.fillna(False)]

    tm.assert_series_equal(selected, expected)
@pytest.fixture
def non_coercible_categorical(monkeypatch):
    """
    Monkeypatch Categorical.__array__ to ensure no implicit conversion.

    The patch is applied inside ``monkeypatch.context()`` and is active only
    while the fixture is suspended at its ``yield``.

    Raises
    ------
    ValueError
        When Categorical.__array__ is called.
    """

    # TODO(Categorical): identify other places where this may be
    # useful and move to a conftest.py
    def array(self, dtype=None, copy=None):
        # ``copy`` is accepted (and ignored) so the replacement matches the
        # NumPy >= 2 ``__array__(dtype=None, copy=None)`` protocol; callers
        # that pass ``copy=`` then reach the ValueError below directly.
        raise ValueError("I cannot be converted.")

    with monkeypatch.context() as m:
        m.setattr(Categorical, "__array__", array)
        yield
def test_series_at():
    """``Series.at`` returns the scalar value of a categorical-backed Series."""
    ser = Series(Categorical(["a", "b", "c"]))
    first = ser.at[0]
    assert first == "a"

View File

@ -0,0 +1,154 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
Index,
Series,
)
import pandas._testing as tm
@pytest.fixture(params=[None, "ignore"])
def na_action(request):
    """Parametrized fixture yielding each ``na_action`` value: None, "ignore"."""
    chosen = request.param
    return chosen
@pytest.mark.parametrize(
    "data, categories",
    [
        (list("abcbca"), list("cab")),
        (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
    ],
    ids=["string", "interval"],
)
def test_map_str(data, categories, ordered, na_action):
    """Mapping ``str`` over a categorical keeps it categorical with ``ordered`` intact."""
    # GH 31202 - override base class since we want to maintain categorical/ordered
    source = Categorical(data, categories=categories, ordered=ordered)
    result = source.map(str, na_action=na_action)

    str_values = map(str, data)
    str_categories = map(str, categories)
    expected = Categorical(str_values, categories=str_categories, ordered=ordered)
    tm.assert_categorical_equal(result, expected)
def test_map(na_action):
    """``Categorical.map`` with callables, a Series and a dict as the mapper."""
    ordered_cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
    lowered = ordered_cat.map(lambda x: x.lower(), na_action=na_action)
    want = Categorical(list("ababc"), categories=list("cba"), ordered=True)
    tm.assert_categorical_equal(lowered, want)

    cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
    lowered = cat.map(lambda x: x.lower(), na_action=na_action)
    want = Categorical(list("ababc"), categories=list("bac"), ordered=False)
    tm.assert_categorical_equal(lowered, want)

    # GH 12766: Return an index not an array
    constant = cat.map(lambda x: 1, na_action=na_action)
    want = Index(np.array([1] * 5, dtype=np.int64))
    tm.assert_index_equal(constant, want)

    # change categories dtype
    cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)

    def f(x):
        return {"A": 10, "B": 20, "C": 30}.get(x)

    want = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)

    via_function = cat.map(f, na_action=na_action)
    tm.assert_categorical_equal(via_function, want)

    mapper = Series([10, 20, 30], index=["A", "B", "C"])
    via_series = cat.map(mapper, na_action=na_action)
    tm.assert_categorical_equal(via_series, want)

    via_dict = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action)
    tm.assert_categorical_equal(via_dict, want)
@pytest.mark.parametrize(
("data", "f", "expected"),
(
([1, 1, np.nan], pd.isna, Index([False, False, True])),
([1, 2, np.nan], pd.isna, Index([False, False, True])),
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
(
[1, 1, np.nan],
Series([False, False]),
Categorical([False, False, np.nan]),
),
(
[1, 2, np.nan],
Series([False] * 3),
Index([False, False, np.nan]),
),
),
)
def test_map_with_nan_none(data, f, expected): # GH 24241
values = Categorical(data)
result = values.map(f, na_action=None)
if isinstance(expected, Categorical):
tm.assert_categorical_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
("data", "f", "expected"),
(
([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])),
([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
(
[1, 1, np.nan],
Series([False, False]),
Categorical([False, False, np.nan]),
),
(
[1, 2, np.nan],
Series([False, False, False]),
Index([False, False, np.nan]),
),
),
)
def test_map_with_nan_ignore(data, f, expected): # GH 24241
values = Categorical(data)
result = values.map(f, na_action="ignore")
if data[1] == 1:
tm.assert_categorical_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
def test_map_with_dict_or_series(na_action):
    """A Series mapper and the equivalent dict mapper give the same categorical."""
    orig_values = ["a", "B", 1, "a"]
    new_values = ["one", 2, 3.0, "one"]
    cat = Categorical(orig_values)

    # Order of categories in result can be different
    expected = Categorical(new_values, categories=[3.0, 2, "one"])

    # the last value repeats the first one, so it is left out of the mappers
    series_mapper = Series(new_values[:-1], index=orig_values[:-1])
    from_series = cat.map(series_mapper, na_action=na_action)
    tm.assert_categorical_equal(from_series, expected)

    dict_mapper = dict(zip(orig_values[:-1], new_values[:-1]))
    from_dict = cat.map(dict_mapper, na_action=na_action)
    tm.assert_categorical_equal(from_dict, expected)
def test_map_na_action_no_default_deprecated():
# GH51645
cat = Categorical(["a", "b", "c"])
msg = (
"The default value of 'ignore' for the `na_action` parameter in "
"pandas.Categorical.map is deprecated and will be "
"changed to 'None' in a future version. Please set na_action to the "
"desired value to avoid seeing this warning"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
cat.map(lambda x: x)

View File

@ -0,0 +1,216 @@
import collections
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
class TestCategoricalMissing:
def test_isna(self):
exp = np.array([False, False, True])
cat = Categorical(["a", "b", np.nan])
res = cat.isna()
tm.assert_numpy_array_equal(res, exp)
def test_na_flags_int_categories(self):
# #1457
categories = list(range(10))
labels = np.random.default_rng(2).integers(0, 10, 20)
labels[::5] = -1
cat = Categorical(labels, categories)
repr(cat)
tm.assert_numpy_array_equal(isna(cat), labels == -1)
def test_nan_handling(self):
# Nans are represented as -1 in codes
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))
# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
def test_set_dtype_nans(self):
c = Categorical(["a", "b", np.nan])
result = c._set_dtype(CategoricalDtype(["a", "c"]))
tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))
def test_set_item_nan(self):
cat = Categorical([1, 2, 3])
cat[1] = np.nan
exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)
@pytest.mark.parametrize(
"fillna_kwargs, msg",
[
(
{"value": 1, "method": "ffill"},
"Cannot specify both 'value' and 'method'.",
),
({}, "Must specify a fill 'value' or 'method'."),
({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
(
{"value": Series([1, 2, 3, 4, "a"])},
"Cannot setitem on a Categorical with a new category",
),
],
)
def test_fillna_raises(self, fillna_kwargs, msg):
# https://github.com/pandas-dev/pandas/issues/19682
# https://github.com/pandas-dev/pandas/issues/13628
cat = Categorical([1, 2, 3, None, None])
if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
err = TypeError
else:
err = ValueError
with pytest.raises(err, match=msg):
cat.fillna(**fillna_kwargs)
@pytest.mark.parametrize("named", [True, False])
def test_fillna_iterable_category(self, named):
# https://github.com/pandas-dev/pandas/issues/21097
if named:
Point = collections.namedtuple("Point", "x y")
else:
Point = lambda *args: args # tuple
cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
result = cat.fillna(Point(0, 0))
expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
tm.assert_categorical_equal(result, expected)
# Case where the Point is not among our categories; we want ValueError,
# not NotImplementedError GH#41914
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
cat.fillna(Point(0, 0))
def test_fillna_array(self):
# accept Categorical or ndarray value if it holds appropriate values
cat = Categorical(["A", "B", "C", None, None])
other = cat.fillna("C")
result = cat.fillna(other)
tm.assert_categorical_equal(result, other)
assert isna(cat[-1]) # didn't modify original inplace
other = np.array(["A", "B", "C", "B", "A"])
result = cat.fillna(other)
expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
tm.assert_categorical_equal(result, expected)
assert isna(cat[-1]) # didn't modify original inplace
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("mode.use_inf_as_na", True):
cat = Categorical(values)
result = cat.isna()
tm.assert_numpy_array_equal(result, expected)
result = Series(cat).isna()
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = DataFrame(cat).isna()
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na_outside_context(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
# Using isna directly for Categorical will fail in general here
cat = Categorical(values)
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("mode.use_inf_as_na", True):
result = isna(cat)
tm.assert_numpy_array_equal(result, expected)
result = isna(Series(cat))
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = isna(DataFrame(cat))
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"a1, a2, categories",
[
(["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
],
)
def test_compare_categorical_with_missing(self, a1, a2, categories):
# GH 28384
cat_type = CategoricalDtype(categories)
# !=
result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
expected = Series(a1) != Series(a2)
tm.assert_series_equal(result, expected)
# ==
result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
expected = Series(a1) == Series(a2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"na_value, dtype",
[
(pd.NaT, "datetime64[ns]"),
(None, "float64"),
(np.nan, "float64"),
(pd.NA, "float64"),
],
)
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
# GH#44900
result = Categorical([na_value, na_value])
tm.assert_index_equal(result.categories, Index([], dtype=dtype))

View File

@ -0,0 +1,414 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCategoricalOpsWithFactor:
    """Comparison operators on a small ordered ``Categorical``."""

    def test_categories_none_comparisons(self):
        """A categorical compares equal to itself."""
        letters = ["a", "b", "b", "a", "a", "c", "c", "c"]
        factor = Categorical(letters, ordered=True)
        tm.assert_categorical_equal(factor, factor)

    def test_comparisons(self):
        """Scalar, categorical and array-like comparisons."""
        letters = ["a", "b", "b", "a", "a", "c", "c", "c"]
        factor = Categorical(letters, ordered=True)

        # masks from the categorical must select like masks from its ndarray
        result = factor[factor == "a"]
        expected = factor[np.asarray(factor) == "a"]
        tm.assert_categorical_equal(result, expected)

        result = factor[factor != "a"]
        expected = factor[np.asarray(factor) != "a"]
        tm.assert_categorical_equal(result, expected)

        result = factor[factor < "c"]
        expected = factor[np.asarray(factor) < "c"]
        tm.assert_categorical_equal(result, expected)

        result = factor[factor > "a"]
        expected = factor[np.asarray(factor) > "a"]
        tm.assert_categorical_equal(result, expected)

        result = factor[factor >= "b"]
        expected = factor[np.asarray(factor) >= "b"]
        tm.assert_categorical_equal(result, expected)

        result = factor[factor <= "b"]
        expected = factor[np.asarray(factor) <= "b"]
        tm.assert_categorical_equal(result, expected)

        size = len(factor)

        shuffled = factor[np.random.default_rng(2).permutation(size)]
        result = factor == shuffled
        expected = np.asarray(factor) == np.asarray(shuffled)
        tm.assert_numpy_array_equal(result, expected)

        # equality with a value that is not a category is all-False
        result = factor == "d"
        expected = np.zeros(len(factor), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

        # comparisons with categoricals
        cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
        cat_rev_base = Categorical(
            ["b", "b", "b"], categories=["c", "b", "a"], ordered=True
        )
        cat = Categorical(["a", "b", "c"], ordered=True)
        cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True)

        # comparisons need to take categories ordering into account
        tm.assert_numpy_array_equal(
            cat_rev > cat_rev_base, np.array([True, False, False])
        )
        tm.assert_numpy_array_equal(
            cat_rev < cat_rev_base, np.array([False, False, True])
        )
        tm.assert_numpy_array_equal(cat > cat_base, np.array([False, False, True]))

        # Only categories with same categories can be compared
        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            cat > cat_rev

        cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])

        with pytest.raises(TypeError, match=msg):
            cat_rev > cat_rev_base2

        # Only categories with same ordering information can be compared
        cat_unordered = cat.set_ordered(False)
        assert not (cat > cat).any()

        with pytest.raises(TypeError, match=msg):
            cat > cat_unordered

        # comparison (in both directions) with Series will raise
        s = Series(["b", "b", "b"], dtype=object)
        msg = (
            "Cannot compare a Categorical for op __gt__ with type "
            r"<class 'numpy\.ndarray'>"
        )
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        # comparison with numpy.array will raise in both direction, but only on
        # newer numpy versions
        a = np.array(["b", "b", "b"], dtype=object)
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        # Make sure that unequal comparison take the categories order in
        # account
        cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(cat_rev > "b", expected)

        # check that zero-dim array gets unboxed
        tm.assert_numpy_array_equal(cat_rev > np.array("b"), expected)
class TestCategoricalOps:
    """Comparison and (rejected) arithmetic operations on categorical data."""

    @pytest.mark.parametrize(
        "categories",
        [["a", "b"], [0, 1], [Timestamp("2019"), Timestamp("2020")]],
    )
    def test_not_equal_with_na(self, categories):
        """``!=`` is True where one side is missing (code -1)."""
        # https://github.com/pandas-dev/pandas/issues/32276
        with_missing = Categorical.from_codes([-1, 0], categories=categories)
        complete = Categorical.from_codes([0, 1], categories=categories)

        differs = with_missing != complete

        assert differs.all()

    def test_compare_frame(self):
        """Comparing a Categorical with a DataFrame returns a DataFrame."""
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        data = ["a", "b", 2, "a"]
        cat = Categorical(data)

        df = DataFrame(cat)

        is_equal = cat == df.T
        tm.assert_frame_equal(is_equal, DataFrame([[True, True, True, True]]))

        not_equal = cat[::-1] != df.T
        tm.assert_frame_equal(not_equal, DataFrame([[False, True, True, False]]))

    def test_compare_frame_raises(self, comparison_op):
        """Comparing with the un-transposed frame fails on alignment."""
        # alignment raises unless we transpose
        op = comparison_op
        cat = Categorical(["a", "b", 2, "a"])
        df = DataFrame(cat)
        msg = "Unable to coerce to Series, length must be 1: given 4"
        with pytest.raises(ValueError, match=msg):
            op(cat, df)

    def test_datetime_categorical_comparison(self):
        """Ordered datetime categoricals compare against their own scalars."""
        dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
        first = dt_cat[0]
        tm.assert_numpy_array_equal(dt_cat > first, np.array([False, True, True]))
        tm.assert_numpy_array_equal(first < dt_cat, np.array([False, True, True]))

    def test_reflected_comparison_with_scalars(self):
        """Scalar-on-the-left comparisons give the reflected result."""
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        first = cat[0]
        tm.assert_numpy_array_equal(cat > first, np.array([False, True, True]))
        tm.assert_numpy_array_equal(first < cat, np.array([False, True, True]))

    def test_comparison_with_unknown_scalars(self):
        """Ordering against a non-category scalar raises; ``==``/``!=`` do not."""
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        msg = "Invalid comparison between dtype=category and int"
        with pytest.raises(TypeError, match=msg):
            cat < 4
        with pytest.raises(TypeError, match=msg):
            cat > 4
        with pytest.raises(TypeError, match=msg):
            4 < cat
        with pytest.raises(TypeError, match=msg):
            4 > cat

        tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
        tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True]))

    def test_comparison_with_tuple(self):
        """A tuple category is compared as a scalar, not element-wise."""
        cat = Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object))

        is_foo = cat == "foo"
        expected = np.array([True, False, False, False], dtype=bool)
        tm.assert_numpy_array_equal(is_foo, expected)

        is_pair = cat == (0, 1)
        expected = np.array([False, True, False, True], dtype=bool)
        tm.assert_numpy_array_equal(is_pair, expected)

        not_pair = cat != (0, 1)
        tm.assert_numpy_array_equal(not_pair, ~expected)

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
        self, compare_operators_no_eq_ne
    ):
        """Ordering vs. a scalar matches the same operator on the ndarray."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # BUG: fix ordered categorical comparison with missing values (#26504 )
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False

        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        scalar = 2
        op_name = compare_operators_no_eq_ne
        expected = getattr(np.array(cat), op_name)(scalar)
        actual = getattr(cat, op_name)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
        self, compare_operators_no_eq_ne
    ):
        """Ordering vs. a constant categorical matches the ndarray-vs-scalar result."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False

        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
        op_name = compare_operators_no_eq_ne
        expected = getattr(np.array(cat), op_name)(2)
        actual = getattr(cat, op_name)(other)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.parametrize(
        "data,reverse,base",
        [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
    )
    def test_comparisons(self, data, reverse, base):
        """Series-level comparisons between categoricals, scalars and arrays."""
        cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True))
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base, categories=cat.cat.categories, ordered=True)
        )
        s = Series(base, dtype=object if base == list("bbb") else None)
        a = np.array(base)

        # comparisons need to take categories ordering into account
        tm.assert_series_equal(cat_rev > cat_rev_base, Series([True, False, False]))
        tm.assert_series_equal(cat_rev < cat_rev_base, Series([False, False, True]))
        tm.assert_series_equal(cat > cat_base, Series([False, False, True]))

        scalar = base[1]
        res = cat > scalar
        exp = Series([False, False, True])
        exp2 = cat.values > scalar
        tm.assert_series_equal(res, exp)
        tm.assert_numpy_array_equal(res.values, exp2)

        res_rev = cat_rev > scalar
        exp_rev = Series([True, False, False])
        exp_rev2 = cat_rev.values > scalar
        tm.assert_series_equal(res_rev, exp_rev)
        tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

        # Only categories with same categories can be compared
        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = (
            "Cannot compare a Categorical for op __gt__ with type "
            r"<class 'numpy\.ndarray'>"
        )
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        with pytest.raises(TypeError, match=msg):
            a < cat
        with pytest.raises(TypeError, match=msg):
            a < cat_rev

    @pytest.mark.parametrize(
        "ctor",
        [
            lambda *args, **kwargs: Categorical(*args, **kwargs),
            lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
        ],
    )
    def test_unordered_different_order_equal(self, ctor):
        """Unordered categoricals compare by value whatever the category order."""
        # https://github.com/pandas-dev/pandas/issues/16014
        left = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        right = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        assert (left == right).all()

        left = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        right = ctor(["b", "a"], categories=["b", "a"], ordered=False)
        assert (left != right).all()

        left = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        right = ctor(["b", "b"], categories=["b", "a"], ordered=False)
        assert (left != right).all()

        left = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        right = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        elementwise = left == right
        tm.assert_numpy_array_equal(np.array(elementwise), np.array([True, False]))

    def test_unordered_different_categories_raises(self):
        """Different category sets cannot be compared, even when unordered."""
        c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
        c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False)

        with pytest.raises(TypeError, match=("Categoricals can only be compared")):
            c1 == c2

    def test_compare_different_lengths(self):
        """Category lists of different length cannot be compared."""
        c1 = Categorical([], categories=["a", "b"])
        c2 = Categorical([], categories=["a"])

        msg = "Categoricals can only be compared if 'categories' are the same."
        with pytest.raises(TypeError, match=msg):
            c1 == c2

    def test_compare_unordered_different_order(self):
        """``equals`` is False when the values differ."""
        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
        # 349290078
        a = Categorical(["a"], categories=["a", "b"])
        b = Categorical(["b"], categories=["b", "a"])
        assert not a.equals(b)

    def test_numeric_like_ops(self):
        """Arithmetic and the listed reductions raise ``TypeError``."""
        df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(
            df.value, range(0, 10500, 500), right=False, labels=cat_labels
        )

        # numeric ops should not succeed
        arithmetic = [
            ("__add__", r"\+"),
            ("__sub__", "-"),
            ("__mul__", r"\*"),
            ("__truediv__", "/"),
        ]
        for op, str_rep in arithmetic:
            msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(df, op)(df)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        s = df["value_group"]
        reductions = ["kurt", "skew", "var", "std", "mean", "sum", "median"]
        for op in reductions:
            msg = f"does not support reduction '{op}'"
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(numeric_only=False)

    def test_numeric_like_ops_series(self):
        """``np.sum`` on a categorical Series raises."""
        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(TypeError, match="does not support reduction 'sum'"):
            np.sum(s)

    @pytest.mark.parametrize(
        "op, str_rep",
        [
            ("__add__", r"\+"),
            ("__sub__", "-"),
            ("__mul__", r"\*"),
            ("__truediv__", "/"),
        ],
    )
    def test_numeric_like_ops_series_arith(self, op, str_rep):
        """Arithmetic between a categorical Series and a scalar raises."""
        # numeric ops on a Series
        s = Series(Categorical([1, 2, 3, 4]))
        msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
        with pytest.raises(TypeError, match=msg):
            getattr(s, op)(2)

    def test_numeric_like_ops_series_invalid(self):
        """``np.log`` on a categorical Series raises."""
        # invalid ufunc
        s = Series(Categorical([1, 2, 3, 4]))
        msg = "Object with dtype category cannot perform the numpy op log"
        with pytest.raises(TypeError, match=msg):
            np.log(s)

View File

@ -0,0 +1,111 @@
import pytest
import pandas as pd
from pandas import Categorical
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_replace,value,expected,flip_categories",
    [
        # one-to-one
        (1, 2, [2, 2, 3], False),
        (1, 4, [4, 2, 3], False),
        (4, 1, [1, 2, 3], False),
        (5, 6, [1, 2, 3], False),
        # many-to-one
        ([1], 2, [2, 2, 3], False),
        ([1, 2], 3, [3, 3, 3], False),
        ([1, 2], 4, [4, 4, 3], False),
        ((1, 2, 4), 5, [5, 5, 3], False),
        ((5, 6), 2, [1, 2, 3], False),
        ([1], [2], [2, 2, 3], False),
        ([1, 4], [5, 2], [5, 2, 3], False),
        # GH49404: overlap between to_replace and value
        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
        # GH50872, GH46884: replace with null
        (1, None, [None, 2, 3], False),
        (1, pd.NA, [None, 2, 3], False),
        # check_categorical sorts categories, which crashes on mixed dtypes
        (3, "4", [1, 2, "4"], False),
        ([1, 2, "3"], "5", ["5", "5", 3], True),
    ],
)
@pytest.mark.filterwarnings(
    "ignore:.*with CategoricalDtype is deprecated:FutureWarning"
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
    """Check ``Series.replace`` on a categorical Series, copying and in place.

    Both the returned copy and the Series mutated with ``inplace=True`` are
    compared against ``expected`` while ignoring category order.
    """
    # GH 31720
    ser = pd.Series([1, 2, 3], dtype="category")
    replaced_copy = ser.replace(to_replace, value)

    expected_ser = pd.Series(expected, dtype="category")
    ser.replace(to_replace, value, inplace=True)

    if flip_categories:
        reversed_categories = expected_ser.cat.categories[::-1]
        expected_ser = expected_ser.cat.set_categories(reversed_categories)

    for actual in (replaced_copy, ser):
        tm.assert_series_equal(expected_ser, actual, check_category_order=False)
@pytest.mark.parametrize(
"to_replace, value, result, expected_error_msg",
[
("b", "c", ["a", "c"], "Categorical.categories are different"),
("c", "d", ["a", "b"], None),
# https://github.com/pandas-dev/pandas/issues/33288
("a", "a", ["a", "b"], None),
("b", None, ["a", None], "Categorical.categories length are different"),
],
)
def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if expected_error_msg is not None else None
with tm.assert_produces_warning(warn, match=msg):
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)
ser = pd.Series(cat, copy=False)
with tm.assert_produces_warning(warn, match=msg):
ser.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)
def test_replace_categorical_ea_dtype():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
tm.assert_categorical_equal(result, expected)
def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace(0, 2)
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
tm.assert_series_equal(expected, result, check_category_order=True)

View File

@ -0,0 +1,550 @@
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
Index,
Series,
date_range,
option_context,
period_range,
timedelta_range,
)
class TestCategoricalReprWithFactor:
def test_print(self, using_infer_string):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
if using_infer_string:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, string): [a < b < c]",
]
else:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
class TestCategoricalRepr:
def test_big_print(self):
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
factor = Categorical.from_codes(codes, dtype=dtype)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
def test_empty_print(self):
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected
assert expected == actual
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
factor = Categorical([], [])
expected = "[], Categories (0, object): []"
assert expected == repr(factor)
def test_print_none_width(self):
# GH10087
a = Series(Categorical([1, 2, 3, 4]))
exp = (
"0 1\n1 2\n2 3\n3 4\n"
"dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
)
with option_context("display.width", None):
assert exp == repr(a)
@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Change once infer_string is set to True by default",
)
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
assert repr(c) == expected
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501
assert repr(c) == expected
# unicode option should not affect to Categorical, as it doesn't care
# the repr width
with option_context("display.unicode.east_asian_width", True):
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501
assert repr(c) == expected
def test_categorical_repr(self):
c = Categorical([1, 2, 3])
exp = """[1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1, 2, 3, 4, 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64))
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
assert repr(c) == exp
def test_categorical_repr_ordered(self):
c = Categorical([1, 2, 3], ordered=True)
exp = """[1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64), ordered=True)
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
assert repr(c) == exp
def test_categorical_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
""
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
)
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
"2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
"2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
def test_categorical_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_int_with_nan(self):
c = Categorical([1, 2, np.nan])
c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
assert repr(c) == c_exp
s = Series([1, 2, np.nan], dtype="object").astype("category")
s_exp = """0 1\n1 2\n2 NaN
dtype: category
Categories (2, int64): [1, 2]"""
assert repr(s) == s_exp
def test_categorical_repr_period(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa: E501
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa: E501
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_index_repr(self):
idx = CategoricalIndex(Categorical([1, 2, 3]))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501
assert repr(idx) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64)))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_ordered(self):
i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
'2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
'2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_period(self):
# test all length
idx = period_range("2011-01-01 09:00", freq="h", periods=1)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=2)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=3)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx)))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
'2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
'2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected

View File

@ -0,0 +1,128 @@
import numpy as np
import pytest
from pandas import (
Categorical,
Index,
)
import pandas._testing as tm
class TestCategoricalSort:
    """Tests for ``Categorical.argsort`` and ``Categorical.sort_values``."""

    def test_argsort(self):
        """``argsort`` returns sorting positions, ascending and descending."""
        c = Categorical([5, 3, 1, 4, 2], ordered=True)

        expected = np.array([2, 4, 1, 3, 0])
        tm.assert_numpy_array_equal(
            c.argsort(ascending=True), expected, check_dtype=False
        )

        expected = expected[::-1]
        tm.assert_numpy_array_equal(
            c.argsort(ascending=False), expected, check_dtype=False
        )

    def test_numpy_argsort(self):
        """``np.argsort`` works; ``axis`` and ``order`` arguments are rejected."""
        c = Categorical([5, 3, 1, 4, 2], ordered=True)

        expected = np.array([2, 4, 1, 3, 0])
        tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)

        tm.assert_numpy_array_equal(
            np.argsort(c, kind="mergesort"), expected, check_dtype=False
        )

        msg = "the 'axis' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.argsort(c, axis=0)

        msg = "the 'order' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.argsort(c, order="C")

    def test_sort_values(self):
        """``sort_values`` orders values and leaves the categories unchanged."""
        # unordered cats are sortable
        cat = Categorical(["a", "b", "b", "a"], ordered=False)
        cat.sort_values()

        cat = Categorical(["a", "c", "b", "d"], ordered=True)

        # sort_values
        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        cat = Categorical(
            ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True
        )
        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        res = cat.sort_values(ascending=False)
        exp = np.array(["d", "c", "b", "a"], dtype=object)
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, cat.categories)

        # sort (inplace order)
        cat1 = cat.copy()
        orig_codes = cat1._codes
        cat1.sort_values(inplace=True)
        # the in-place sort must reuse the existing codes array
        assert cat1._codes is orig_codes
        exp = np.array(["a", "b", "c", "d"], dtype=object)
        tm.assert_numpy_array_equal(cat1.__array__(), exp)
        # Check the object that was just sorted in place; the original checked
        # the stale ``res`` from the descending sort above instead.
        tm.assert_index_equal(cat1.categories, cat.categories)

        # reverse
        cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
        res = cat.sort_values(ascending=False)
        exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

    def test_sort_values_na_position(self):
        """NaN placement follows ``na_position`` for both sort directions."""
        # see gh-12882
        cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
        exp_categories = Index([2, 5])

        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
        res = cat.sort_values()  # default arguments
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
        res = cat.sort_values(ascending=True, na_position="first")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
        res = cat.sort_values(ascending=False, na_position="first")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
        res = cat.sort_values(ascending=True, na_position="last")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
        res = cat.sort_values(ascending=False, na_position="last")
        tm.assert_numpy_array_equal(res.__array__(), exp)
        tm.assert_index_equal(res.categories, exp_categories)

        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
        res = cat.sort_values(ascending=False, na_position="last")
        exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
        res = cat.sort_values(ascending=False, na_position="first")
        exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(res.__array__(), exp_val)
        tm.assert_index_equal(res.categories, exp_categories)

View File

@ -0,0 +1,26 @@
from pandas import Categorical
import pandas._testing as tm
class SubclassedCategorical(Categorical):
    """Trivial Categorical subclass with no overrides, used by the tests below."""
class TestCategoricalSubclassing:
    """Construction paths and ``map`` must preserve the Categorical subclass."""

    def test_constructor(self):
        """The plain constructor returns an instance of the subclass."""
        values = ["a", "b", "c"]
        sc = SubclassedCategorical(values)
        assert isinstance(sc, SubclassedCategorical)
        tm.assert_categorical_equal(sc, Categorical(values))

    def test_from_codes(self):
        """``from_codes`` called on the subclass returns the subclass."""
        codes, categories = [1, 0, 2], ["a", "b", "c"]
        sc = SubclassedCategorical.from_codes(codes, categories)
        assert isinstance(sc, SubclassedCategorical)
        tm.assert_categorical_equal(sc, Categorical.from_codes(codes, categories))

    def test_map(self):
        """``map`` on a subclass instance returns the subclass."""
        sc = SubclassedCategorical(["a", "b", "c"])
        mapped = sc.map(lambda x: x.upper(), na_action=None)
        assert isinstance(mapped, SubclassedCategorical)
        tm.assert_categorical_equal(mapped, Categorical(["A", "B", "C"]))

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
from pandas import Categorical
import pandas._testing as tm
@pytest.fixture(params=[True, False])
def allow_fill(request):
    """Yield each boolean value of the ``allow_fill`` argument of Categorical.take."""
    fill_flag = request.param
    return fill_flag
class TestTake:
    """Tests for ``Categorical.take`` with and without ``allow_fill``."""

    # https://github.com/pandas-dev/pandas/issues/20664

    def test_take_default_allow_fill(self):
        """With default arguments ``take([0, -1])`` emits no warning."""
        cat = Categorical(["a", "b"])
        with tm.assert_produces_warning(None):
            taken = cat.take([0, -1])

        assert taken.equals(cat)

    def test_take_positive_no_warning(self):
        """Non-negative indices emit no warning."""
        cat = Categorical(["a", "b"])
        with tm.assert_produces_warning(None):
            cat.take([0, 0])

    def test_take_bounds(self, allow_fill):
        """Out-of-range indices raise IndexError; the message depends on allow_fill."""
        # https://github.com/pandas-dev/pandas/issues/20664
        cat = Categorical(["a", "b", "a"])
        msg = (
            "indices are out-of-bounds"
            if allow_fill
            else "index 4 is out of bounds for( axis 0 with)? size 3"
        )
        with pytest.raises(IndexError, match=msg):
            cat.take([4, 5], allow_fill=allow_fill)

    def test_take_empty(self, allow_fill):
        """Taking from an empty Categorical raises IndexError."""
        # https://github.com/pandas-dev/pandas/issues/20664
        cat = Categorical([], categories=["a", "b"])
        msg = (
            "indices are out-of-bounds"
            if allow_fill
            else "cannot do a non-empty take from an empty axes"
        )
        with pytest.raises(IndexError, match=msg):
            cat.take([0], allow_fill=allow_fill)

    def test_positional_take(self, ordered):
        """A positional take keeps the categories and the ``ordered`` flag."""
        cat = Categorical(["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered)
        taken = cat.take([0, 1, 2], allow_fill=False)

        expected = Categorical(
            ["a", "a", "b"], categories=cat.categories, ordered=ordered
        )
        tm.assert_categorical_equal(taken, expected)

    def test_positional_take_unobserved(self, ordered):
        """Categories that have no values ("c" here) are kept in the result."""
        cat = Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered)
        taken = cat.take([1, 0], allow_fill=False)

        expected = Categorical(["b", "a"], categories=cat.categories, ordered=ordered)
        tm.assert_categorical_equal(taken, expected)

    def test_take_allow_fill(self):
        """With ``allow_fill=True`` an index of -1 produces a missing value."""
        # https://github.com/pandas-dev/pandas/issues/23296
        cat = Categorical(["a", "a", "b"])
        taken = cat.take([0, -1, -1], allow_fill=True)

        expected = Categorical(["a", np.nan, np.nan], categories=["a", "b"])
        tm.assert_categorical_equal(taken, expected)

    def test_take_fill_with_negative_one(self):
        """``fill_value=-1`` is used as a value when -1 is one of the categories."""
        # -1 was a category
        cat = Categorical([-1, 0, 1])
        taken = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)

        expected = Categorical([-1, -1, 0], categories=[-1, 0, 1])
        tm.assert_categorical_equal(taken, expected)

    def test_take_fill_value(self):
        """An existing category can be passed as ``fill_value``."""
        # https://github.com/pandas-dev/pandas/issues/23296
        cat = Categorical(["a", "b", "c"])
        taken = cat.take([0, 1, -1], fill_value="a", allow_fill=True)

        expected = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(taken, expected)

    def test_take_fill_value_new_raises(self):
        """A ``fill_value`` that is not a category raises TypeError."""
        # https://github.com/pandas-dev/pandas/issues/23296
        cat = Categorical(["a", "b", "c"])
        xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
        with pytest.raises(TypeError, match=xpr):
            cat.take([0, 1, -1], fill_value="d", allow_fill=True)

View File

@ -0,0 +1,19 @@
import pytest
import pandas._testing as tm
class TestCategoricalWarnings:
    """Warning-related checks for ``Categorical``."""

    def test_tab_complete_warning(self, ip):
        """Tab-completing on a Categorical in IPython must not fail on warnings."""
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        ip.run_cell("import pandas as pd; c = pd.Categorical([])")

        # GH 31324 newer jedi version raises Deprecation warning;
        #  appears resolved 2021-02-02
        no_warning = tm.assert_produces_warning(None, raise_on_extra_warnings=False)
        with no_warning, provisionalcompleter("ignore"):
            list(ip.Completer.completions("c.", 1))

View File

@ -0,0 +1,284 @@
import numpy as np
import pytest
from pandas._libs import iNaT
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestDatetimeArrayConstructor:
def test_from_sequence_invalid_type(self):
mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
DatetimeArray._from_sequence(mi, dtype="M8[ns]")
def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
DatetimeArray(arr[[0]].squeeze())
def test_freq_validation(self):
# GH#24623 check that invalid instances cannot be created with the
# public constructor
arr = np.arange(5, dtype=np.int64) * 3600 * 10**9
msg = (
"Inferred frequency h from passed values does not "
"conform to passed frequency W-SUN"
)
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, freq="W")
@pytest.mark.parametrize(
"meth",
[
DatetimeArray._from_sequence,
pd.to_datetime,
pd.DatetimeIndex,
],
)
def test_mixing_naive_tzaware_raises(self, meth):
# GH#24569
arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")])
msg = (
"Cannot mix tz-aware with tz-naive values|"
"Tz-aware datetime.datetime cannot be converted "
"to datetime64 unless utc=True"
)
for obj in [arr, arr[::-1]]:
# check that we raise regardless of whether naive is found
# before aware or vice-versa
with pytest.raises(ValueError, match=msg):
meth(obj)
def test_from_pandas_array(self):
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer")
expected = pd.date_range("1970-01-01", periods=5, freq="h")._data
tm.assert_datetime_array_equal(result, expected)
def test_mismatched_timezone_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
arr = DatetimeArray(
np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"),
dtype=DatetimeTZDtype(tz="US/Central"),
)
dtype = DatetimeTZDtype(tz="US/Eastern")
msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr, dtype=dtype)
# also with mismatched tzawareness
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr, dtype=np.dtype("M8[ns]"))
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr.tz_localize(None), dtype=arr.dtype)
def test_non_array_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="list"):
DatetimeArray([1, 2, 3])
def test_bool_dtype_raises(self):
arr = np.array([1, 2, 3], dtype="bool")
depr_msg = "DatetimeArray.__init__ is deprecated"
msg = "Unexpected value for 'dtype': 'bool'. Must be"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr)
msg = r"dtype bool cannot be converted to datetime64\[ns\]"
with pytest.raises(TypeError, match=msg):
DatetimeArray._from_sequence(arr, dtype="M8[ns]")
with pytest.raises(TypeError, match=msg):
pd.DatetimeIndex(arr)
with pytest.raises(TypeError, match=msg):
pd.to_datetime(arr)
def test_incorrect_dtype_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]")
def test_mismatched_values_dtype_units(self):
arr = np.array([1, 2, 3], dtype="M8[s]")
dtype = np.dtype("M8[ns]")
msg = "Values resolution does not match dtype."
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, dtype=dtype)
dtype2 = DatetimeTZDtype(tz="UTC", unit="ns")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, dtype=dtype2)
def test_freq_infer_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Frequency inference"):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer")
def test_copy(self):
data = np.array([1, 2, 3], dtype="M8[ns]")
arr = DatetimeArray._from_sequence(data, copy=False)
assert arr._ndarray is data
arr = DatetimeArray._from_sequence(data, copy=True)
assert arr._ndarray is not data
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_numpy_datetime_unit(self, unit):
data = np.array([1, 2, 3], dtype=f"M8[{unit}]")
arr = DatetimeArray._from_sequence(data)
assert arr.unit == unit
assert arr[0].unit == unit
class TestSequenceToDT64NS:
    """``DatetimeArray._from_sequence`` with tz-aware dtypes and 2D input."""

    def test_tz_dtype_mismatch_raises(self):
        """Passing a different tz dtype for tz-aware data raises TypeError."""
        central = DatetimeArray._from_sequence(
            ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
        )
        with pytest.raises(TypeError, match="data is already tz-aware"):
            DatetimeArray._from_sequence(central, dtype=DatetimeTZDtype(tz="UTC"))

    def test_tz_dtype_matches(self):
        """Passing the data's own tz dtype gives back an equal array."""
        central_dtype = DatetimeTZDtype(tz="US/Central")
        original = DatetimeArray._from_sequence(["2000"], dtype=central_dtype)
        roundtrip = DatetimeArray._from_sequence(original, dtype=central_dtype)
        tm.assert_equal(original, roundtrip)

    @pytest.mark.parametrize("order", ["F", "C"])
    def test_2d(self, order):
        """A 2D object array converts like its ravelled form reshaped back."""
        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
        grid = np.array(dti, dtype=object).reshape(3, 2)
        if order == "F":
            grid = grid.T

        result = DatetimeArray._from_sequence(grid, dtype=dti.dtype)

        flat = DatetimeArray._from_sequence(grid.ravel(), dtype=dti.dtype)
        expected = flat.reshape(grid.shape)
        tm.assert_datetime_array_equal(result, expected)
# ----------------------------------------------------------------------------
# Arrow interaction

# Zero, an ordinary value, a null, the iNaT constant, 2**63 - 1 and
# -(2**63) + 1.
EXTREME_VALUES = [
    0,
    123456789,
    None,
    iNaT,
    2**63 - 1,
    -(2**63) + 1,
]

# Multiples of 10**9 (plus a null); used below for the fine-to-coarse unit cases.
FINE_TO_COARSE_SAFE = [
    123_000_000_000,
    None,
    -123_000_000_000,
]

# Small magnitudes (plus a null); used below for the coarse-to-fine unit cases.
COARSE_TO_FINE_SAFE = [
    123,
    None,
    -123,
]
@pytest.mark.parametrize(
    ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"),
    [
        ("s", "s", "UTC", "UTC", EXTREME_VALUES),
        ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES),
        ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES),
        ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES),
        ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE),
        ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE),
        ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE),
        ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE),
    ],
)
def test_from_arrow_with_different_units_and_timezones_with(
    pa_unit, pd_unit, pa_tz, pd_tz, data
):
    """Convert pyarrow timestamps whose unit/tz may differ from the pandas dtype.

    The expected array is built from the same integers as ``M8[<pa_unit>, UTC]``
    and then cast to the target dtype. Both a plain and a chunked Arrow array
    are checked.
    """
    pa = pytest.importorskip("pyarrow")

    arrow_arr = pa.array(data, type=pa.timestamp(pa_unit, tz=pa_tz))
    target = DatetimeTZDtype(unit=pd_unit, tz=pd_tz)

    result = target.__from_arrow__(arrow_arr)
    as_utc = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]")
    expected = as_utc.astype(target, copy=False)
    tm.assert_extension_array_equal(result, expected)

    chunked = pa.chunked_array([arrow_arr])
    result = target.__from_arrow__(chunked)
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    ("unit", "tz"),
    [
        ("s", "UTC"),
        ("ms", "Europe/Berlin"),
        ("us", "US/Eastern"),
        ("ns", "Asia/Kolkata"),
        ("ns", "UTC"),
    ],
)
def test_from_arrow_from_empty(unit, tz):
    """An empty Arrow array converts to an empty tz-aware DatetimeArray."""
    pa = pytest.importorskip("pyarrow")

    empty = []
    arrow_arr = pa.array(empty)
    target = DatetimeTZDtype(unit=unit, tz=tz)

    result = target.__from_arrow__(arrow_arr)
    naive = DatetimeArray._from_sequence(np.array(empty, dtype=f"datetime64[{unit}]"))
    expected = naive.tz_localize(tz=tz)
    tm.assert_extension_array_equal(result, expected)

    chunked = pa.chunked_array([arrow_arr])
    result = target.__from_arrow__(chunked)
    tm.assert_extension_array_equal(result, expected)
def test_from_arrow_from_integers():
    """An Arrow array built from plain integers converts to a UTC datetime array."""
    pa = pytest.importorskip("pyarrow")

    ints = [0, 123456789, None, 2**63 - 1, iNaT, -123456789]
    arrow_arr = pa.array(ints)
    target = DatetimeTZDtype(unit="ns", tz="UTC")

    result = target.__from_arrow__(arrow_arr)
    naive = DatetimeArray._from_sequence(np.array(ints, dtype="datetime64[ns]"))
    expected = naive.tz_localize("UTC")
    tm.assert_extension_array_equal(result, expected)

    chunked = pa.chunked_array([arrow_arr])
    result = target.__from_arrow__(chunked)
    tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,44 @@
import pytest
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestAccumulator:
    """Cumulative operations (``_accumulate``) on ``DatetimeArray``."""

    def test_accumulators_freq(self):
        """cummin and cummax results equal arrays built without ``_with_freq``."""
        # GH#50297
        days = ["2000-01-01", "2000-01-02", "2000-01-03"]
        dta = DatetimeArray._from_sequence(days, dtype="M8[ns]")._with_freq("infer")

        running_min = dta._accumulate("cummin")
        expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]")
        tm.assert_datetime_array_equal(running_min, expected)

        running_max = dta._accumulate("cummax")
        expected = DatetimeArray._from_sequence(days, dtype="M8[ns]")
        tm.assert_datetime_array_equal(running_max, expected)

    @pytest.mark.parametrize("func", ["cumsum", "cumprod"])
    def test_accumulators_disallowed(self, func):
        """cumsum and cumprod raise TypeError."""
        # GH#50297
        days = ["2000-01-01", "2000-01-02"]
        dta = DatetimeArray._from_sequence(days, dtype="M8[ns]")._with_freq("infer")
        with pytest.raises(TypeError, match=f"Accumulation {func}"):
            dta._accumulate(func)

View File

@ -0,0 +1,183 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import NaT
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestReductions:
    """min / max / median / mean reductions on ``DatetimeArray``."""

    @pytest.fixture(params=["s", "ms", "us", "ns"])
    def unit(self, request):
        """Parametrized fixture returning a datetime resolution string."""
        return request.param

    @pytest.fixture
    def arr1d(self, tz_naive_fixture):
        """Fixture returning a six-element DatetimeArray (one NaT) per timezone."""
        tz = tz_naive_fixture
        if tz is not None:
            dtype = DatetimeTZDtype(tz=tz)
        else:
            dtype = np.dtype("M8[ns]")

        stamps = [
            "2000-01-03",
            "2000-01-03",
            "NaT",
            "2000-01-02",
            "2000-01-05",
            "2000-01-04",
        ]
        return DatetimeArray._from_sequence(stamps, dtype=dtype)

    def test_min_max(self, arr1d, unit):
        """min/max skip NaT by default, keep the unit, and give NaT if skipna=False."""
        arr = arr1d.as_unit(unit)
        tz = arr.tz

        for reducer, day in ((arr.min, "2000-01-02"), (arr.max, "2000-01-05")):
            result = reducer()
            expected = pd.Timestamp(day, tz=tz).as_unit(unit)
            assert result == expected
            assert result.unit == expected.unit

        assert arr.min(skipna=False) is NaT
        assert arr.max(skipna=False) is NaT

    @pytest.mark.parametrize("tz", [None, "US/Central"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_empty(self, skipna, tz):
        """min and max of an empty array are NaT."""
        dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
        empty = DatetimeArray._from_sequence([], dtype=dtype)

        assert empty.min(skipna=skipna) is NaT
        assert empty.max(skipna=skipna) is NaT

    @pytest.mark.parametrize("tz", [None, "US/Central"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_median_empty(self, skipna, tz):
        """Median of an empty array, both 1D and reshaped to (0, 3)."""
        dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
        empty = DatetimeArray._from_sequence([], dtype=dtype)
        assert empty.median(skipna=skipna) is NaT

        empty2d = empty.reshape(0, 3)
        cls = type(empty2d)

        result = empty2d.median(axis=0, skipna=skipna)
        expected = cls._from_sequence([NaT, NaT, NaT], dtype=empty2d.dtype)
        tm.assert_equal(result, expected)

        result = empty2d.median(axis=1, skipna=skipna)
        expected = cls._from_sequence([], dtype=empty2d.dtype)
        tm.assert_equal(result, expected)

    def test_median(self, arr1d):
        """Median of ``arr1d`` equals its first element; NaT when NaT is not skipped."""
        first = arr1d[0]

        assert arr1d.median() == first
        assert arr1d.median(skipna=False) is NaT
        assert arr1d.dropna().median(skipna=False) == first
        assert arr1d.median(axis=0) == first

    def test_median_axis(self, arr1d):
        """``axis=0`` matches the default; ``axis=1`` on a 1D array raises."""
        assert arr1d.median(axis=0) == arr1d.median()
        assert arr1d.median(axis=0, skipna=False) is NaT

        msg = r"abs\(axis\) must be less than ndim"
        with pytest.raises(ValueError, match=msg):
            arr1d.median(axis=1)

    @pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning")
    def test_median_2d(self, arr1d):
        """Median of ``arr1d`` reshaped to a single row, along each axis."""
        row = arr1d.reshape(1, -1)
        cls = type(row)

        # axis = None
        assert row.median() == arr1d.median()
        assert row.median(skipna=False) is NaT

        # axis = 0
        result = row.median(axis=0)
        tm.assert_equal(result, arr1d)

        # Since column 3 is all-NaT, we get NaT there with or without skipna
        result = row.median(axis=0, skipna=False)
        tm.assert_equal(result, arr1d)

        # axis = 1
        result = row.median(axis=1)
        expected = cls._from_sequence([arr1d.median()], dtype=row.dtype)
        tm.assert_equal(result, expected)

        result = row.median(axis=1, skipna=False)
        expected = cls._from_sequence([NaT], dtype=row.dtype)
        tm.assert_equal(result, expected)

    def test_mean(self, arr1d):
        """Mean of ``arr1d`` against a manually verified value."""
        # manually verified result
        expected = arr1d[0] + 0.4 * pd.Timedelta(days=1)

        assert arr1d.mean() == expected
        assert arr1d.mean(skipna=False) is NaT
        assert arr1d.dropna().mean(skipna=False) == expected
        assert arr1d.mean(axis=0) == expected

    def test_mean_2d(self):
        """Mean of a (3, 2) tz-aware array along axis 0, axis 1 and axis=None."""
        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
        dta = dti._data.reshape(3, 2)

        by_column = dta.mean(axis=0)
        tm.assert_datetime_array_equal(by_column, dta[1])

        by_row = dta.mean(axis=1)
        expected = dta[:, 0] + pd.Timedelta(hours=12)
        tm.assert_datetime_array_equal(by_row, expected)

        overall = dta.mean(axis=None)
        assert overall == dti.mean()

    @pytest.mark.parametrize("skipna", [True, False])
    def test_mean_empty(self, arr1d, skipna):
        """Mean of an empty slice, both 1D and reshaped to (0, 3)."""
        empty = arr1d[:0]
        assert empty.mean(skipna=skipna) is NaT

        empty2d = empty.reshape(0, 3)

        result = empty2d.mean(axis=0, skipna=skipna)
        expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=empty.dtype)
        tm.assert_datetime_array_equal(result, expected)

        result = empty2d.mean(axis=1, skipna=skipna)
        # i.e. 1D, empty
        tm.assert_datetime_array_equal(result, empty)

        result = empty2d.mean(axis=None, skipna=skipna)
        assert result is NaT

View File

@ -0,0 +1,48 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
@pytest.fixture(params=[Float32Dtype, Float64Dtype])
def dtype(request):
    """Parametrized fixture returning an instance of each float 'dtype' class"""
    dtype_cls = request.param
    return dtype_cls()
@pytest.fixture
def data(dtype):
    """Fixture returning a float 'data' array with two NA entries for 'dtype'"""
    values = [
        *np.arange(0.1, 0.9, 0.1),
        pd.NA,
        *np.arange(1, 9.8, 0.1),
        pd.NA,
        9.9,
        10.0,
    ]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """
    Fixture returning a two-element array, built from ``[np.nan, 0.1]``, for the
    parametrized float 'dtype'.
    """
    values = [np.nan, 0.1]
    return pd.array(values, dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture returning the 'data' or the 'data_missing' array.

    Lets one test run against both the long array and the short array that
    starts with a missing value.
    """
    which = request.param
    if which == "data":
        return data
    if which == "data_missing":
        return data_missing
    return None

View File

@ -0,0 +1,244 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
    "opname, exp",
    [
        ("add", [1.1, 2.2, None, None, 5.5]),
        ("mul", [0.1, 0.4, None, None, 2.5]),
        ("sub", [0.9, 1.8, None, None, 4.5]),
        ("truediv", [10.0, 10.0, None, None, 10.0]),
        ("floordiv", [9.0, 9.0, None, None, 10.0]),
        ("mod", [0.1, 0.2, None, None, 0.0]),
    ],
    ids=["add", "mul", "sub", "div", "floordiv", "mod"],
)
def test_array_op(dtype, opname, exp):
    """Apply a binary operator to two float arrays with missing values.

    The expected result is missing wherever either operand is missing.
    """
    left = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype)
    right = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype)

    binary_op = getattr(operator, opname)

    result = binary_op(left, right)
    tm.assert_extension_array_equal(result, pd.array(exp, dtype=dtype))
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative):
    """Divide ``[0, 1, -1, NA]`` by a zero; the expected signs flip for ``-0.0``."""
    # TODO pending NA/NaN discussion
    # https://github.com/pandas-dev/pandas/issues/32265/
    numerator = pd.array([0, 1, -1, None], dtype=dtype)
    result = numerator / zero

    expected_values = np.array(
        [np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype
    )
    expected_mask = np.array([False, False, False, True])
    expected = FloatingArray(expected_values, expected_mask)
    if negative:
        expected *= -1
    tm.assert_extension_array_equal(result, expected)
def test_pow_scalar(dtype):
    """Check ``array ** scalar`` and ``scalar ** array`` for 0, 1, pd.NA and np.nan."""

    def check(result, values):
        # Compare against a masked array built from a plain list.
        tm.assert_extension_array_equal(result, pd.array(values, dtype=dtype))

    base = pd.array([-1, 0, 1, None, 2], dtype=dtype)

    check(base**0, [1, 1, 1, 1, 1])
    check(base**1, [-1, 0, 1, None, 2])
    check(base**pd.NA, [None, None, 1, None, None])

    result = base**np.nan
    # TODO np.nan should be converted to pd.NA / missing before operation?
    expected = FloatingArray(
        np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
        mask=base._mask,
    )
    tm.assert_extension_array_equal(result, expected)

    # reversed
    exponent = base[1:]  # Can't raise integers to negative powers.

    check(0**exponent, [1, 0, None, 0])
    check(1**exponent, [1, 1, 1, 1])
    check(pd.NA**exponent, [1, None, None, None])

    result = np.nan**exponent
    expected = FloatingArray(
        np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype),
        mask=exponent._mask,
    )
    tm.assert_extension_array_equal(result, expected)
def test_pow_array(dtype):
    """Elementwise power over every pairing of 0, 1 and NA."""
    base = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype)
    exponent = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype)

    result = base**exponent

    expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype)
    tm.assert_extension_array_equal(result, expected)
def test_rpow_one_to_na():
    """``1.0 ** NA`` is expected to be 1.0, while ``2.0 ** NA`` stays missing."""
    # https://github.com/pandas-dev/pandas/issues/22022
    # https://github.com/pandas-dev/pandas/issues/29997
    all_na = pd.array([np.nan, np.nan], dtype="Float64")
    bases = np.array([1.0, 2.0])

    result = bases**all_na

    expected = pd.array([1.0, np.nan], dtype="Float64")
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("other", [0, 0.5])
def test_arith_zero_dim_ndarray(other):
    """Adding a zero-dimensional ndarray gives the same result as adding the scalar."""
    arr = pd.array([1, None, 2], dtype="Float64")

    with_ndarray = arr + np.array(other)
    with_scalar = arr + other

    tm.assert_equal(with_ndarray, with_scalar)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
    """Arithmetic between a float Series and non-numeric operands must raise.

    The operands tried are a string, a Timestamp, a string Series and a
    datetime Series. The accepted exception types are wider when
    ``using_infer_string`` is set.
    """
    ser = pd.Series(data)
    bound_op = getattr(ser, all_arithmetic_operators)

    if using_infer_string:
        import pyarrow as pa

        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
    else:
        errs = TypeError

    # invalid scalars
    scalar_patterns = [
        r"can only perform ops with numeric values",
        r"FloatingArray cannot perform the operation mod",
        "unsupported operand type",
        "not all arguments converted during string formatting",
        "can't multiply sequence by non-int of type 'float'",
        "ufunc 'subtract' cannot use operands with types dtype",
        r"can only concatenate str \(not \"float\"\) to str",
        "ufunc '.*' not supported for the input types, and the inputs could not",
        "ufunc '.*' did not contain a loop with signature matching types",
        "Concatenation operation is not implemented for NumPy arrays",
        "has no kernel",
        "not implemented",
    ]
    msg = "|".join(scalar_patterns)
    with pytest.raises(errs, match=msg):
        bound_op("foo")
    with pytest.raises(errs, match=msg):
        bound_op(pd.Timestamp("20180101"))

    # invalid array-likes
    with pytest.raises(errs, match=msg):
        bound_op(pd.Series("foo", index=ser.index))

    datetime_patterns = [
        "can only perform ops with numeric values",
        "cannot perform .* with this index type: DatetimeArray",
        "Addition/subtraction of integers and integer-arrays "
        "with DatetimeArray is no longer supported. *",
        "unsupported operand type",
        "not all arguments converted during string formatting",
        "can't multiply sequence by non-int of type 'float'",
        "ufunc 'subtract' cannot use operands with types dtype",
        (
            "ufunc 'add' cannot use operands with types "
            rf"dtype\('{tm.ENDIAN}M8\[ns\]'\)"
        ),
        r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
        "cannot subtract DatetimeArray from ndarray",
        "has no kernel",
        "not implemented",
    ]
    msg = "|".join(datetime_patterns)
    with pytest.raises(errs, match=msg):
        bound_op(pd.Series(pd.date_range("20180101", periods=len(ser))))
# Various
# -----------------------------------------------------------------------------
def test_cross_type_arithmetic():
    """Arithmetic across Float64, Float32 and numpy float64 columns of one frame."""
    frame = pd.DataFrame(
        {
            "A": pd.array([1, 2, np.nan], dtype="Float64"),
            "B": pd.array([1, np.nan, 3], dtype="Float32"),
            "C": np.array([1, 2, 3], dtype="float64"),
        }
    )

    # masked + numpy float
    total = frame.A + frame.C
    tm.assert_series_equal(total, pd.Series([2, 4, np.nan], dtype="Float64"))

    # comparison after arithmetic
    is_twelve = (frame.A + frame.C) * 3 == 12
    tm.assert_series_equal(
        is_twelve, pd.Series([False, True, None], dtype="boolean")
    )

    # Float64 + Float32
    mixed = frame.A + frame.B
    tm.assert_series_equal(mixed, pd.Series([2, np.nan, np.nan], dtype="Float64"))
@pytest.mark.parametrize(
    "source, neg_target, abs_target",
    [
        ([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]),
        ([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]),
        ([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]),
    ],
)
def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target):
    """Unary ``-``, ``+`` and ``abs`` on a masked float array.

    Unary plus must return an equal array that does not share memory with
    its input.
    """
    # GH38794
    dtype = float_ea_dtype
    arr = pd.array(source, dtype=dtype)

    negated = -arr
    positive = +arr
    absolute = abs(arr)

    expected_neg = pd.array(neg_target, dtype=dtype)
    expected_abs = pd.array(abs_target, dtype=dtype)

    tm.assert_extension_array_equal(negated, expected_neg)
    tm.assert_extension_array_equal(positive, arr)
    assert not tm.shares_memory(positive, arr)
    tm.assert_extension_array_equal(absolute, expected_abs)
def test_bitwise(dtype):
    """``|``, ``&`` and ``^`` between float arrays raise TypeError."""
    left = pd.array([1, None, 3, 4], dtype=dtype)
    right = pd.array([None, 3, 5, 4], dtype=dtype)

    for bitwise_op in (operator.or_, operator.and_, operator.xor):
        with pytest.raises(TypeError, match="unsupported operand type"):
            bitwise_op(left, right)

View File

@ -0,0 +1,128 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
    """Cast a Float64 array to numpy int64 / bool / float64 dtypes."""
    # with missing values
    with_na = pd.array([0.1, 0.2, None], dtype="Float64")

    failing_casts = (
        ("int64", "cannot convert NA to integer"),
        ("bool", "cannot convert float NaN to bool"),
    )
    for target, msg in failing_casts:
        with pytest.raises(ValueError, match=msg):
            with_na.astype(target)

    as_float = with_na.astype("float64")
    tm.assert_numpy_array_equal(
        as_float, np.array([0.1, 0.2, np.nan], dtype="float64")
    )

    # no missing values
    complete = pd.array([0.0, 1.0, 0.5], dtype="Float64")

    as_int = complete.astype("int64")
    tm.assert_numpy_array_equal(as_int, np.array([0, 1, 0], dtype="int64"))

    as_bool = complete.astype("bool")
    tm.assert_numpy_array_equal(
        as_bool, np.array([False, True, True], dtype="bool")
    )
def test_astype_to_floating_array():
    """Cast Float64 to Float64 (by name and by dtype object) and to Float32."""
    # astype to FloatingArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")

    for same_dtype in ("Float64", pd.Float64Dtype()):
        tm.assert_extension_array_equal(arr.astype(same_dtype), arr)

    narrowed = arr.astype("Float32")
    expected = pd.array([0.0, 1.0, None], dtype="Float32")
    tm.assert_extension_array_equal(narrowed, expected)
def test_astype_to_boolean_array():
    """Cast Float64 to the nullable boolean dtype, by name and by dtype object."""
    # astype to BooleanArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")
    expected = pd.array([False, True, None], dtype="boolean")

    by_name = arr.astype("boolean")
    tm.assert_extension_array_equal(by_name, expected)

    by_object = arr.astype(pd.BooleanDtype())
    tm.assert_extension_array_equal(by_object, expected)
def test_astype_to_integer_array():
    """Cast ``[0.0, 1.5, NA]`` to Int64; the expected result is ``[0, 1, NA]``."""
    # astype to IntegerArray
    floats = pd.array([0.0, 1.5, None], dtype="Float64")

    as_int = floats.astype("Int64")

    tm.assert_extension_array_equal(as_int, pd.array([0, 1, None], dtype="Int64"))
def test_astype_str():
    """Casting to ``str`` or ``"str"`` gives a U32 ndarray with ``"<NA>"`` for NA."""
    arr = pd.array([0.1, 0.2, None], dtype="Float64")
    expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")

    for target in (str, "str"):
        tm.assert_numpy_array_equal(arr.astype(target), expected)
def test_astype_copy():
    """Check the ``copy`` argument of ``astype`` for same and different dtypes."""
    source = pd.array([0.1, 0.2, None], dtype="Float64")
    pristine = pd.array([0.1, 0.2, None], dtype="Float64")

    # copy=True -> ensure both data and mask are actual copies
    copied = source.astype("Float64", copy=True)
    assert copied is not source
    assert not tm.shares_memory(copied, source)
    copied[0] = 10
    tm.assert_extension_array_equal(source, pristine)
    copied[0] = pd.NA
    tm.assert_extension_array_equal(source, pristine)

    # copy=False
    alias = source.astype("Float64", copy=False)
    assert alias is source
    assert np.shares_memory(alias._data, source._data)
    assert np.shares_memory(alias._mask, source._mask)
    alias[0] = 10
    assert source[0] == 10
    alias[0] = pd.NA
    assert source[0] is pd.NA

    # astype to different dtype -> always needs a copy -> even with copy=False
    # we need to ensure that also the mask is actually copied
    source = pd.array([0.1, 0.2, None], dtype="Float64")
    pristine = pd.array([0.1, 0.2, None], dtype="Float64")

    converted = source.astype("Float32", copy=False)
    assert not tm.shares_memory(converted, source)
    converted[0] = 10
    tm.assert_extension_array_equal(source, pristine)
    converted[0] = pd.NA
    tm.assert_extension_array_equal(source, pristine)
def test_astype_object(dtype):
    """Casting to object gives Python floats and ``pd.NA`` as elements."""
    arr = pd.array([1.0, pd.NA], dtype=dtype)

    as_object = arr.astype(object)

    expected = np.array([1.0, pd.NA], dtype=object)
    tm.assert_numpy_array_equal(as_object, expected)
    # check exact element types
    value, missing = as_object[0], as_object[1]
    assert isinstance(value, float)
    assert missing is pd.NA
def test_Float64_conversion():
    """An object Series of numeric strings casts to Float64."""
    # GH#40729
    float64 = pd.Float64Dtype()
    strings = pd.Series(["1", "2", "3", "4"], dtype="object")

    converted = strings.astype(float64)

    expected = pd.Series([1.0, 2.0, 3.0, 4.0], dtype=float64)
    tm.assert_series_equal(converted, expected)

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
    """Comparison tests for float arrays, reusing the shared masked-array test mixins."""

    @pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1])
    def test_scalar(self, other, comparison_op, dtype):
        """Run the shared scalar comparison check with this list of scalars."""
        ComparisonOps.test_scalar(self, other, comparison_op, dtype)

    def test_compare_with_integerarray(self, comparison_op):
        """Comparing Int64 with Float64 matches comparing Int64 with the cast array."""
        ints = pd.array([0, 1, None] * 3, dtype="Int64")
        floats = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64")
        floats_as_int = floats.astype("Int64")

        # integer array on the left
        expected = comparison_op(ints, floats_as_int)
        result = comparison_op(ints, floats)
        tm.assert_extension_array_equal(result, expected)

        # float array on the left
        expected = comparison_op(floats_as_int, ints)
        result = comparison_op(floats, ints)
        tm.assert_extension_array_equal(result, expected)
def test_equals():
    """Float64 and Float32 arrays with the same values are not ``equals``."""
    # GH-30652
    # equals is generally tested in /tests/extension/base/methods, but this
    # specifically tests that two arrays of the same class but different dtype
    # do not evaluate equal
    values = [1, 2, None]
    wide = pd.array(values, dtype="Float64")
    narrow = pd.array(values, dtype="Float32")
    assert wide.equals(narrow) is False
def test_equals_nan_vs_na():
    """``equals`` treats an unmasked NaN and a masked (NA) slot as different."""
    # GH#44382

    # NaN stored in the data, nothing masked
    nan_mask = np.zeros(3, dtype=bool)
    nan_data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
    left = FloatingArray(nan_data, nan_mask)

    assert left.equals(left)
    tm.assert_extension_array_equal(left, left)
    assert left.equals(left.copy())
    assert left.equals(FloatingArray(nan_data.copy(), nan_mask.copy()))

    # ordinary data, middle slot masked
    na_mask = np.array([False, True, False], dtype=bool)
    na_data = np.array([1.0, 2.0, 3.0], dtype=np.float64)
    right = FloatingArray(na_data, na_mask)

    assert right.equals(right)
    tm.assert_extension_array_equal(right, right)

    assert not left.equals(right)

    # with mask[1] = True, the only difference is data[1], which should
    # not matter for equals
    nan_mask[1] = True
    assert left.equals(right)

View File

@ -0,0 +1,20 @@
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Float64", "Float64"], "Float64"),
        (["Float32", "Float64"], "Float64"),
        (["Float32", "Float32"], "Float32"),
    ],
)
def test_concat_series(to_concat_dtypes, result_dtype):
    """Concatenating nullable-float Series yields the parametrized result dtype."""
    pieces = [pd.Series([1, 2, pd.NA], dtype=name) for name in to_concat_dtypes]
    result = pd.concat(pieces)

    # reference: the same values as object dtype, cast to the expected dtype
    object_piece = pd.Series([1, 2, pd.NA], dtype=object)
    expected = pd.concat([object_piece, object_piece]).astype(result_dtype)

    tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,204 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_uses_pandas_na():
    """A missing entry of a Float64 array is returned as the ``pd.NA`` singleton."""
    arr = pd.array([1, None], dtype=Float64Dtype())
    missing = arr[1]
    assert missing is pd.NA
def test_floating_array_constructor():
    """``FloatingArray(values, mask)`` stores its inputs and validates their types."""
    values = np.array([1, 2, 3, 4], dtype="float64")
    mask = np.array([False, False, False, True], dtype="bool")

    built = FloatingArray(values, mask)
    tm.assert_extension_array_equal(
        built, pd.array([1, 2, 3, np.nan], dtype="Float64")
    )
    tm.assert_numpy_array_equal(built._data, values)
    tm.assert_numpy_array_equal(built._mask, mask)

    # lists and integer ndarrays are rejected for values; lists for the mask
    msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
    rejected = (
        (values.tolist(), mask),
        (values, mask.tolist()),
        (values.astype(int), mask),
    )
    for bad_values, bad_mask in rejected:
        with pytest.raises(TypeError, match=msg):
            FloatingArray(bad_values, bad_mask)

    # the mask argument is mandatory
    msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
    with pytest.raises(TypeError, match=msg):
        FloatingArray(values)
def test_floating_array_disallows_float16():
    """Building a ``FloatingArray`` from float16 data raises ``TypeError``."""
    # GH#44715
    half_precision = np.array([1, 2], dtype=np.float16)
    no_missing = np.array([False, False])

    expected_msg = "FloatingArray does not support np.float16 dtype"
    with pytest.raises(TypeError, match=expected_msg):
        FloatingArray(half_precision, no_missing)
def test_floating_array_disallows_Float16_dtype(request):
    """The dtype string ``"Float16"`` is rejected by ``pd.array``.

    ``request`` is accepted but not used by the body.
    """
    # GH#44715
    expected_msg = "data type 'Float16' not understood"
    with pytest.raises(TypeError, match=expected_msg):
        pd.array([1.0, 2.0], dtype="Float16")
def test_floating_array_constructor_copy():
    """``copy=True`` makes the constructor copy its inputs; the default does not."""
    values = np.array([1, 2, 3, 4], dtype="float64")
    mask = np.array([False, False, False, True], dtype="bool")

    # default: the very same ndarray objects are stored
    shared = FloatingArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    # copy=True: new ndarray objects are stored
    copied = FloatingArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
def test_to_array():
    """``pd.array`` on a list of floats infers the Float64 dtype."""
    floats = [0.1, 0.2, 0.3, 0.4]
    inferred = pd.array(floats)
    explicit = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64")
    tm.assert_extension_array_equal(inferred, explicit)
@pytest.mark.parametrize(
    "a, b",
    [
        ([1, None], [1, pd.NA]),
        ([None], [pd.NA]),
        ([None, np.nan], [pd.NA, pd.NA]),
        ([1, np.nan], [1, pd.NA]),
        ([np.nan], [pd.NA]),
    ],
)
def test_to_array_none_is_nan(a, b):
    """``None`` and ``np.nan`` inputs give the same Float64 array as ``pd.NA``."""
    from_raw = pd.array(a, dtype="Float64")
    from_na = pd.array(b, dtype="Float64")
    tm.assert_extension_array_equal(from_raw, from_na)
def test_to_array_mixed_integer_float():
    """Mixed int/float lists (with or without ``None``) are inferred as Float64."""
    cases = (
        ([1, 2.0], [1.0, 2.0]),
        ([1, None, 2.0], [1.0, None, 2.0]),
    )
    for mixed, all_float in cases:
        result = pd.array(mixed)
        expected = pd.array(all_float, dtype="Float64")
        tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        "foo",
        1,
        1.0,
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        [[1, 2], [3, 4]],
        [np.nan, {"a": 1}],
        # GH#44514 all-NA case used to get quietly swapped out before checking ndim
        np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
    ],
)
def test_to_array_error(values):
    """Inputs that cannot become a Float64 array raise TypeError or ValueError."""
    # any one of these messages is accepted for the failed conversion
    accepted_messages = (
        "cannot be converted to FloatingDtype",
        "values must be a 1D list-like",
        "Cannot pass scalar",
        r"float\(\) argument must be a string or a (real )?number, not 'dict'",
        "could not convert string to float: 'foo'",
        r"could not convert string to float: np\.str_\('foo'\)",
    )
    msg = "|".join(accepted_messages)
    with pytest.raises((TypeError, ValueError), match=msg):
        pd.array(values, dtype="Float64")
@pytest.mark.parametrize("values", [["1", "2", None], ["1.5", "2", None]])
def test_construct_from_float_strings(values):
    """Numeric strings are parsed into floats when building a Float64 array."""
    # see also test_to_integer_array_str
    first_value = float(values[0])
    expected = pd.array([first_value, 2, None], dtype="Float64")

    via_pd_array = pd.array(values, dtype="Float64")
    tm.assert_extension_array_equal(via_pd_array, expected)

    via_from_sequence = FloatingArray._from_sequence(values)
    tm.assert_extension_array_equal(via_from_sequence, expected)
def test_to_array_inferred_dtype():
    """``pd.array`` keeps a float32 ndarray's width and defaults lists to Float64."""
    # if values has dtype -> respect it
    from_float32 = pd.array(np.array([1, 2], dtype="float32"))
    assert from_float32.dtype == Float32Dtype()

    # if values have no dtype -> always float64
    from_list = pd.array([1.0, 2.0])
    assert from_list.dtype == Float64Dtype()
def test_to_array_dtype_keyword():
    """An explicit ``dtype=`` wins over whatever the input values carry."""
    from_ints = pd.array([1, 2], dtype="Float32")
    assert from_ints.dtype == Float32Dtype()

    # if values has dtype -> override it
    float32_values = np.array([1, 2], dtype="float32")
    widened = pd.array(float32_values, dtype="Float64")
    assert widened.dtype == Float64Dtype()
def test_to_array_integer():
    """Integer inputs convert to Float64 when that dtype is requested."""
    from_ints = pd.array([1, 2], dtype="Float64")
    from_floats = pd.array([1.0, 2.0], dtype="Float64")
    tm.assert_extension_array_equal(from_ints, from_floats)

    # for integer dtypes, the itemsize is not preserved
    # TODO can we specify "floating" in general?
    int32_values = np.array([1, 2], dtype="int32")
    converted = pd.array(int32_values, dtype="Float64")
    assert converted.dtype == Float64Dtype()
@pytest.mark.parametrize(
    "bool_values, values, target_dtype, expected_dtype",
    [
        ([False, True], [0, 1], Float64Dtype(), Float64Dtype()),
        ([False, True], [0, 1], "Float64", Float64Dtype()),
        ([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()),
    ],
)
def test_to_array_bool(bool_values, values, target_dtype, expected_dtype):
    """Boolean inputs convert to the same floating array as their 0/1 values."""
    from_bools = pd.array(bool_values, dtype=target_dtype)
    assert from_bools.dtype == expected_dtype

    from_numbers = pd.array(values, dtype=target_dtype)
    tm.assert_extension_array_equal(from_bools, from_numbers)
def test_series_from_float(data):
# construct from our dtype & string dtype
dtype = data.dtype
# from float
expected = pd.Series(data)
result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,12 @@
import numpy as np
import pandas as pd
def test_contains_nan():
# GH#52840
arr = pd.array(range(5)) / 0
assert np.isnan(arr._data[0])
assert not arr.isna()[0]
assert np.nan in arr

View File

@ -0,0 +1,194 @@
import numpy as np
import pytest
from pandas.compat import IS64
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning")
def test_ufuncs_single(ufunc):
    """Unary ufuncs on a Float64 array or Series return Float64 results."""
    arr = pd.array([1, 2, -3, np.nan], dtype="Float64")

    # array input
    expected_arr = pd.array(ufunc(arr.astype(float)), dtype="Float64")
    tm.assert_extension_array_equal(ufunc(arr), expected_arr)

    # Series input: same values, wrapped in a Series
    ser = pd.Series(arr)
    ser_result = ufunc(ser)
    tm.assert_series_equal(ser_result, pd.Series(expected_arr))
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    """Float-valued unary ufuncs match the result computed on plain floats."""
    arr = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64")
    with np.errstate(invalid="ignore"):
        arr_result = ufunc(arr)
        arr_expected = pd.array(ufunc(arr.astype(float)), dtype="Float64")
    tm.assert_extension_array_equal(arr_result, arr_expected)

    ser = pd.Series(arr)
    with np.errstate(invalid="ignore"):
        ser_result = ufunc(ser)
        ser_expected = pd.Series(ufunc(ser.astype(float)), dtype="Float64")
    tm.assert_series_equal(ser_result, ser_expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_float(ufunc):
    """Binary ufuncs with a FloatingArray operand return Float64 results."""
    masked = pd.array([1, 0.2, -3, np.nan], dtype="Float64")

    # two FloatingArrays
    result = ufunc(masked, masked)
    expected = pd.array(
        ufunc(masked.astype(float), masked.astype(float)), dtype="Float64"
    )
    tm.assert_extension_array_equal(result, expected)

    # FloatingArray with numpy array, then FloatingArray with scalar;
    # each is checked with the FloatingArray on the left and on the right
    for other in (np.array([1, 2, 3, 4]), 1):
        result = ufunc(masked, other)
        expected = pd.array(ufunc(masked.astype(float), other), dtype="Float64")
        tm.assert_extension_array_equal(result, expected)

        result = ufunc(other, masked)
        expected = pd.array(ufunc(other, masked.astype(float)), dtype="Float64")
        tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` on a Float64 array equals ``sum(skipna=False)``."""
    arr = pd.array(values, dtype="Float64")
    reduced = np.add.reduce(arr)
    tm.assert_almost_equal(reduced, arr.sum(skipna=False))
@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system")
@pytest.mark.parametrize(
    "pandasmethname, kwargs",
    [
        ("var", {"ddof": 0}),
        ("var", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("kurtosis", {}),
        ("skew", {}),
        ("sem", {}),
    ],
)
def test_stat_method(pandasmethname, kwargs):
    """Statistics on a Float64 Series with NaN input equal those on the plain float64 values."""
    nullable = pd.Series(
        data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64"
    )
    result = getattr(nullable, pandasmethname)(**kwargs)

    # reference: the six non-missing values as a plain float64 Series
    plain = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
    expected = getattr(plain, pandasmethname)(**kwargs)

    assert expected == result
def test_value_counts_na():
    """``value_counts`` counts NA only when ``dropna=False`` and keeps a Float64 index."""
    arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")

    index_with_na = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
    assert index_with_na.dtype == arr.dtype

    # dropna=False: NA gets its own row
    counted = arr.value_counts(dropna=False)
    expected = pd.Series([2, 1, 1], index=index_with_na, dtype="Int64", name="count")
    tm.assert_series_equal(counted, expected)

    # dropna=True: the NA row is absent
    counted = arr.value_counts(dropna=True)
    expected = pd.Series(
        [2, 1], index=index_with_na[:-1], dtype="Int64", name="count"
    )
    tm.assert_series_equal(counted, expected)
def test_value_counts_empty():
    """``value_counts`` of an empty Float64 Series is an empty Int64 Series."""
    empty = pd.Series([], dtype="Float64")
    counted = empty.value_counts()

    empty_index = pd.Index([], dtype="Float64")
    assert empty_index.dtype == "Float64"

    expected = pd.Series([], index=empty_index, dtype="Int64", name="count")
    tm.assert_series_equal(counted, expected)
def test_value_counts_with_normalize():
    """``value_counts(normalize=True)`` returns Float64 proportions on a Float64 index."""
    ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
    proportions = ser.value_counts(normalize=True)

    # counts of the two distinct values divided by the three non-missing entries
    counts = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion")
    expected = counts / 3
    assert expected.index.dtype == ser.dtype

    tm.assert_series_equal(proportions, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
arr = pd.array([1, 2, 3, None], dtype=dtype)
result = arr.sum(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 6.0
else:
assert result is pd.NA
@pytest.mark.parametrize(
    "values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)]
)
def test_floating_array_numpy_sum(values, expected):
    """``np.sum`` on a Float64 array returns the parametrized total."""
    total = np.sum(pd.array(values, dtype="Float64"))
    assert total == expected
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
    """Reductions keep Float64: scalar ``np.float64`` and a Float64 groupby column."""
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([0.1, None, 3.0], dtype="Float64"),
        }
    )

    # op
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, np.float64)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    expected_columns = {
        "B": np.array([1.0, 3.0]),
        "C": pd.array([0.1, 3], dtype="Float64"),
    }
    expected = pd.DataFrame(expected_columns, index=pd.Index(["a", "b"], name="A"))
    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_floating_array_min_max(skipna, method, dtype):
arr = pd.array([0.0, 1.0, None], dtype=dtype)
func = getattr(arr, method)
result = func(skipna=skipna)
if skipna:
assert result == (0 if method == "min" else 1)
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_floating_array_prod(skipna, min_count, dtype):
arr = pd.array([1.0, 2.0, None], dtype=dtype)
result = arr.prod(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 2
else:
assert result is pd.NA

View File

@ -0,0 +1,47 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
np.dtype(dtype.type).kind == "f"
assert dtype.name is not None
@pytest.mark.parametrize(
    "dtype, expected",
    [(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")],
)
def test_repr_dtype(dtype, expected):
    """``repr`` of a floating dtype instance is its class name followed by ``()``."""
    rendered = repr(dtype)
    assert rendered == expected
def test_repr_array():
    """A short FloatingArray repr lists the values, the length and the dtype."""
    arr = pd.array([1.0, None, 3.0])
    expected = "<FloatingArray>\n[1.0, <NA>, 3.0]\nLength: 3, dtype: Float64"
    assert repr(arr) == expected
def test_repr_array_long():
    """A long FloatingArray repr is truncated with ``...`` and column-aligned.

    The expected text relies on exact padding: values are right-aligned to a
    common width (two spaces before ``1.0``/``2.0``) and continuation lines
    start with one space. The literal previously had that padding collapsed
    to single spaces, so it could never equal the real repr.
    """
    data = pd.array([1.0, 2.0, None] * 1000)
    expected = """<FloatingArray>
[ 1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,
 ...
 <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>,  1.0,  2.0, <NA>]
Length: 3000, dtype: Float64"""
    result = repr(data)
    assert result == expected
def test_frame_repr(data_missing):
df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 <NA>\n1 0.1"
assert result == expected

View File

@ -0,0 +1,132 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_float(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to float, otherwise raises
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_int(box):
    """``to_numpy(dtype="int64")`` works without NA and raises when NA is present."""
    if box:
        con = pd.Series
    else:
        con = pd.array

    # no missing values -> can convert to int, otherwise raises
    whole = con([1.0, 2.0, 3.0], dtype="Float64")
    tm.assert_numpy_array_equal(
        whole.to_numpy(dtype="int64"), np.array([1, 2, 3], dtype="int64")
    )

    with_na = con([1.0, 2.0, None], dtype="Float64")
    with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
        with_na.to_numpy(dtype="int64")

    # automatic casting (floors the values)
    fractional = con([0.1, 0.9, 1.1], dtype="Float64")
    tm.assert_numpy_array_equal(
        fractional.to_numpy(dtype="int64"), np.array([0, 0, 1], dtype="int64")
    )
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_value(box):
    """``na_value`` fills the missing slot for object, bool and int64 targets."""
    if box:
        con = pd.Series
    else:
        con = pd.array
    arr = con([0.0, 1.0, None], dtype="Float64")

    # (to_numpy keyword arguments, expected ndarray)
    scenarios = (
        (
            {"dtype": object, "na_value": None},
            np.array([0.0, 1.0, None], dtype="object"),
        ),
        (
            {"dtype": bool, "na_value": False},
            np.array([False, True, False], dtype="bool"),
        ),
        (
            {"dtype": "int64", "na_value": -99},
            np.array([0, 1, -99], dtype="int64"),
        ),
    )
    for kwargs, expected in scenarios:
        tm.assert_numpy_array_equal(arr.to_numpy(**kwargs), expected)
def test_to_numpy_na_value_with_nan():
    """``na_value`` replaces only masked entries; an unmasked NaN is kept."""
    # array with both NaN and NA -> only fill NA with `na_value`
    values = np.array([0.0, np.nan, 0.0])
    mask = np.array([False, False, True])
    arr = FloatingArray(values, mask)

    filled = arr.to_numpy(dtype="float64", na_value=-1)
    tm.assert_numpy_array_equal(
        filled, np.array([0.0, np.nan, -1.0], dtype="float64")
    )
@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_dtype(box, dtype):
    """Without missing values, ``to_numpy`` honours each requested numpy dtype."""
    if box:
        con = pd.Series
    else:
        con = pd.array
    arr = con([0.0, 1.0], dtype="Float64")

    converted = arr.to_numpy(dtype=dtype)
    tm.assert_numpy_array_equal(converted, np.array([0, 1], dtype=dtype))
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_raises(box, dtype):
    """Converting an array with NA to int or bool raises, naming the dtype."""
    if box:
        con = pd.Series
    else:
        con = pd.array
    with_na = con([0.0, 1.0, None], dtype="Float64")

    # the error message is expected to mention the target dtype
    with pytest.raises(ValueError, match=dtype):
        with_na.to_numpy(dtype=dtype)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_string(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype="str")
expected = np.array([0.0, 1.0, pd.NA], dtype=f"{tm.ENDIAN}U32")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy():
    """``to_numpy`` shares memory by default here; ``copy=True`` detaches it."""
    # to_numpy can be zero-copy if no missing values
    source = pd.array([0.1, 0.2, 0.3], dtype="Float64")
    view = source.to_numpy(dtype="float64")
    view[0] = 10
    # the write is visible through the original array
    tm.assert_extension_array_equal(
        source, pd.array([10, 0.2, 0.3], dtype="Float64")
    )

    source = pd.array([0.1, 0.2, 0.3], dtype="Float64")
    detached = source.to_numpy(dtype="float64", copy=True)
    detached[0] = 10
    # the write does not reach the original array
    tm.assert_extension_array_equal(
        source, pd.array([0.1, 0.2, 0.3], dtype="Float64")
    )

View File

@ -0,0 +1,68 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
@pytest.fixture(
    params=[
        Int8Dtype,
        Int16Dtype,
        Int32Dtype,
        Int64Dtype,
        UInt8Dtype,
        UInt16Dtype,
        UInt32Dtype,
        UInt64Dtype,
    ]
)
def dtype(request):
    """Parametrized fixture: an instance of each nullable integer dtype class."""
    dtype_class = request.param
    return dtype_class()
@pytest.fixture
def data(dtype):
    """
    Fixture returning a 100-element array of the parametrized integer 'dtype'
    that mixes valid values with two missing entries.

    Used to test dtype conversion with and without missing values.
    """
    values = [*range(8), np.nan, *range(10, 98), np.nan, 99, 100]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """
    Fixture returning a two-element array of the parametrized integer 'dtype':
    one missing entry followed by the valid integer 1.

    Used to test dtype conversion with and without missing values.
    """
    values = [np.nan, 1]
    return pd.array(values, dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture returning the 'data' or 'data_missing' integer array.

    Used to test dtype conversion with and without missing values.
    """
    # the parameter names the fixture value to hand back; any other
    # parameter yields None
    by_name = {"data": data, "data_missing": data_missing}
    return by_name.get(request.param)

View File

@ -0,0 +1,385 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[("add", [1, 3, None, None, 9]), ("mul", [0, 2, None, None, 20])],
ids=["add", "mul"],
)
def test_add_mul(dtype, opname, exp):
a = pd.array([0, 1, None, 3, 4], dtype=dtype)
b = pd.array([1, 2, 3, None, 5], dtype=dtype)
# array / array
expected = pd.array(exp, dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
op = getattr(ops, "r" + opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
def test_sub(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a - b
expected = pd.array([1, 1, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_div(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a / b
expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(zero, negative):
    """Dividing Int64 by zero gives NaN/inf/-inf, sign-flipped for negative zero."""
    # https://github.com/pandas-dev/pandas/issues/27398, GH#22793
    numerators = pd.array([0, 1, -1, None], dtype="Int64")
    result = numerators / zero

    # the last slot is masked, so its underlying value (1) is irrelevant
    raw_values = np.array([np.nan, np.inf, -np.inf, 1], dtype="float64")
    raw_mask = np.array([False, False, False, True])
    expected = FloatingArray(raw_values, raw_mask)
    if negative:
        expected *= -1

    tm.assert_extension_array_equal(result, expected)
def test_floordiv(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a // b
# Series op sets 1//0 to np.inf, which IntegerArray does not do (yet)
expected = pd.array([0, 2, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_floordiv_by_int_zero_no_mask(any_int_ea_dtype):
# GH 48223: Aligns with non-masked floordiv
# but differs from numpy
# https://github.com/pandas-dev/pandas/issues/30188#issuecomment-564452740
ser = pd.Series([0, 1], dtype=any_int_ea_dtype)
result = 1 // ser
expected = pd.Series([np.inf, 1.0], dtype="Float64")
tm.assert_series_equal(result, expected)
ser_non_nullable = ser.astype(ser.dtype.numpy_dtype)
result = 1 // ser_non_nullable
expected = expected.astype(np.float64)
tm.assert_series_equal(result, expected)
def test_mod(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a % b
expected = pd.array([0, 0, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar():
    """Power with scalar exponents and bases, including ``pd.NA`` and ``np.nan``."""
    base = pd.array([-1, 0, 1, None, 2], dtype="Int64")

    # array ** scalar
    tm.assert_extension_array_equal(
        base**0, pd.array([1, 1, 1, 1, 1], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        base**1, pd.array([-1, 0, 1, None, 2], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        base**pd.NA, pd.array([None, None, 1, None, None], dtype="Int64")
    )

    # a NaN exponent gives a Float64 result with unmasked NaN entries
    result = base**np.nan
    expected = FloatingArray(
        np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
        np.array([False, False, False, True, False]),
    )
    tm.assert_extension_array_equal(result, expected)

    # reversed
    exponents = base[1:]  # Can't raise integers to negative powers.

    tm.assert_extension_array_equal(
        0**exponents, pd.array([1, 0, None, 0], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        1**exponents, pd.array([1, 1, 1, 1], dtype="Int64")
    )
    tm.assert_extension_array_equal(
        pd.NA**exponents, pd.array([1, None, None, None], dtype="Int64")
    )

    result = np.nan**exponents
    expected = FloatingArray(
        np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
        np.array([False, False, True, False]),
    )
    tm.assert_extension_array_equal(result, expected)
def test_pow_array():
    """Element-wise power over every pairing of 0, 1 and NA."""
    bases = pd.array([0, 0, 0, 1, 1, 1, None, None, None])
    exponents = pd.array([0, 1, None, 0, 1, None, 0, 1, None])

    powered = bases**exponents
    tm.assert_extension_array_equal(
        powered, pd.array([1, 0, None, 1, 1, 1, 1, None, None])
    )
def test_rpow_one_to_na():
    """``1.0 ** NA`` is 1.0 while ``2.0 ** NA`` stays missing."""
    # https://github.com/pandas-dev/pandas/issues/22022
    # https://github.com/pandas-dev/pandas/issues/29997
    all_na = pd.array([np.nan, np.nan], dtype="Int64")
    bases = np.array([1.0, 2.0])

    powered = bases**all_na
    tm.assert_extension_array_equal(
        powered, pd.array([1.0, np.nan], dtype="Float64")
    )
@pytest.mark.parametrize("other", [0, 0.5])
def test_numpy_zero_dim_ndarray(other):
    """Adding a zero-dimensional ndarray behaves like adding the scalar itself."""
    arr = pd.array([1, None, 2])
    zero_dim = np.array(other)

    with_ndarray = arr + zero_dim
    with_scalar = arr + other
    tm.assert_equal(with_ndarray, with_scalar)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
    """Arithmetic between an integer Series and non-numeric operands raises.

    Parameters
    ----------
    data
        Values wrapped in a Series to form the left operand.
    all_arithmetic_operators
        Name of the Series dunder method to call (looked up with ``getattr``).
    using_infer_string
        When true, pyarrow's ``ArrowNotImplementedError`` and
        ``NotImplementedError`` are accepted in addition to ``TypeError``.
    """
    op = all_arithmetic_operators
    s = pd.Series(data)
    # NOTE: this local shadows any module-level name ``ops`` for the rest of
    # the function; it is the bound Series method under test.
    ops = getattr(s, op)
    if using_infer_string:
        # imported lazily so pyarrow is only needed in this branch
        import pyarrow as pa
        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
    else:
        errs = TypeError
    # invalid scalars
    # any one of these patterns is accepted as the error message
    msg = "|".join(
        [
            r"can only perform ops with numeric values",
            r"IntegerArray cannot perform the operation mod",
            r"unsupported operand type",
            r"can only concatenate str \(not \"int\"\) to str",
            "not all arguments converted during string",
            "ufunc '.*' not supported for the input types, and the inputs could not",
            "ufunc '.*' did not contain a loop with signature matching types",
            "Addition/subtraction of integers and integer-arrays with Timestamp",
            "has no kernel",
            "not implemented",
            "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.",
        ]
    )
    with pytest.raises(errs, match=msg):
        ops("foo")
    with pytest.raises(errs, match=msg):
        ops(pd.Timestamp("20180101"))
    # invalid array-likes
    str_ser = pd.Series("foo", index=s.index)
    # with pytest.raises(TypeError, match=msg):
    if (
        all_arithmetic_operators
        in [
            "__mul__",
            "__rmul__",
        ]
        and not using_infer_string
    ):  # (data[~data.isna()] >= 0).all():
        # multiplication with a string Series is expected to succeed here and
        # is compared against element-wise string repetition
        res = ops(str_ser)
        expected = pd.Series(["foo" * x for x in data], index=s.index)
        expected = expected.fillna(np.nan)
        # TODO: doing this fillna to keep tests passing as we make
        # assert_almost_equal stricter, but the expected with pd.NA seems
        # more-correct than np.nan here.
        tm.assert_series_equal(res, expected)
    else:
        with pytest.raises(errs, match=msg):
            ops(str_ser)
    # NOTE(review): indentation was lost in the reviewed copy; the datetime
    # check below is placed at function level (after the if/else) -- confirm.
    msg = "|".join(
        [
            "can only perform ops with numeric values",
            "cannot perform .* with this index type: DatetimeArray",
            "Addition/subtraction of integers and integer-arrays "
            "with DatetimeArray is no longer supported. *",
            "unsupported operand type",
            r"can only concatenate str \(not \"int\"\) to str",
            "not all arguments converted during string",
            "cannot subtract DatetimeArray from ndarray",
            "has no kernel",
            "not implemented",
        ]
    )
    with pytest.raises(errs, match=msg):
        ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
# TODO test unsigned overflow
def test_arith_coerce_scalar(data, all_arithmetic_operators):
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series(data)
other = 0.01
result = op(s, other)
expected = op(s.astype(float), other)
expected = expected.astype("Float64")
# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
if all_arithmetic_operators == "__rmod__":
mask = (s == 0).fillna(False).to_numpy(bool)
expected.array._mask[mask] = False
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("other", [1.0, np.array(1.0)])
def test_arithmetic_conversion(all_arithmetic_operators, other):
# if we have a float operand we should have a float result
# if that is equal to an integer
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series([1, 2, 3], dtype="Int64")
result = op(s, other)
assert result.dtype == "Float64"
def test_cross_type_arithmetic():
    """Arithmetic across Int64, UInt8 and plain int columns stays nullable."""
    frame = pd.DataFrame(
        {
            "A": pd.Series([1, 2, np.nan], dtype="Int64"),
            "B": pd.Series([1, np.nan, 3], dtype="UInt8"),
            "C": [1, 2, 3],
        }
    )

    # nullable + numpy-backed column
    tm.assert_series_equal(
        frame.A + frame.C, pd.Series([2, 4, np.nan], dtype="Int64")
    )

    # chained arithmetic followed by a comparison
    tm.assert_series_equal(
        (frame.A + frame.C) * 3 == 12,
        pd.Series([False, True, None], dtype="boolean"),
    )

    # two nullable columns of different integer dtypes
    tm.assert_series_equal(
        frame.A + frame.B, pd.Series([2, np.nan, np.nan], dtype="Int64")
    )
@pytest.mark.parametrize("op", ["mean"])
def test_reduce_to_float(op):
    """Some reductions always return float, even when the result is a whole number."""
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, float)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    expected_columns = {
        "B": np.array([1.0, 3.0]),
        "C": pd.array([1, 3], dtype="Float64"),
    }
    expected = pd.DataFrame(expected_columns, index=pd.Index(["a", "b"], name="A"))
    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"source, neg_target, abs_target",
[
([1, 2, 3], [-1, -2, -3], [1, 2, 3]),
([1, 2, None], [-1, -2, None], [1, 2, None]),
([-1, 0, 1], [1, 0, -1], [1, 0, 1]),
],
)
def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_target):
dtype = any_signed_int_ea_dtype
arr = pd.array(source, dtype=dtype)
neg_result, pos_result, abs_result = -arr, +arr, abs(arr)
neg_target = pd.array(neg_target, dtype=dtype)
abs_target = pd.array(abs_target, dtype=dtype)
tm.assert_extension_array_equal(neg_result, neg_target)
tm.assert_extension_array_equal(pos_result, arr)
assert not tm.shares_memory(pos_result, arr)
tm.assert_extension_array_equal(abs_result, abs_target)
def test_values_multiplying_large_series_by_NA():
    """``pd.NA`` times a 10001-element Series gives a Series of ``pd.NA``."""
    # GH#33701
    zeros = pd.Series(np.zeros(10001))
    product = pd.NA * zeros

    all_na = pd.Series([pd.NA] * 10001)
    tm.assert_series_equal(product, all_na)
def test_bitwise(dtype):
left = pd.array([1, None, 3, 4], dtype=dtype)
right = pd.array([None, 3, 5, 4], dtype=dtype)
result = left | right
expected = pd.array([None, None, 3 | 5, 4 | 4], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = left & right
expected = pd.array([None, None, 3 & 5, 4 & 4], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = left ^ right
expected = pd.array([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
# TODO: desired behavior when operating with boolean? defer?
floats = right.astype("Float64")
with pytest.raises(TypeError, match="unsupported operand type"):
left | floats
with pytest.raises(TypeError, match="unsupported operand type"):
left & floats
with pytest.raises(TypeError, match="unsupported operand type"):
left ^ floats

View File

@ -0,0 +1,39 @@
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
@pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_compare_to_int(self, dtype, comparison_op):
# GH 28930
op_name = f"__{comparison_op.__name__}__"
s1 = pd.Series([1, None, 3], dtype=dtype)
s2 = pd.Series([1, None, 3], dtype="float")
method = getattr(s1, op_name)
result = method(2)
method = getattr(s2, op_name)
expected = method(2).astype("boolean")
expected[s2.isna()] = pd.NA
tm.assert_series_equal(result, expected)
def test_equals():
    """Arrays of the same class but different integer dtypes are not ``equals``."""
    # GH-30652
    # ``equals`` is generally covered in /tests/extension/base/methods; this
    # case only pins the Int64-versus-Int32 behaviour.
    wide = pd.array([1, 2, None], dtype="Int64")
    narrow = pd.array([1, 2, None], dtype="Int32")
    outcome = wide.equals(narrow)
    assert outcome is False

View File

@ -0,0 +1,69 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Int64", "Int64"], "Int64"),
        (["UInt64", "UInt64"], "UInt64"),
        (["Int8", "Int8"], "Int8"),
        (["Int8", "Int16"], "Int16"),
        (["UInt8", "Int8"], "Int16"),
        (["Int32", "UInt32"], "Int64"),
        (["Int64", "UInt64"], "Float64"),
        (["Int64", "boolean"], "object"),
        (["UInt8", "boolean"], "object"),
    ],
)
def test_concat_series(to_concat_dtypes, result_dtype):
    """Concatenating masked Series gives the parametrized dtype in either order."""
    # we expect the same dtypes as we would get with non-masked inputs,
    # just masked where available.
    # order doesn't matter for result, so the reversed order is checked too
    for dtype_order in (to_concat_dtypes, to_concat_dtypes[::-1]):
        pieces = [pd.Series([0, 1, pd.NA], dtype=name) for name in dtype_order]
        result = pd.concat(pieces)

        object_piece = pd.Series([0, 1, pd.NA], dtype=object)
        expected = pd.concat([object_piece, object_piece]).astype(result_dtype)

        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "to_concat_dtypes, result_dtype",
    [
        (["Int64", "int64"], "Int64"),
        (["UInt64", "uint64"], "UInt64"),
        (["Int8", "int8"], "Int8"),
        (["Int8", "int16"], "Int16"),
        (["UInt8", "int8"], "Int16"),
        (["Int32", "uint32"], "Int64"),
        (["Int64", "uint64"], "Float64"),
        (["Int64", "bool"], "object"),
        (["UInt8", "bool"], "object"),
    ],
)
def test_concat_series_with_numpy(to_concat_dtypes, result_dtype):
    """Concatenating a masked Series with a numpy-backed one gives ``result_dtype``."""
    # we expect the same dtypes as we would get with non-masked inputs,
    # just masked where available.
    masked_dtype, numpy_dtype = to_concat_dtypes
    masked = pd.Series([0, 1, pd.NA], dtype=masked_dtype)
    plain = pd.Series(np.array([0, 1], dtype=numpy_dtype))

    result = pd.concat([masked, plain], ignore_index=True)
    expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype)
    tm.assert_series_equal(result, expected)

    # order doesn't matter for result
    result = pd.concat([plain, masked], ignore_index=True)
    expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype)
    tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,245 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_integer
from pandas.core.arrays import IntegerArray
from pandas.core.arrays.integer import (
Int8Dtype,
Int32Dtype,
Int64Dtype,
)
@pytest.fixture(params=[pd.array, IntegerArray._from_sequence])
def constructor(request):
    """Parametrized fixture yielding each callable that builds an array from a sequence.

    Either ``pd.array`` or ``IntegerArray._from_sequence``; used by the dtype
    conversion tests.
    """
    builder = request.param
    return builder
def test_uses_pandas_na():
    """A missing slot in an Int64 array is returned as the ``pd.NA`` singleton."""
    arr = pd.array([1, None], dtype=Int64Dtype())
    missing = arr[1]
    assert missing is pd.NA
def test_from_dtype_from_float(data):
    """Series built from float/list/int-array views of ``data`` round-trip by dtype name."""
    # construct from our dtype & string dtype
    dtype = data.dtype
    dtype_name = str(dtype)

    # from float
    as_float = data.to_numpy(na_value=np.nan, dtype="float")
    tm.assert_series_equal(pd.Series(as_float, dtype=dtype_name), pd.Series(data))

    # from int / list
    as_list = np.array(data).tolist()
    tm.assert_series_equal(pd.Series(as_list, dtype=dtype_name), pd.Series(data))

    # from int / array
    expected = pd.Series(data).dropna().reset_index(drop=True)
    without_na = np.array(data.dropna()).astype(np.dtype(dtype.type))
    result = pd.Series(without_na, dtype=dtype_name)
    tm.assert_series_equal(result, expected)
def test_conversions(data_missing):
    """Casting a masked column to object keeps ``pd.NA`` and scalar types intact.

    ``data_missing`` is a fixture supplied elsewhere; the expected values
    below presume it holds ``[NA, 1]`` (verify against the conftest).
    """
    # astype to object series
    df = pd.DataFrame({"A": data_missing})
    result = df["A"].astype("object")
    expected = pd.Series(np.array([pd.NA, 1], dtype=object), name="A")
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df["A"].astype("object").values
    expected = np.array([pd.NA, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            # Exact type identity is intended, so compare the type objects
            # with ``is`` rather than ``==``.
            assert type(r) is type(e)
def test_integer_array_constructor():
    """``IntegerArray(values, mask)`` accepts only proper numpy arrays for both arguments."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    result = IntegerArray(values, mask)
    expected = pd.array([1, 2, 3, np.nan], dtype="Int64")
    tm.assert_extension_array_equal(result, expected)

    # Lists and a float ndarray are all rejected with the same message.
    msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
    invalid_args = [
        (values.tolist(), mask),
        (values, mask.tolist()),
        (values.astype(float), mask),
    ]
    for bad_values, bad_mask in invalid_args:
        with pytest.raises(TypeError, match=msg):
            IntegerArray(bad_values, bad_mask)

    msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
    with pytest.raises(TypeError, match=msg):
        IntegerArray(values)
def test_integer_array_constructor_copy():
    """Without ``copy=True`` the constructor keeps the given arrays; with it, it does not."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    shared = IntegerArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    copied = IntegerArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
@pytest.mark.parametrize(
    "a, b",
    [
        ([1, None], [1, np.nan]),
        ([None], [np.nan]),
        ([None, np.nan], [np.nan, np.nan]),
        ([np.nan, np.nan], [np.nan, np.nan]),
    ],
)
def test_to_integer_array_none_is_nan(a, b):
    """``None`` and ``np.nan`` inputs produce equal Int64 arrays."""
    from_a, from_b = (pd.array(seq, dtype="Int64") for seq in (a, b))
    tm.assert_extension_array_equal(from_a, from_b)
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        "foo",
        1,
        1.0,
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        [[1, 2], [3, 4]],
        [np.nan, {"a": 1}],
    ],
)
def test_to_integer_array_error(values):
    """Inputs that cannot become an IntegerArray raise ValueError or TypeError."""
    # error in converting existing arrays to IntegerArrays
    patterns = [
        r"cannot be converted to IntegerDtype",
        r"invalid literal for int\(\) with base 10:",
        r"values must be a 1D list-like",
        r"Cannot pass scalar",
        r"int\(\) argument must be a string",
    ]
    msg = "|".join(patterns)

    def via_pd_array(obj):
        return pd.array(obj, dtype="Int64")

    for build in (via_pd_array, IntegerArray._from_sequence):
        with pytest.raises((ValueError, TypeError), match=msg):
            build(values)
def test_to_integer_array_inferred_dtype(constructor):
    """The masked dtype is inferred from the input's numpy dtype, defaulting to Int64."""
    # if values has dtype -> respect it
    for numpy_name, expected_dtype in (("int8", Int8Dtype()), ("int32", Int32Dtype())):
        result = constructor(np.array([1, 2], dtype=numpy_name))
        assert result.dtype == expected_dtype

    # if values have no dtype -> always int64
    assert constructor([1, 2]).dtype == Int64Dtype()
def test_to_integer_array_dtype_keyword(constructor):
    """An explicit ``dtype`` keyword decides the result dtype."""
    from_list = constructor([1, 2], dtype="Int8")
    assert from_list.dtype == Int8Dtype()

    # if values has dtype -> override it
    int8_values = np.array([1, 2], dtype="int8")
    from_ndarray = constructor(int8_values, dtype="Int32")
    assert from_ndarray.dtype == Int32Dtype()
def test_to_integer_array_float():
    """Whole-number floats convert to Int64; fractional floats raise TypeError."""
    converted = IntegerArray._from_sequence([1.0, 2.0], dtype="Int64")
    tm.assert_extension_array_equal(converted, pd.array([1, 2], dtype="Int64"))

    with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
        IntegerArray._from_sequence([1.5, 2.0], dtype="Int64")

    # for float dtypes, the itemsize is not preserved
    float32_values = np.array([1.0, 2.0], dtype="float32")
    converted = IntegerArray._from_sequence(float32_values, dtype="Int64")
    assert converted.dtype == Int64Dtype()
def test_to_integer_array_str():
    """Integer-like strings parse to Int64; empty or decimal strings raise ValueError."""
    parsed = IntegerArray._from_sequence(["1", "2", None], dtype="Int64")
    tm.assert_extension_array_equal(parsed, pd.array([1, 2, np.nan], dtype="Int64"))

    msg = r"invalid literal for int\(\) with base 10: .*"
    for unparsable in (["1", "2", ""], ["1.5", "2.0"]):
        with pytest.raises(ValueError, match=msg):
            IntegerArray._from_sequence(unparsable, dtype="Int64")
@pytest.mark.parametrize(
    "bool_values, int_values, target_dtype, expected_dtype",
    [
        ([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
        ([False, True], [0, 1], "Int64", Int64Dtype()),
        ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
    ],
)
def test_to_integer_array_bool(
    constructor, bool_values, int_values, target_dtype, expected_dtype
):
    """Boolean inputs convert to the matching 0/1 integer array."""
    from_bools = constructor(bool_values, dtype=target_dtype)
    assert from_bools.dtype == expected_dtype

    from_ints = pd.array(int_values, dtype=target_dtype)
    tm.assert_extension_array_equal(from_bools, from_ints)
@pytest.mark.parametrize(
    "values, to_dtype, result_dtype",
    [
        (np.array([1], dtype="int64"), None, Int64Dtype),
        (np.array([1, np.nan]), None, Int64Dtype),
        (np.array([1, np.nan]), "int8", Int8Dtype),
    ],
)
def test_to_integer_array(values, to_dtype, result_dtype):
    """``_from_sequence`` on numpy arrays yields the expected masked dtype and values."""
    # convert existing arrays to IntegerArrays
    converted = IntegerArray._from_sequence(values, dtype=to_dtype)
    assert converted.dtype == result_dtype()

    reference = pd.array(values, dtype=result_dtype())
    tm.assert_extension_array_equal(converted, reference)
def test_integer_array_from_boolean():
    """Bool and object-dtype boolean ndarrays convert to the same Int64 array."""
    # GH31104
    flags = [True, False]
    from_bool = pd.array(np.array(flags), dtype="Int64")
    from_object = pd.array(np.array(flags, dtype=object), dtype="Int64")
    tm.assert_extension_array_equal(from_object, from_bool)

View File

@ -0,0 +1,294 @@
import numpy as np
import pytest
from pandas.core.dtypes.generic import ABCIndex
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import (
Int8Dtype,
UInt32Dtype,
)
def test_dtypes(dtype):
    """The dtype's numpy kind matches its signedness and it has a name."""
    # smoke tests on auto dtype construction
    expected_kind = "i" if dtype.is_signed_integer else "u"
    assert np.dtype(dtype.type).kind == expected_kind
    assert dtype.name is not None
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
    """Reductions on an Int64 column keep an integer result, directly and via groupby.

    Parameters
    ----------
    op : str
        Name of the reduction method to call on the Series and the GroupBy.
    """
    # for ops that enable (mean would actually work here
    # but generally it is a float return value)
    df = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": pd.array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    # Every parametrized op gives a numpy integer scalar; the old
    # ``isinstance(result, int)`` fallback could never run and was dropped.
    result = getattr(df.C, op)()
    assert isinstance(result, np.int64)

    # groupby
    result = getattr(df.groupby("A"), op)()
    expected = pd.DataFrame(
        {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")},
        index=pd.Index(["a", "b"], name="A"),
    )
    tm.assert_frame_equal(result, expected)
def test_astype_nansafe():
    """Casting an array containing NA to a numpy integer dtype raises ValueError."""
    # see gh-22343
    with_na = pd.array([np.nan, 1, 2], dtype="Int8")
    with pytest.raises(ValueError, match="cannot convert NA to integer"):
        with_na.astype("uint32")
@pytest.mark.parametrize("dropna", [True, False])
def test_construct_index(all_data, dropna):
    """Building an Index from masked data keeps the masked dtype."""
    # ensure that we do not coerce to different Index dtype or non-index
    all_data = all_data[:10]
    masked_dtype = all_data.dtype
    other = np.array(all_data[~all_data.isna()]) if dropna else all_data

    result = pd.Index(pd.array(other, dtype=masked_dtype))
    expected = pd.Index(other, dtype=masked_dtype)
    assert all_data.dtype == expected.dtype  # dont coerce to object

    tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("dropna", [True, False])
def test_astype_index(all_data, dropna):
    """``Index.astype`` to a masked dtype matches going through object first."""
    # as an int/uint index to Index
    all_data = all_data[:10]
    other = all_data[~all_data.isna()] if dropna else all_data
    target_dtype = all_data.dtype

    idx = pd.Index(np.array(other))
    assert isinstance(idx, ABCIndex)

    direct = idx.astype(target_dtype)
    via_object = idx.astype(object).astype(target_dtype)
    tm.assert_index_equal(direct, via_object)
def test_astype(all_data):
    """``Series.astype`` between masked, numpy and object dtypes, with and without NA."""
    all_data = all_data[:10]
    non_missing = all_data[~all_data.isna()]
    own_dtype = all_data.dtype
    numpy_dtype = own_dtype.numpy_dtype
    other_dtype = Int8Dtype()

    # --- values without NA ---
    ser = pd.Series(non_missing)

    # coerce to same type - ints
    tm.assert_series_equal(ser.astype(own_dtype), pd.Series(non_missing))

    # coerce to same other - ints
    tm.assert_series_equal(
        ser.astype(other_dtype), pd.Series(non_missing, dtype=other_dtype)
    )

    # coerce to same numpy_dtype - ints
    tm.assert_series_equal(
        ser.astype(numpy_dtype), pd.Series(non_missing._data.astype(numpy_dtype))
    )

    # --- values that may contain NA ---
    ser = pd.Series(all_data)

    # coerce to same type - mixed
    tm.assert_series_equal(ser.astype(own_dtype), pd.Series(all_data))

    # coerce to same other - mixed
    tm.assert_series_equal(
        ser.astype(other_dtype), pd.Series(all_data, dtype=other_dtype)
    )

    # coerce to same numpy_dtype - mixed
    msg = "cannot convert NA to integer"
    with pytest.raises(ValueError, match=msg):
        ser.astype(numpy_dtype)

    # coerce to object
    tm.assert_series_equal(
        ser.astype("object"), pd.Series(np.asarray(all_data, dtype=object))
    )
def test_astype_copy():
    """``astype`` honours ``copy`` for the same dtype and always copies for another dtype."""

    def fresh():
        return pd.array([1, 2, 3, None], dtype="Int64")

    arr, orig = fresh(), fresh()

    # copy=True -> ensure both data and mask are actual copies
    result = arr.astype("Int64", copy=True)
    assert result is not arr
    assert not tm.shares_memory(result, arr)
    for new_value in (10, pd.NA):
        result[0] = new_value
        tm.assert_extension_array_equal(arr, orig)

    # copy=False
    result = arr.astype("Int64", copy=False)
    assert result is arr
    assert np.shares_memory(result._data, arr._data)
    assert np.shares_memory(result._mask, arr._mask)
    result[0] = 10
    assert arr[0] == 10
    result[0] = pd.NA
    assert arr[0] is pd.NA

    # astype to different dtype -> always needs a copy -> even with copy=False
    # we need to ensure that also the mask is actually copied
    arr, orig = fresh(), fresh()
    result = arr.astype("Int32", copy=False)
    assert not tm.shares_memory(result, arr)
    for new_value in (10, pd.NA):
        result[0] = new_value
        tm.assert_extension_array_equal(arr, orig)
def test_astype_to_larger_numpy():
    """Casting NA-free 32-bit masked arrays to 64-bit numpy dtypes gives ndarrays."""
    for masked_dtype, numpy_dtype in (("Int32", "int64"), ("UInt32", "uint64")):
        masked = pd.array([1, 2], dtype=masked_dtype)
        result = masked.astype(numpy_dtype)
        expected = np.array([1, 2], dtype=numpy_dtype)
        tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
def test_astype_specific_casting(dtype):
    """Casting an Int64 Series to another masked dtype equals constructing it directly."""
    for values in ([1, 2, 3], [1, 2, 3, None]):
        source = pd.Series(values, dtype="Int64")
        result = source.astype(dtype)
        expected = pd.Series(values, dtype=dtype)
        tm.assert_series_equal(result, expected)
def test_astype_floating():
    """Int64 -> Float64 keeps the values and the missing slot."""
    ints = pd.array([1, 2, None], dtype="Int64")
    floats = pd.array([1.0, 2.0, None], dtype="Float64")
    tm.assert_extension_array_equal(ints.astype("Float64"), floats)
def test_astype_dt64():
    """Casting to ``datetime64[ns]`` turns NA into NaT."""
    # GH#32435
    nanos = pd.array([1, 2, 3, pd.NA]) * 10**9

    converted = nanos.astype("datetime64[ns]")

    seconds = np.array([1, 2, 3, "NaT"], dtype="M8[s]")
    tm.assert_numpy_array_equal(converted, seconds.astype("M8[ns]"))
def test_construct_cast_invalid(dtype):
    """Fractional floats cannot be constructed as, or cast to, a masked integer dtype."""
    msg = "cannot safely"
    for arr in ([1.2, 2.3, 3.7], [1.2, 2.3, 3.7, np.nan]):
        with pytest.raises(TypeError, match=msg):
            pd.array(arr, dtype=dtype)

        with pytest.raises(TypeError, match=msg):
            pd.Series(arr).astype(dtype)
@pytest.mark.parametrize("in_series", [True, False])
def test_to_numpy_na_nan(in_series):
    """``to_numpy`` fills missing slots with ``na_value`` for each target dtype."""
    a = pd.array([0, 1, None], dtype="Int64")
    if in_series:
        a = pd.Series(a)

    # (target dtype, na_value, expected values)
    cases = [
        ("float64", np.nan, [0.0, 1.0, np.nan]),
        ("int64", -1, [0, 1, -1]),
        ("bool", False, [False, True, False]),
    ]
    for target, fill, values in cases:
        result = a.to_numpy(dtype=target, na_value=fill)
        expected = np.array(values, dtype=target)
        tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("in_series", [True, False])
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
def test_to_numpy_dtype(dtype, in_series):
    """NA-free data converts to the requested numpy dtype."""
    masked = pd.array([0, 1], dtype="Int64")
    source = pd.Series(masked) if in_series else masked

    converted = source.to_numpy(dtype=dtype)
    tm.assert_numpy_array_equal(converted, np.array([0, 1], dtype=dtype))
@pytest.mark.parametrize("dtype", ["int64", "bool"])
def test_to_numpy_na_raises(dtype):
    """Without ``na_value``, converting NA to a dtype that cannot hold it raises."""
    with_na = pd.array([0, 1, None], dtype="Int64")
    # The error message is expected to mention the requested dtype.
    with pytest.raises(ValueError, match=dtype):
        with_na.to_numpy(dtype=dtype)
def test_astype_str():
    """``astype(str)`` and ``astype("str")`` both give a unicode ndarray with ``<NA>``."""
    masked = pd.array([1, 2, None], dtype="Int64")
    expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")

    for target in (str, "str"):
        tm.assert_numpy_array_equal(masked.astype(target), expected)
def test_astype_boolean():
    """Non-zero integers become True, zero becomes False, NA stays missing."""
    # https://github.com/pandas-dev/pandas/issues/31102
    ints = pd.array([1, 0, -1, 2, None], dtype="Int64")
    bools = pd.array([True, False, True, True, None], dtype="boolean")
    tm.assert_extension_array_equal(ints.astype("boolean"), bools)

View File

@ -0,0 +1,203 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning")
def test_ufuncs_single_int(ufunc):
    """Unary ufuncs with integer results keep the Int64 dtype on arrays and Series."""
    a = pd.array([1, 2, -3, np.nan])

    result = ufunc(a)
    expected = pd.array(ufunc(a.astype(float)), dtype="Int64")
    tm.assert_extension_array_equal(result, expected)

    ser = pd.Series(a)
    result = ufunc(ser)
    expected = pd.Series(pd.array(ufunc(a.astype(float)), dtype="Int64"))
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    """Unary ufuncs with float results return a FloatingArray carrying the same mask."""
    a = pd.array([1, 2, -3, np.nan])

    # Negative input makes log/sqrt emit "invalid value" warnings; silence them.
    with np.errstate(invalid="ignore"):
        result = ufunc(a)
        expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
    tm.assert_extension_array_equal(result, expected)

    ser = pd.Series(a)
    with np.errstate(invalid="ignore"):
        result = ufunc(ser)
    tm.assert_series_equal(result, pd.Series(expected))
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_int(ufunc):
    """Binary ufuncs keep Int64 with a masked array on either side of the operation."""
    a = pd.array([1, 2, -3, np.nan])
    a_float = a.astype(float)
    arr = np.array([1, 2, 3, 4])

    # Each case pairs the masked operands with their float equivalents.
    cases = [
        # two IntegerArrays
        ((a, a), (a_float, a_float)),
        # IntegerArray with numpy array
        ((a, arr), (a_float, arr)),
        ((arr, a), (arr, a_float)),
        # IntegerArray with scalar
        ((a, 1), (a_float, 1)),
        ((1, a), (1, a_float)),
    ]
    for masked_operands, float_operands in cases:
        result = ufunc(*masked_operands)
        expected = pd.array(ufunc(*float_operands), dtype="Int64")
        tm.assert_extension_array_equal(result, expected)
def test_ufunc_binary_output():
    """A two-output ufunc (``np.modf``) returns a tuple of two masked arrays."""
    a = pd.array([1, 2, np.nan])

    result = np.modf(a)

    fractional, integral = np.modf(a.to_numpy(na_value=np.nan, dtype="float"))
    expected = (pd.array(fractional), pd.array(integral))

    assert isinstance(result, tuple)
    assert len(result) == 2

    for got, want in zip(result, expected):
        tm.assert_extension_array_equal(got, want)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
    """``np.add.reduce`` on a masked array equals ``sum(skipna=False)``.

    Despite the name, nothing here is expected to raise.
    """
    masked = pd.array(values)
    reduced = np.add.reduce(masked)
    tm.assert_almost_equal(reduced, masked.sum(skipna=False))
@pytest.mark.parametrize(
    "pandasmethname, kwargs",
    [
        ("var", {"ddof": 0}),
        ("var", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("kurtosis", {}),
        ("skew", {}),
        ("sem", {}),
    ],
)
def test_stat_method(pandasmethname, kwargs):
    """Statistical reductions skip NA: the result equals that of the NA-free data."""
    with_na = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64")
    without_na = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64")

    result = getattr(with_na, pandasmethname)(**kwargs)
    expected = getattr(without_na, pandasmethname)(**kwargs)
    assert expected == result
def test_value_counts_na():
    """``value_counts`` keeps an Int64 index, with and without the NA bucket."""
    arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")

    # NA counted as its own bucket
    with_na = arr.value_counts(dropna=False)
    ex_index = pd.Index([1, 2, pd.NA], dtype="Int64")
    assert ex_index.dtype == "Int64"
    expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64", name="count")
    tm.assert_series_equal(with_na, expected)

    # NA dropped
    without_na = arr.value_counts(dropna=True)
    expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
    assert expected.index.dtype == arr.dtype
    tm.assert_series_equal(without_na, expected)
def test_value_counts_empty():
    """``value_counts`` on an empty Int64 Series keeps Int64 for index and values."""
    # https://github.com/pandas-dev/pandas/issues/33317
    empty = pd.Series([], dtype="Int64")
    counts = empty.value_counts()

    empty_index = pd.Index([], dtype=empty.dtype)
    assert empty_index.dtype == empty.dtype
    expected = pd.Series([], index=empty_index, dtype="Int64", name="count")
    tm.assert_series_equal(counts, expected)
def test_value_counts_with_normalize():
    """Normalized ``value_counts`` yields Float64 proportions on an Int64 index."""
    # GH 33172
    ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
    proportions = ser.value_counts(normalize=True)

    counts = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion")
    expected = counts / 3
    assert expected.index.dtype == ser.dtype
    tm.assert_series_equal(proportions, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_integer_array_sum(skipna, min_count, any_int_ea_dtype):
    """``sum`` is 6 only when NA is skipped and ``min_count`` is met; otherwise NA."""
    arr = pd.array([1, 2, 3, None], dtype=any_int_ea_dtype)
    total = arr.sum(skipna=skipna, min_count=min_count)

    if not (skipna and min_count == 0):
        assert total is pd.NA
    else:
        assert total == 6
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_integer_array_min_max(skipna, method, any_int_ea_dtype):
    """``min``/``max`` return the extreme when skipping NA, else ``pd.NA``."""
    arr = pd.array([0, 1, None], dtype=any_int_ea_dtype)
    result = getattr(arr, method)(skipna=skipna)

    if not skipna:
        assert result is pd.NA
    else:
        assert result == (0 if method == "min" else 1)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_integer_array_prod(skipna, min_count, any_int_ea_dtype):
    """``prod`` is 2 only when NA is skipped and ``min_count`` is met; otherwise NA."""
    arr = pd.array([1, 2, None], dtype=any_int_ea_dtype)
    product = arr.prod(skipna=skipna, min_count=min_count)

    if not (skipna and min_count == 0):
        assert product is pd.NA
    else:
        assert product == 2
@pytest.mark.parametrize(
    "values, expected", [([1, 2, 3], 6), ([1, 2, 3, None], 6), ([None], 0)]
)
def test_integer_array_numpy_sum(values, expected):
    """``np.sum`` on an Int64 array ignores missing values."""
    masked = pd.array(values, dtype="Int64")
    assert np.sum(masked) == expected
@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    """DataFrame reductions on an Int64 column return numpy integers, not floats.

    Parameters
    ----------
    op : str
        Name of the DataFrame reduction method under test.
    """
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # Dispatch on ``op``; calling ``df.max()`` unconditionally left the other
    # parametrized reductions untested.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)
# TODO(jreback) - these need testing / are broken
# shift
# set_index (destroys type)

View File

@ -0,0 +1,19 @@
import pandas as pd
import pandas._testing as tm
def test_array_setitem_nullable_boolean_mask():
    """``Series.where`` with a nullable boolean condition masks entries with NA."""
    # GH 31446
    ser = pd.Series([1, 2], dtype="Int64")
    keep = ser > 1
    masked = ser.where(keep)
    tm.assert_series_equal(masked, pd.Series([pd.NA, 2], dtype="Int64"))
def test_array_setitem():
    """Assigning through a nullable boolean mask updates the selected elements."""
    # GH 31446
    arr = pd.Series([1, 2], dtype="Int64").array
    selector = arr > 1
    arr[selector] = 1

    tm.assert_extension_array_equal(arr, pd.array([1, 1], dtype="Int64"))

View File

@ -0,0 +1,125 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
array,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "op, expected",
    [
        ["sum", np.int64(3)],
        ["prod", np.int64(2)],
        ["min", np.int64(1)],
        ["max", np.int64(2)],
        ["mean", np.float64(1.5)],
        ["median", np.float64(1.5)],
        ["var", np.float64(0.5)],
        ["std", np.float64(0.5**0.5)],
        ["skew", pd.NA],
        ["kurt", pd.NA],
        ["any", True],
        ["all", True],
    ],
)
def test_series_reductions(op, expected):
    """Each reduction of an Int64 Series ``[1, 2]`` returns the expected scalar."""
    ser = Series([1, 2], dtype="Int64")
    reduce = getattr(ser, op)
    tm.assert_equal(reduce(), expected)
@pytest.mark.parametrize(
"op, expected",
[
["sum", Series([3], index=["a"], dtype="Int64")],
["prod", Series([2], index=["a"], dtype="Int64")],
["min", Series([1], index=["a"], dtype="Int64")],
["max", Series([2], index=["a"], dtype="Int64")],
["mean", Series([1.5], index=["a"], dtype="Float64")],
["median", Series([1.5], index=["a"], dtype="Float64")],
["var", Series([0.5], index=["a"], dtype="Float64")],
["std", Series([0.5**0.5], index=["a"], dtype="Float64")],
["skew", Series([pd.NA], index=["a"], dtype="Float64")],
["kurt", Series([pd.NA], index=["a"], dtype="Float64")],
["any", Series([True], index=["a"], dtype="boolean")],
["all", Series([True], index=["a"], dtype="boolean")],
],
)
def test_dataframe_reductions(op, expected):
df = DataFrame({"a": array([1, 2], dtype="Int64")})
result = getattr(df, op)()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"op, expected",
[
["sum", array([1, 3], dtype="Int64")],
["prod", array([1, 3], dtype="Int64")],
["min", array([1, 3], dtype="Int64")],
["max", array([1, 3], dtype="Int64")],
["mean", array([1, 3], dtype="Float64")],
["median", array([1, 3], dtype="Float64")],
["var", array([pd.NA], dtype="Float64")],
["std", array([pd.NA], dtype="Float64")],
["skew", array([pd.NA], dtype="Float64")],
["any", array([True, True], dtype="boolean")],
["all", array([True, True], dtype="boolean")],
],
)
def test_groupby_reductions(op, expected):
df = DataFrame(
{
"A": ["a", "b", "b"],
"B": array([1, None, 3], dtype="Int64"),
}
)
result = getattr(df.groupby("A"), op)()
expected = DataFrame(expected, index=pd.Index(["a", "b"], name="A"), columns=["B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "op, expected",
    [
        ["sum", Series([4, 4], index=["B", "C"], dtype="Float64")],
        ["prod", Series([3, 3], index=["B", "C"], dtype="Float64")],
        ["min", Series([1, 1], index=["B", "C"], dtype="Float64")],
        ["max", Series([3, 3], index=["B", "C"], dtype="Float64")],
        ["mean", Series([2, 2], index=["B", "C"], dtype="Float64")],
        ["median", Series([2, 2], index=["B", "C"], dtype="Float64")],
        ["var", Series([2, 2], index=["B", "C"], dtype="Float64")],
        ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")],
        ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")],
        ["kurt", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")],
        ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
        ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
    ],
)
def test_mixed_reductions(op, expected, using_infer_string):
    """Reductions on a frame mixing object, float and Int64 columns."""
    is_logical = op in ["any", "all"]
    if is_logical and using_infer_string:
        expected = expected.astype("bool")

    df = DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": array([1, None, 3], dtype="Int64"),
        }
    )

    # series
    result = getattr(df.C, op)()
    tm.assert_equal(result, expected["C"])

    # frame
    # any/all run over every column; the rest are restricted to numeric ones.
    reduce_kwargs = {} if is_logical else {"numeric_only": True}
    result = getattr(df, op)(**reduce_kwargs)
    tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,67 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
def test_dtypes(dtype):
    """The dtype's numpy kind matches its signedness and it has a name."""
    # smoke tests on auto dtype construction
    numpy_kind = np.dtype(dtype.type).kind
    if not dtype.is_signed_integer:
        assert numpy_kind == "u"
    else:
        assert numpy_kind == "i"
    assert dtype.name is not None
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (Int8Dtype(), "Int8Dtype()"),
        (Int16Dtype(), "Int16Dtype()"),
        (Int32Dtype(), "Int32Dtype()"),
        (Int64Dtype(), "Int64Dtype()"),
        (UInt8Dtype(), "UInt8Dtype()"),
        (UInt16Dtype(), "UInt16Dtype()"),
        (UInt32Dtype(), "UInt32Dtype()"),
        (UInt64Dtype(), "UInt64Dtype()"),
    ],
)
def test_repr_dtype(dtype, expected):
    """``repr`` of a masked integer dtype is its class name followed by ``()``."""
    rendered = repr(dtype)
    assert rendered == expected
def test_repr_array():
    """A short IntegerArray repr shows the values, length and dtype."""
    rendered = repr(pd.array([1, None, 3]))
    assert rendered == "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64"
def test_repr_array_long():
    """A long IntegerArray repr is truncated with ``...`` and column-aligned."""
    data = pd.array([1, 2, None] * 1000)
    # Each element is right-aligned to the width of "<NA>", so the single
    # digits need padding; the previous literal had that padding collapsed
    # to one space and so did not reflect the aligned layout.
    expected = (
        "<IntegerArray>\n"
        "[   1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,\n"
        " ...\n"
        " <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>]\n"
        "Length: 3000, dtype: Int64"
    )
    result = repr(data)
    assert result == expected
def test_frame_repr(data_missing):
    """A DataFrame holding masked data renders missing values as ``<NA>``.

    ``data_missing`` is a fixture supplied elsewhere; the expected text
    presumes it holds ``[NA, 1]`` (verify against the conftest).
    """
    df = pd.DataFrame({"A": data_missing})
    result = repr(df)
    # Columns are right-aligned to the width of "<NA>" with a two-space gap
    # after the index; the previous literal had that padding collapsed.
    expected = "      A\n0  <NA>\n1     1"
    assert result == expected

View File

@ -0,0 +1,28 @@
import pytest
from pandas import (
Categorical,
CategoricalDtype,
Index,
IntervalIndex,
)
import pandas._testing as tm
class TestAstype:
    """``astype`` tests for interval data."""

    @pytest.mark.parametrize("ordered", [True, False])
    def test_astype_categorical_retains_ordered(self, ordered):
        """Casting to a CategoricalDtype keeps the requested ``ordered`` flag."""
        index = IntervalIndex.from_breaks(range(5))
        interval_arr = index._data
        cat_dtype = CategoricalDtype(None, ordered=ordered)

        expected_cat = Categorical(list(interval_arr), ordered=ordered)
        result = interval_arr.astype(cat_dtype)
        assert result.ordered is ordered
        tm.assert_categorical_equal(result, expected_cat)

        # test IntervalIndex.astype while we're at it.
        result = index.astype(cat_dtype)
        tm.assert_index_equal(result, Index(expected_cat))

View File

@ -0,0 +1,13 @@
from pandas.core.arrays import IntervalArray
def test_repr():
    """An IntervalArray repr lists the intervals, the length and the dtype."""
    # GH#25022
    intervals = IntervalArray.from_tuples([(0, 1), (1, 2)])
    expected_lines = [
        "<IntervalArray>",
        "[(0, 1], (1, 2]]",
        "Length: 2, dtype: interval[int64, right]",
    ]
    assert repr(intervals) == "\n".join(expected_lines)

View File

@ -0,0 +1,231 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Index,
Interval,
IntervalIndex,
Timedelta,
Timestamp,
date_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(
    params=[
        (Index([0, 2, 4]), Index([1, 3, 5])),
        (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])),
        (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)),
        (date_range("20170101", periods=3), date_range("20170102", periods=3)),
        (
            date_range("20170101", periods=3, tz="US/Eastern"),
            date_range("20170102", periods=3, tz="US/Eastern"),
        ),
    ],
    ids=lambda pair: str(pair[0].dtype),
)
def left_right_dtypes(request):
    """
    Parametrized ``(left, right)`` Index pairs covering several dtypes,
    used for building an IntervalArray
    """
    bounds = request.param
    return bounds
class TestAttributes:
    """Attribute tests shared by IntervalArray and IntervalIndex."""

    @pytest.mark.parametrize(
        "left, right",
        [
            (0, 1),
            (Timedelta("0 days"), Timedelta("1 day")),
            (Timestamp("2018-01-01"), Timestamp("2018-01-02")),
            (
                Timestamp("2018-01-01", tz="US/Eastern"),
                Timestamp("2018-01-02", tz="US/Eastern"),
            ),
        ],
    )
    @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex])
    def test_is_empty(self, constructor, left, right, closed):
        """A zero-width interval is empty unless closed on both sides."""
        # GH27219
        tuples = [(left, left), (left, right), np.nan]
        intervals = constructor.from_tuples(tuples, closed=closed)

        # Expected: only the zero-width entry can be empty; the regular
        # interval and the missing entry are not.
        zero_width_is_empty = closed != "both"
        expected = np.array([zero_width_is_empty, False, False])
        tm.assert_numpy_array_equal(intervals.is_empty, expected)
class TestMethods:
    """Tests for IntervalArray methods: ``set_closed``, ``_where`` and ``shift``."""

    @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
    def test_set_closed(self, closed, new_closed):
        """``set_closed`` equals rebuilding the array with the new closed side."""
        # GH 21670
        original = IntervalArray.from_breaks(range(10), closed=closed)
        rebuilt = IntervalArray.from_breaks(range(10), closed=new_closed)
        tm.assert_extension_array_equal(original.set_closed(new_closed), rebuilt)

    @pytest.mark.parametrize(
        "other",
        [
            Interval(0, 1, closed="right"),
            IntervalArray.from_breaks([1, 2, 3, 4], closed="right"),
        ],
    )
    def test_where_raises(self, other):
        """A mismatched ``closed`` raises in the array method but coerces in Series."""
        # GH#45768 The IntervalArray methods raises; the Series method coerces
        left_closed = IntervalArray.from_breaks([1, 2, 3, 4], closed="left")
        ser = pd.Series(left_closed)
        mask = np.array([True, False, True])

        match = "'value.closed' is 'right', expected 'left'."
        with pytest.raises(ValueError, match=match):
            ser.array._where(mask, other)

        coerced = ser.where(mask, other=other)
        expected = ser.astype(object).where(mask, other)
        tm.assert_series_equal(coerced, expected)

    def test_shift(self):
        """``shift`` fills with NaN intervals and rejects a NaT fill value."""
        # https://github.com/pandas-dev/pandas/issues/31495, GH#22428, GH#31502
        arr = IntervalArray.from_breaks([1, 2, 3])
        shifted = arr.shift()

        # int -> float
        expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)])
        tm.assert_interval_array_equal(shifted, expected)

        msg = "can only insert Interval objects and NA into an IntervalArray"
        with pytest.raises(TypeError, match=msg):
            arr.shift(1, fill_value=pd.NaT)

    def test_shift_datetime(self):
        """``shift`` on datetime intervals matches ``take`` with fill."""
        # GH#31502, GH#31504
        arr = IntervalArray.from_breaks(date_range("2000", periods=4))

        # (periods, equivalent take indexer)
        for periods, indexer in ((2, [-1, -1, 0]), (-1, [1, 2, -1])):
            shifted = arr.shift(periods)
            expected = arr.take(indexer, allow_fill=True)
            tm.assert_interval_array_equal(shifted, expected)

        msg = "can only insert Interval objects and NA into an IntervalArray"
        with pytest.raises(TypeError, match=msg):
            arr.shift(1, fill_value=np.timedelta64("NaT", "ns"))
class TestSetitem:
    """Tests for ``IntervalArray.__setitem__``."""

    def test_set_na(self, left_right_dtypes):
        """Setting NA works or raises depending on the subtype kind."""
        left, right = (side.copy(deep=True) for side in left_right_dtypes)
        result = IntervalArray.from_arrays(left, right)
        subtype_kind = result.dtype.subtype.kind

        if subtype_kind not in ["m", "M"]:
            msg = "'value' should be an interval type, got <.*NaTType'> instead."
            with pytest.raises(TypeError, match=msg):
                result[0] = pd.NaT

        if subtype_kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            # GH#45484 TypeError, not ValueError, matches what we get with
            # non-NA un-holdable value.
            with pytest.raises(TypeError, match=msg):
                result[0] = np.nan
            return

        result[0] = np.nan

        na_left = Index([left._na_value, *left[1:]])
        na_right = Index([right._na_value, *right[1:]])
        expected = IntervalArray.from_arrays(na_left, na_right)

        tm.assert_extension_array_equal(result, expected)

    def test_setitem_mismatched_closed(self):
        """Assigning values with a different ``closed`` raises ValueError."""
        target = IntervalArray.from_breaks(range(4))
        snapshot = target.copy()
        both_closed = target.set_closed("both")

        msg = "'value.closed' is 'both', expected 'right'"
        with pytest.raises(ValueError, match=msg):
            target[0] = both_closed[0]

        with pytest.raises(ValueError, match=msg):
            target[:1] = both_closed[:1]

        with pytest.raises(ValueError, match=msg):
            target[:0] = both_closed[:0]

        with pytest.raises(ValueError, match=msg):
            target[:] = both_closed[::-1]

        with pytest.raises(ValueError, match=msg):
            target[:] = list(both_closed[::-1])

        with pytest.raises(ValueError, match=msg):
            target[:] = both_closed[::-1].astype(object)

        with pytest.raises(ValueError, match=msg):
            target[:] = both_closed[::-1].astype("category")

        # empty list should be no-op
        target[:0] = []
        tm.assert_interval_array_equal(target, snapshot)
class TestReductions:
    """Tests for ``min``/``max`` reductions on interval containers."""

    def test_min_max_invalid_axis(self, left_right_dtypes):
        """Out-of-range axes raise ValueError; a string axis raises TypeError."""
        left, right = left_right_dtypes
        left = left.copy(deep=True)
        right = right.copy(deep=True)
        arr = IntervalArray.from_arrays(left, right)

        msg = "`axis` must be fewer than the number of dimensions"
        for axis in [-2, 1]:
            with pytest.raises(ValueError, match=msg):
                arr.min(axis=axis)
            with pytest.raises(ValueError, match=msg):
                arr.max(axis=axis)

        msg = "'>=' not supported between"
        with pytest.raises(TypeError, match=msg):
            arr.min(axis="foo")
        with pytest.raises(TypeError, match=msg):
            arr.max(axis="foo")

    def test_min_max(self, left_right_dtypes, index_or_series_or_array):
        """``min``/``max`` return the first/last interval of the sorted data.

        The array is shuffled before reducing so the result cannot come from
        positional order. Exact result types are checked with identity
        (``is``) rather than ``==``, the idiomatic exact-type comparison.
        """
        # GH#44746
        left, right = left_right_dtypes
        left = left.copy(deep=True)
        right = right.copy(deep=True)
        arr = IntervalArray.from_arrays(left, right)

        # The expected results below are only valid if monotonic
        assert left.is_monotonic_increasing
        assert Index(arr).is_monotonic_increasing

        MIN = arr[0]
        MAX = arr[-1]

        # Fixed seed keeps the shuffle reproducible between runs.
        indexer = np.arange(len(arr))
        np.random.default_rng(2).shuffle(indexer)
        arr = arr.take(indexer)

        arr_na = arr.insert(2, np.nan)

        arr = index_or_series_or_array(arr)
        arr_na = index_or_series_or_array(arr_na)

        # Without missing values ``skipna`` must not change the answer.
        for skipna in [True, False]:
            res = arr.min(skipna=skipna)
            assert res == MIN
            assert type(res) is type(MIN)

            res = arr.max(skipna=skipna)
            assert res == MAX
            assert type(res) is type(MAX)

        # With a missing value and skipna=False the reduction yields NaN.
        res = arr_na.min(skipna=False)
        assert np.isnan(res)
        res = arr_na.max(skipna=False)
        assert np.isnan(res)

        # With skipna=True the missing value is ignored.
        res = arr_na.min(skipna=True)
        assert res == MIN
        assert type(res) is type(MIN)
        res = arr_na.max(skipna=True)
        assert res == MAX
        assert type(res) is type(MAX)

View File

@ -0,0 +1,160 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
def test_arrow_extension_type():
    """ArrowIntervalType equality and hashing depend on ``closed``."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    left_a = ArrowIntervalType(pa.int64(), "left")
    left_b = ArrowIntervalType(pa.int64(), "left")
    right_closed = ArrowIntervalType(pa.int64(), "right")

    assert left_a.closed == "left"
    assert left_a == left_b
    assert left_a != right_closed
    assert hash(left_a) == hash(left_b)
    assert hash(left_a) != hash(right_closed)
def test_arrow_array():
    """``pa.array`` on an IntervalArray yields an ArrowIntervalType array."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    intervals = pd.interval_range(1, 5, freq=1).array

    converted = pa.array(intervals)
    arrow_type = converted.type
    assert isinstance(arrow_type, ArrowIntervalType)
    assert arrow_type.closed == intervals.closed
    assert arrow_type.subtype == pa.int64()

    storage = converted.storage
    assert storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64"))
    assert storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64"))

    expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)])
    assert converted.storage.equals(expected)

    # convert to its storage type
    as_storage = pa.array(intervals, type=expected.type)
    assert as_storage.equals(expected)

    # unsupported conversions
    with pytest.raises(TypeError, match="Not supported to convert IntervalArray"):
        pa.array(intervals, type="float64")

    with pytest.raises(TypeError, match="Not supported to convert IntervalArray"):
        pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left"))
def test_arrow_array_missing():
    """Missing intervals become nulls in the fields and in the struct."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0])
    arr[1] = None

    converted = pa.array(arr)
    assert isinstance(converted.type, ArrowIntervalType)
    assert converted.type.closed == arr.closed
    assert converted.type.subtype == pa.float64()

    # fields have missing values (not NaN)
    expected_left = pa.array([0.0, None, 2.0], type="float64")
    expected_right = pa.array([1.0, None, 3.0], type="float64")
    assert converted.storage.field("left").equals(expected_left)
    assert converted.storage.field("right").equals(expected_right)

    # structarray itself also has missing values on the array level
    records = [
        {"left": 0.0, "right": 1.0},
        {"left": None, "right": None},
        {"left": 2.0, "right": 3.0},
    ]
    null_mask = np.array([False, True, False])
    expected = pa.StructArray.from_pandas(records, mask=null_mask)
    assert converted.storage.equals(expected)
@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
    "breaks",
    [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")],
    ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip(breaks):
    """Interval columns survive DataFrame -> pyarrow Table -> DataFrame."""
    pa = pytest.importorskip("pyarrow")

    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType

    intervals = IntervalArray.from_breaks(breaks)
    intervals[1] = None
    df = pd.DataFrame({"a": intervals})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowIntervalType)
    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(roundtripped, df)

    doubled = pa.concat_tables([table, table])
    roundtripped = doubled.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(roundtripped, expected)

    # GH#41040
    column_type = table.column(0).type
    no_chunks = pa.chunked_array([], type=column_type)
    table = pa.table([no_chunks], schema=table.schema)
    roundtripped = table.to_pandas()
    tm.assert_frame_equal(roundtripped, expected[0:0])
@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
    "breaks",
    [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")],
    ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip_without_metadata(breaks):
    """The roundtrip still gives IntervalDtype once schema metadata is dropped."""
    pa = pytest.importorskip("pyarrow")

    intervals = IntervalArray.from_breaks(breaks)
    intervals[1] = None
    df = pd.DataFrame({"a": intervals})

    # remove the metadata
    stripped = pa.table(df).replace_schema_metadata()
    assert stripped.schema.metadata is None

    roundtripped = stripped.to_pandas()
    assert isinstance(roundtripped["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(roundtripped, df)
def test_from_arrow_from_raw_struct_array():
    """``IntervalDtype.__from_arrow__`` accepts a plain struct array."""
    # in case pyarrow lost the Interval extension type (eg on parquet roundtrip
    # with datetime64[ns] subtype, see GH-45881), still allow conversion
    # from arrow to IntervalArray
    pa = pytest.importorskip("pyarrow")

    struct_arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}])
    dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither")

    breaks = np.array([0, 1, 2], dtype="int64")
    expected = IntervalArray.from_breaks(breaks, closed="neither")

    # Both a plain array and a chunked array must convert identically.
    from_array = dtype.__from_arrow__(struct_arr)
    tm.assert_extension_array_equal(from_array, expected)

    from_chunked = dtype.__from_arrow__(pa.chunked_array([struct_arr]))
    tm.assert_extension_array_equal(from_chunked, expected)

View File

@ -0,0 +1,93 @@
"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
import numpy as np
import pytest
from pandas import (
Interval,
IntervalIndex,
Timedelta,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(params=[IntervalArray, IntervalIndex])
def constructor(request):
    """
    Provide each interval container class (IntervalArray, IntervalIndex).
    """
    container_cls = request.param
    return container_cls
@pytest.fixture(
    params=[
        (Timedelta("0 days"), Timedelta("1 day")),
        (Timestamp("2018-01-01"), Timedelta("1 day")),
        (0, 1),
    ],
    ids=lambda x: type(x[0]).__name__,
)
def start_shift(request):
    """
    Provide a ``(start, shift)`` pair; endpoints are built as
    ``start + k * shift`` for timedelta, timestamp and integer data.
    """
    start_and_shift = request.param
    return start_and_shift
class TestOverlaps:
    """Tests for ``overlaps`` on IntervalArray and IntervalIndex."""

    def test_overlaps_interval(self, constructor, start_shift, closed, other_closed):
        """Element-wise overlap with a scalar Interval for every closed combo."""
        start, shift = start_shift
        scalar = Interval(start, start + 3 * shift, other_closed)

        # intervals: identical, nested, spanning, partial, adjacent, disjoint
        endpoints = [
            (start, start + 3 * shift),
            (start + shift, start + 2 * shift),
            (start - shift, start + 4 * shift),
            (start + 2 * shift, start + 4 * shift),
            (start + 3 * shift, start + 4 * shift),
            (start + 4 * shift, start + 5 * shift),
        ]
        container = constructor.from_tuples(endpoints, closed)

        # Adjacent intervals overlap only when both share the touching point.
        touching = scalar.closed_right and container.closed_left
        expected = np.array([True, True, True, True, touching, False])
        tm.assert_numpy_array_equal(container.overlaps(scalar), expected)

    @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex])
    def test_overlaps_interval_container(self, constructor, other_constructor):
        """Container-vs-container overlap raises NotImplementedError."""
        # TODO: modify this test when implemented
        container = constructor.from_breaks(range(5))
        other = other_constructor.from_breaks(range(5))
        with pytest.raises(NotImplementedError, match="^$"):
            container.overlaps(other)

    def test_overlaps_na(self, constructor, start_shift):
        """NA values are marked as False"""
        start, shift = start_shift
        scalar = Interval(start, start + shift)

        endpoints = [
            (start, start + shift),
            np.nan,
            (start + 2 * shift, start + 3 * shift),
        ]
        container = constructor.from_tuples(endpoints)

        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(container.overlaps(scalar), expected)

    @pytest.mark.parametrize(
        "other",
        [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")],
        ids=lambda x: type(x).__name__,
    )
    def test_overlaps_invalid_type(self, constructor, other):
        """Non-interval arguments raise TypeError naming the offending type."""
        container = constructor.from_breaks(range(5))
        msg = f"`other` must be Interval-like, got {type(other).__name__}"
        with pytest.raises(TypeError, match=msg):
            container.overlaps(other)

View File

@ -0,0 +1,248 @@
from __future__ import annotations
from typing import Any
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# `arrays` and `scalars` are built as parallel lists: the `data` fixture zips
# them, so arrays[i] is always tested together with scalars[i].
# integer dtypes: one masked array per dtype, each paired with the scalar 2
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
scalars: list[Any] = [2] * len(arrays)
# floating dtypes
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
# NOTE(review): two scalars are appended here, which assumes
# tm.FLOAT_EA_DTYPES has exactly two entries; zip() would silently drop
# trailing arrays otherwise - confirm.
scalars += [0.2, 0.2]
# boolean
arrays += [pd.array([True, False, True, None], dtype="boolean")]
scalars += [False]
@pytest.fixture(params=zip(arrays, scalars), ids=[arr.dtype.name for arr in arrays])
def data(request):
    """Provide one ``(array, scalar)`` pair per masked dtype.

    The pairs drive the scalar/array and Series/DataFrame equivalence tests.
    """
    array_and_scalar = request.param
    return array_and_scalar
def check_skip(data, op_name):
    """Skip the current test for boolean data when the op name contains "sub"."""
    is_boolean = isinstance(data.dtype, pd.BooleanDtype)
    if is_boolean and "sub" in op_name:
        pytest.skip("subtract not implemented for boolean")
def is_bool_not_implemented(data, op_name):
    """Return True for pow/truediv/floordiv (plain or reflected) on bool data."""
    # match non-masked behavior
    if data.dtype.kind != "b":
        return False
    # Drop the dunder underscores, then any leading "r" of the reflected name.
    bare_name = op_name.strip("_").lstrip("r")
    return bare_name in ["pow", "truediv", "floordiv"]
# Test equivalence of scalars, numpy arrays with array ops
# -----------------------------------------------------------------------------
def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
    """An op with a scalar equals the op with a same-length array of it."""
    arr, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(arr, all_arithmetic_operators)

    scalar_array = pd.array([scalar] * len(arr), dtype=arr.dtype)
    not_implemented = is_bool_not_implemented(arr, all_arithmetic_operators)

    # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype))
    # Try both the Python scalar and the dtype's own scalar type.
    for candidate in (scalar, arr.dtype.type(scalar)):
        if not not_implemented:
            result = op(arr, candidate)
            expected = op(arr, scalar_array)
            tm.assert_extension_array_equal(result, expected)
            continue

        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            op(arr, candidate)
        with pytest.raises(NotImplementedError, match=msg):
            op(arr, scalar_array)
def test_array_NA(data, all_arithmetic_operators):
    """An op with ``pd.NA`` equals the op with an all-NA array, mask untouched."""
    arr, _ = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(arr, all_arithmetic_operators)

    na_array = pd.array([pd.NA] * len(arr), dtype=arr.dtype)
    mask_before = arr._mask.copy()

    if is_bool_not_implemented(arr, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        with pytest.raises(NotImplementedError, match=msg):
            op(arr, pd.NA)
        # GH#45421 check op doesn't alter data._mask inplace
        tm.assert_numpy_array_equal(mask_before, arr._mask)
        return

    result = op(arr, pd.NA)
    # GH#45421 check op doesn't alter data._mask inplace
    tm.assert_numpy_array_equal(mask_before, arr._mask)

    expected = op(arr, na_array)
    tm.assert_numpy_array_equal(mask_before, arr._mask)

    tm.assert_extension_array_equal(result, expected)
def test_numpy_array_equivalence(data, all_arithmetic_operators):
    """An op with a numpy array equals the op with the masked equivalent."""
    arr, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(arr, all_arithmetic_operators)

    as_numpy = np.array([scalar] * len(arr), dtype=arr.dtype.numpy_dtype)
    as_masked = pd.array(as_numpy, dtype=arr.dtype)

    if is_bool_not_implemented(arr, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        for operand in (as_numpy, as_masked):
            with pytest.raises(NotImplementedError, match=msg):
                op(arr, operand)
        return

    result = op(arr, as_numpy)
    expected = op(arr, as_masked)
    tm.assert_extension_array_equal(result, expected)
# Test equivalence with Series and DataFrame ops
# -----------------------------------------------------------------------------
def test_frame(data, all_arithmetic_operators):
    """A DataFrame op with a scalar equals wrapping the array-level result."""
    arr, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(arr, all_arithmetic_operators)

    # DataFrame with scalar
    frame = pd.DataFrame({"A": arr})

    if is_bool_not_implemented(arr, all_arithmetic_operators):
        msg = "operator '.*' not implemented for bool dtypes"
        for left in (frame, arr):
            with pytest.raises(NotImplementedError, match=msg):
                op(left, scalar)
        return

    result = op(frame, scalar)
    expected = pd.DataFrame({"A": op(arr, scalar)})
    tm.assert_frame_equal(result, expected)
def test_series(data, all_arithmetic_operators):
    """Series ops match array ops for scalar, ndarray, masked and Series operands."""
    arr, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)
    check_skip(arr, all_arithmetic_operators)

    ser = pd.Series(arr)
    repeated = [scalar] * len(arr)

    operands = [
        scalar,
        np.array(repeated, dtype=arr.dtype.numpy_dtype),
        pd.array(repeated, dtype=arr.dtype),
        pd.Series(repeated, dtype=arr.dtype),
    ]
    not_implemented = is_bool_not_implemented(arr, all_arithmetic_operators)

    for operand in operands:
        if not_implemented:
            msg = "operator '.*' not implemented for bool dtypes"
            with pytest.raises(NotImplementedError, match=msg):
                op(ser, operand)
            continue

        result = op(ser, operand)
        expected = pd.Series(op(arr, operand))
        tm.assert_series_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_object(data, all_arithmetic_operators):
    """The dunder op defers on a DataFrame and rejects 2-d ndarrays."""
    arr, _ = data
    bound_op = getattr(arr, all_arithmetic_operators)

    # 2d -> return NotImplemented
    frame = pd.DataFrame({"A": arr})
    assert bound_op(frame) is NotImplemented

    msg = r"can only perform ops with 1-d structures"
    two_dim = np.arange(len(arr)).reshape(-1, len(arr))
    with pytest.raises(NotImplementedError, match=msg):
        bound_op(two_dim)
def test_error_len_mismatch(data, all_arithmetic_operators):
    """Operating with a list-like of a different length raises."""
    # operating with a list-like with non-matching length raises
    arr, scalar = data
    op = tm.get_op_from_name(all_arithmetic_operators)

    too_short = [scalar] * (len(arr) - 1)

    is_bool = arr.dtype.kind == "b"
    if is_bool and all_arithmetic_operators.strip("_") in ["sub", "rsub"]:
        err = TypeError
        msg = (
            r"numpy boolean subtract, the `\-` operator, is not supported, use "
            r"the bitwise_xor, the `\^` operator, or the logical_xor function instead"
        )
    elif is_bool_not_implemented(arr, all_arithmetic_operators):
        err = NotImplementedError
        msg = "operator '.*' not implemented for bool dtypes"
    else:
        err = ValueError
        msg = "|".join(
            [
                r"operands could not be broadcast together with shapes \(3,\) \(4,\)",
                r"operands could not be broadcast together with shapes \(4,\) \(3,\)",
            ]
        )

    # Check both a plain list and its ndarray form, at array and Series level.
    for operand in (too_short, np.array(too_short)):
        with pytest.raises(err, match=msg):
            op(arr, operand)

        ser = pd.Series(arr)
        with pytest.raises(err, match=msg):
            op(ser, operand)
@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"])
def test_unary_op_does_not_propagate_mask(data, op):
    """Mutating the input after a unary op must not change the op's result."""
    # https://github.com/pandas-dev/pandas/issues/39943
    arr, _ = data
    ser = pd.Series(arr)

    if op == "__invert__" and arr.dtype.kind == "f":
        # we follow numpy in raising
        msg = "ufunc 'invert' not supported for the input types"
        with pytest.raises(TypeError, match=msg):
            getattr(ser, op)()
        with pytest.raises(TypeError, match=msg):
            getattr(arr, op)()
        with pytest.raises(TypeError, match=msg):
            # Check that this is still the numpy behavior
            getattr(arr._data, op)()

        return

    result = getattr(ser, op)()
    snapshot = result.copy(deep=True)
    ser[0] = None
    tm.assert_series_equal(result, snapshot)

View File

@ -0,0 +1,209 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# Module-wide marker: ignore the "Passing a BlockManager to DataFrame"
# DeprecationWarning in every test of this file.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
# Skip the whole module when pyarrow cannot be imported.
pa = pytest.importorskip("pyarrow")
# NOTE(review): imported after importorskip, presumably because this helper
# module needs pyarrow at import time - confirm.
from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask
# One masked array per integer / float extension dtype plus a boolean array,
# each ending in a missing value; these parametrize the `data` fixture.
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
@pytest.fixture(params=arrays, ids=[arr.dtype.name for arr in arrays])
def data(request):
    """
    Provide one masked array per integer, float and boolean extension dtype.
    """
    masked_array = request.param
    return masked_array
def test_arrow_array(data):
    """``pa.array`` on a masked array matches building from objects with None."""
    converted = pa.array(data)

    as_objects = data.to_numpy(object, na_value=None)
    arrow_type = pa.from_numpy_dtype(data.dtype.numpy_dtype)
    expected = pa.array(as_objects, type=arrow_type)

    assert converted.equals(expected)
def test_arrow_roundtrip(data):
    """A masked column keeps its dtype through DataFrame -> Table -> DataFrame."""
    frame = pd.DataFrame({"a": data})
    table = pa.table(frame)
    assert table.field("a").type == str(data.dtype.numpy_dtype)

    roundtripped = table.to_pandas()
    assert roundtripped["a"].dtype == data.dtype
    tm.assert_frame_equal(roundtripped, frame)
def test_dataframe_from_arrow_types_mapper():
    """``to_pandas(types_mapper=...)`` applies the mapped masked dtypes."""

    def types_mapper(arrow_type):
        # Booleans -> BooleanDtype, every integer width -> Int64Dtype,
        # anything else -> None.
        if pa.types.is_boolean(arrow_type):
            return pd.BooleanDtype()
        if pa.types.is_integer(arrow_type):
            return pd.Int64Dtype()
        return None

    columns = [
        pa.array([True, None, False], type=pa.bool_()),
        pa.array([1, None, 2], type=pa.int64()),
        pa.array([-1, 0, 7], type=pa.int8()),
    ]
    record_batch = pa.RecordBatch.from_arrays(
        columns, ["bools", "ints", "small_ints"]
    )
    result = record_batch.to_pandas(types_mapper=types_mapper)

    expected = pd.DataFrame(
        {
            "bools": pd.Series([True, None, False], dtype="boolean"),
            "ints": pd.Series([1, None, 2], dtype="Int64"),
            "small_ints": pd.Series([-1, 0, 7], dtype="Int64"),
        }
    )
    tm.assert_frame_equal(result, expected)
def test_arrow_load_from_zero_chunks(data):
    """A table whose column has zero chunks converts to an empty masked column."""
    # GH-41040
    empty_frame = pd.DataFrame({"a": data[0:0]})
    table = pa.table(empty_frame)
    assert table.field("a").type == str(data.dtype.numpy_dtype)

    field_type = table.field("a").type
    no_chunks = pa.chunked_array([], type=field_type)
    table = pa.table([no_chunks], schema=table.schema)

    roundtripped = table.to_pandas()
    assert roundtripped["a"].dtype == data.dtype
    tm.assert_frame_equal(roundtripped, empty_frame)
def test_arrow_from_arrow_uint():
    """``UInt32Dtype.__from_arrow__`` accepts an int64 pyarrow array."""
    # https://github.com/pandas-dev/pandas/issues/31896
    # possible mismatch in types
    values = [1, 2, 3, 4, None]
    int64_array = pa.array(values, type="int64")

    result = pd.UInt32Dtype().__from_arrow__(int64_array)
    expected = pd.array(values, dtype="UInt32")

    tm.assert_extension_array_equal(result, expected)
def test_arrow_sliced(data):
    """A sliced table converts to the matching slice of the frame."""
    # https://github.com/pandas-dev/pandas/issues/38525
    with_missing = pd.DataFrame({"a": data})

    # no missing values
    without_missing = with_missing.fillna(data[0])

    for frame in (with_missing, without_missing):
        table = pa.table(frame)
        result = table.slice(2, None).to_pandas()
        expected = frame.iloc[2:].reset_index(drop=True)
        tm.assert_frame_equal(result, expected)
@pytest.fixture
def np_dtype_to_arrays(any_real_numpy_dtype):
    """
    Provide ``(numpy dtype, pyarrow array, expected values, expected mask)``
    built from the given real numpy dtype.
    """
    dtype = np.dtype(any_real_numpy_dtype)

    # None ensures the creation of a bitmask buffer.
    arrow_array = pa.array([0, 1, 2, None], type=pa.from_numpy_dtype(dtype))

    # Since masked Arrow buffer slots are not required to contain a specific
    # value, assert only the first three values of the created np.array
    expected_values = np.array([0, 1, 2], dtype=dtype)
    expected_mask = np.array([True, True, True, False])
    return dtype, arrow_array, expected_values, expected_mask
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Modifies the pyarrow buffer to contain padding and offset, which are
    considered valid buffers by pyarrow.

    Also tests empty pyarrow arrays with non empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
    # Baseline: convert the unmodified array. Only the first three values are
    # compared; np_expected holds three values and the fourth slot is masked
    # per mask_expected.
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # buffers()[0] is used as the mask buffer and buffers()[1] as the data
    # buffer when the arrays are rebuilt below.
    mask_buffer = pa_array.buffers()[0]
    data_buffer = pa_array.buffers()[1]
    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
    pa_array_trail = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_trail],
        offset=pa_array.offset,
    )
    pa_array_trail.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Add offset to the buffer.
    # One element's worth of zero bytes is prepended to the data, and the
    # array offset is bumped by one to skip it.
    offset = b"\x00" * (pa_array.type.bit_width // 8)
    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
    # 0x0E == 0b00001110. NOTE(review): assuming the validity bitmap is read
    # least-significant bit first, offset 1 makes bits 1-3 valid and bit 4
    # null, matching mask_expected - confirm.
    mask_buffer_offset = pa.py_buffer(b"\x0E")
    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer_offset, data_buffer_offset],
        offset=pa_array.offset + 1,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Empty array
    # length=0 over the original (non-empty) buffers; the name
    # pa_array_offset is reused here for the empty array.
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)

    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[mask_buffer, data_buffer],
        offset=pa_array.offset,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data[:3], np_expected_empty)
    tm.assert_numpy_array_equal(mask, mask_expected_empty)
@pytest.mark.parametrize(
    "arr", [pa.nulls(10), pa.chunked_array([pa.nulls(4), pa.nulls(6)])]
)
def test_from_arrow_null(data, arr):
    """A null-typed arrow array converts to an all-NA array of equal length."""
    converted = data.dtype.__from_arrow__(arr)
    assert converted.isna().all()
    assert len(converted) == 10
def test_from_arrow_type_error(data):
    """``__from_arrow__`` raises TypeError for a string-typed arrow array."""
    # ensure that __from_arrow__ returns a TypeError when getting a wrong
    # array type
    as_strings = pa.array(data).cast("string")
    dtype = data.dtype
    with pytest.raises(TypeError, match=None):
        # we don't test the exact error message, only the fact that it raises
        # a TypeError is relevant
        dtype.__from_arrow__(as_strings)

View File

@ -0,0 +1,74 @@
import numpy as np
import pytest
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import BaseMaskedArray
# Parameters for the `data` fixture: one masked integer array per dtype in
# tm.ALL_INT_EA_DTYPES ...
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
# ... plus one masked float array per dtype in tm.FLOAT_EA_DTYPES. The float
# values carry three decimals so that rounding to two decimals changes them.
arrays += [
    pd.array([0.141, -0.268, 5.895, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES
]
@pytest.fixture(params=arrays, ids=[arr.dtype.name for arr in arrays])
def data(request):
    """
    Provide one masked array per integer and floating extension dtype.
    """
    masked_array = request.param
    return masked_array
@pytest.fixture()
def numpy_dtype(data):
    """
    Provide the numpy dtype used to convert ``data``: ``float`` for integer
    data, otherwise the dtype's own scalar type.
    """
    # For integer dtype, the numpy conversion must be done to float
    return float if is_integer_dtype(data) else data.dtype.type
def test_round(data, numpy_dtype):
    """``round`` on a masked array matches ``np.round`` on its numpy form."""
    # First with no arguments, then with the decimals argument.
    for round_kwargs in ({}, {"decimals": 2}):
        result = data.round(**round_kwargs)

        as_numpy = data.to_numpy(dtype=numpy_dtype, na_value=None)
        expected = pd.array(np.round(as_numpy, **round_kwargs), dtype=data.dtype)

        tm.assert_extension_array_equal(result, expected)
def test_tolist(data):
    """``tolist`` returns the same elements as iterating the array."""
    from_iteration = list(data)
    tm.assert_equal(data.tolist(), from_iteration)
def test_to_numpy():
    """``to_numpy`` on a string-valued masked array puts ``pd.NA`` in masked slots."""
    # GH#56991

    class MyStringArray(BaseMaskedArray):
        dtype = pd.StringDtype()
        _dtype_cls = pd.StringDtype
        _internal_fill_value = pd.NA

    values = np.array(["a", "b", "c"])
    mask = np.array([False, True, False])
    arr = MyStringArray(values=values, mask=mask)

    expected = np.array(["a", pd.NA, "c"])
    tm.assert_numpy_array_equal(arr.to_numpy(), expected)

View File

@ -0,0 +1,60 @@
import re
import numpy as np
import pytest
import pandas as pd
class TestSetitemValidation:
    """Setting an invalid scalar into a masked array must raise TypeError."""

    def _check_setitem_invalid(self, arr, invalid):
        """Assert scalar, slice and list-indexed assignment all reject ``invalid``."""
        msg = re.escape(f"Invalid value '{str(invalid)}' for dtype {arr.dtype}")

        # scalar position, full slice, list indexer
        for key in (0, slice(None), [0]):
            with pytest.raises(TypeError, match=msg):
                arr[key] = invalid

        # FIXME: don't leave commented-out
        # with pytest.raises(TypeError):
        #    arr[[0]] = [invalid]

        # with pytest.raises(TypeError):
        #    arr[[0]] = np.array([invalid], dtype=object)

        # Series non-coercion, behavior subject to change
        ser = pd.Series(arr)
        with pytest.raises(TypeError, match=msg):
            ser[0] = invalid
            # TODO: so, so many other variants of this...

    # Scalars that are invalid for boolean, integer and floating arrays alike.
    _invalid_scalars = [
        1 + 2j,
        "True",
        "1",
        "1.0",
        pd.NaT,
        np.datetime64("NaT"),
        np.timedelta64("NaT"),
    ]

    @pytest.mark.parametrize(
        "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)]
    )
    def test_setitem_validation_scalar_bool(self, invalid):
        """Boolean arrays additionally reject numeric scalars."""
        bool_arr = pd.array([True, False, None], dtype="boolean")
        self._check_setitem_invalid(bool_arr, invalid)

    @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
    def test_setitem_validation_scalar_int(self, invalid, any_int_ea_dtype):
        """Integer arrays additionally reject bools and non-integral floats."""
        int_arr = pd.array([1, 2, None], dtype=any_int_ea_dtype)
        self._check_setitem_invalid(int_arr, invalid)

    @pytest.mark.parametrize("invalid", _invalid_scalars + [True])
    def test_setitem_validation_scalar_float(self, invalid, float_ea_dtype):
        """Floating arrays additionally reject bools."""
        float_arr = pd.array([1, 2, None], dtype=float_ea_dtype)
        self._check_setitem_invalid(float_arr, invalid)

View File

@ -0,0 +1,154 @@
"""
Tests shared by MaskedArray subclasses.
"""
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base import BaseOpsUtil
class ComparisonOps(BaseOpsUtil):
    """Comparison-operator checks shared by the masked array test classes."""

    def _compare_other(self, data, op, other):
        """Check ``op(data, other)`` at array and Series level against raw values."""
        mask = data._mask

        # array
        array_result = pd.Series(op(data, other))
        array_expected = pd.Series(op(data._data, other), dtype="boolean")

        # fill the nan locations
        array_expected[mask] = pd.NA

        tm.assert_series_equal(array_result, array_expected)

        # series
        series_result = op(pd.Series(data), other)

        # Set nullable dtype here to avoid upcasting when setting to pd.NA below
        series_expected = op(pd.Series(data._data), other).astype("boolean")

        # fill the nan locations
        series_expected[mask] = pd.NA

        tm.assert_series_equal(series_result, series_expected)

    # subclass will override to parametrize 'other'
    def test_scalar(self, other, comparison_op, dtype):
        """Comparing with a scalar propagates the mask and leaves input intact."""
        left = pd.array([1, 0, None], dtype=dtype)

        result = comparison_op(left, other)

        if other is not pd.NA:
            raw = comparison_op(left._data, other)
            expected = pd.arrays.BooleanArray(raw, left._mask, copy=True)
        else:
            expected = pd.array([None, None, None], dtype="boolean")
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = pd.NA
        tm.assert_extension_array_equal(left, pd.array([1, 0, None], dtype=dtype))
class NumericOps:
    """Checks shared by the numeric masked array test classes."""

    # Shared by IntegerArray and FloatingArray, not BooleanArray

    def test_searchsorted_nan(self, dtype):
        """NaN sorts after all ten values from either side."""
        # The base class casts to object dtype, for which searchsorted returns
        #  0 from the left and 10 from the right.
        arr = pd.array(range(10), dtype=dtype)

        for side in ("left", "right"):
            assert arr.searchsorted(np.nan, side=side) == 10

    def test_no_shared_mask(self, data):
        """An arithmetic result must not share memory with its input."""
        incremented = data + 1
        assert not tm.shares_memory(incremented, data)

    def test_array(self, comparison_op, dtype):
        """Array-vs-array comparison ORs the masks and mutates neither side."""
        left_values = [0, 1, 2, None, None, None]
        right_values = [0, 1, None, 0, 1, None]
        left = pd.array(left_values, dtype=dtype)
        right = pd.array(right_values, dtype=dtype)

        result = comparison_op(left, right)

        raw = comparison_op(left._data, right._data)
        combined_mask = left._mask | right._mask
        expected = pd.arrays.BooleanArray(raw, combined_mask)
        tm.assert_extension_array_equal(result, expected)

        # ensure we haven't mutated anything inplace
        result[0] = pd.NA
        tm.assert_extension_array_equal(left, pd.array(left_values, dtype=dtype))
        tm.assert_extension_array_equal(right, pd.array(right_values, dtype=dtype))

    def test_compare_with_booleanarray(self, comparison_op, dtype):
        """Comparing a BooleanArray with 0/1 numerics matches bool-vs-bool."""
        bools = pd.array([True, False, None] * 3, dtype="boolean")
        numeric = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype=dtype)
        numeric_as_bool = pd.array(
            [False] * 3 + [True] * 3 + [None] * 3, dtype="boolean"
        )

        expected = comparison_op(bools, numeric_as_bool)
        result = comparison_op(bools, numeric)
        tm.assert_extension_array_equal(result, expected)

        # reversed op
        expected = comparison_op(numeric_as_bool, bools)
        result = comparison_op(numeric, bools)
        tm.assert_extension_array_equal(result, expected)

    def test_compare_to_string(self, dtype):
        """Equality with a string is False for values and NA for missing."""
        # GH#28930
        ser = pd.Series([1, None], dtype=dtype)

        expected = pd.Series([False, pd.NA], dtype="boolean")
        tm.assert_series_equal(ser == "a", expected)

    def test_ufunc_with_out(self, dtype):
        """In-place ufuncs with ndarray and masked ``out`` targets."""
        arr = pd.array([1, 2, 3], dtype=dtype)
        arr2 = pd.array([1, 2, pd.NA], dtype=dtype)

        mask = arr == arr
        mask2 = arr2 == arr2

        out = np.zeros(3, dtype=bool)
        out |= mask
        # If MaskedArray.__array_ufunc__ handled "out" appropriately,
        #  `result` should still be an ndarray.
        assert isinstance(out, np.ndarray)
        assert out.all()

        # result |= mask worked because mask could be cast losslessly to
        #  boolean ndarray. mask2 can't, so this raises
        out = np.zeros(3, dtype=bool)
        msg = "Specify an appropriate 'na_value' for this dtype"
        with pytest.raises(ValueError, match=msg):
            out |= mask2

        # addition
        summed = np.add(arr, arr2)
        expected = pd.array([2, 4, pd.NA], dtype=dtype)
        tm.assert_extension_array_equal(summed, expected)

        # when passing out=arr, we will modify 'arr' inplace.
        summed = np.add(arr, arr2, out=arr)
        assert summed is arr
        tm.assert_extension_array_equal(summed, expected)
        tm.assert_extension_array_equal(arr, expected)

    def test_mul_td64_array(self, dtype):
        """Multiplying by a timedelta64 ndarray gives timedeltas with NaT for NA."""
        # GH#45622
        arr = pd.array([1, 2, pd.NA], dtype=dtype)
        td_values = np.arange(3, dtype=np.int64).view("m8[ns]")

        product = arr * td_values
        expected = pd.array([pd.Timedelta(0), pd.Timedelta(2), pd.NaT])
        tm.assert_extension_array_equal(product, expected)

View File

@ -0,0 +1,41 @@
import numpy as np
from pandas.core.dtypes.common import is_scalar
import pandas as pd
import pandas._testing as tm
class TestSearchsorted:
def test_searchsorted_string(self, string_dtype):
arr = pd.array(["a", "b", "c"], dtype=string_dtype)
result = arr.searchsorted("a", side="left")
assert is_scalar(result)
assert result == 0
result = arr.searchsorted("a", side="right")
assert is_scalar(result)
assert result == 1
def test_searchsorted_numeric_dtypes_scalar(self, any_real_numpy_dtype):
arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)
result = arr.searchsorted(30)
assert is_scalar(result)
assert result == 2
result = arr.searchsorted([30])
expected = np.array([2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
def test_searchsorted_numeric_dtypes_vector(self, any_real_numpy_dtype):
arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)
result = arr.searchsorted([2, 30])
expected = np.array([1, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
def test_searchsorted_sorter(self, any_real_numpy_dtype):
arr = pd.array([3, 1, 2], dtype=any_real_numpy_dtype)
result = arr.searchsorted([0, 3], sorter=np.argsort(arr))
expected = np.array([0, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

View File

@ -0,0 +1,324 @@
"""
Additional tests for NumpyExtensionArray that aren't covered by
the interface tests.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import NumpyEADtype
import pandas as pd
import pandas._testing as tm
from pandas.arrays import NumpyExtensionArray
@pytest.fixture(
    params=[
        np.array(values, dtype=dtype)
        for values, dtype in (
            (["a", "b"], object),
            ([0, 1], float),
            ([0, 1], int),
            ([0, 1 + 2j], complex),
            ([True, False], bool),
            ([0, 1], "datetime64[ns]"),
            ([0, 1], "timedelta64[ns]"),
        )
    ]
)
def any_numpy_array(request):
    """
    Return one two-element NumPy array per parametrized dtype.

    Covers object, float, int, complex, bool, datetime64[ns] and
    timedelta64[ns]; string and bytes dtypes are left out.
    """
    return request.param
# ----------------------------------------------------------------------------
# NumpyEADtype
@pytest.mark.parametrize(
"dtype, expected",
[
("bool", True),
("int", True),
("uint", True),
("float", True),
("complex", True),
("str", False),
("bytes", False),
("datetime64[ns]", False),
("object", False),
("void", False),
],
)
def test_is_numeric(dtype, expected):
dtype = NumpyEADtype(dtype)
assert dtype._is_numeric is expected
@pytest.mark.parametrize(
"dtype, expected",
[
("bool", True),
("int", False),
("uint", False),
("float", False),
("complex", False),
("str", False),
("bytes", False),
("datetime64[ns]", False),
("object", False),
("void", False),
],
)
def test_is_boolean(dtype, expected):
dtype = NumpyEADtype(dtype)
assert dtype._is_boolean is expected
def test_repr():
dtype = NumpyEADtype(np.dtype("int64"))
assert repr(dtype) == "NumpyEADtype('int64')"
def test_constructor_from_string():
result = NumpyEADtype.construct_from_string("int64")
expected = NumpyEADtype(np.dtype("int64"))
assert result == expected
def test_dtype_idempotent(any_numpy_dtype):
dtype = NumpyEADtype(any_numpy_dtype)
result = NumpyEADtype(dtype)
assert result == dtype
# ----------------------------------------------------------------------------
# Construction
def test_constructor_no_coercion():
with pytest.raises(ValueError, match="NumPy array"):
NumpyExtensionArray([1, 2, 3])
def test_series_constructor_with_copy():
ndarray = np.array([1, 2, 3])
ser = pd.Series(NumpyExtensionArray(ndarray), copy=True)
assert ser.values is not ndarray
def test_series_constructor_with_astype():
ndarray = np.array([1, 2, 3])
result = pd.Series(NumpyExtensionArray(ndarray), dtype="float64")
expected = pd.Series([1.0, 2.0, 3.0], dtype="float64")
tm.assert_series_equal(result, expected)
def test_from_sequence_dtype():
arr = np.array([1, 2, 3], dtype="int64")
result = NumpyExtensionArray._from_sequence(arr, dtype="uint64")
expected = NumpyExtensionArray(np.array([1, 2, 3], dtype="uint64"))
tm.assert_extension_array_equal(result, expected)
def test_constructor_copy():
arr = np.array([0, 1])
result = NumpyExtensionArray(arr, copy=True)
assert not tm.shares_memory(result, arr)
def test_constructor_with_data(any_numpy_array):
nparr = any_numpy_array
arr = NumpyExtensionArray(nparr)
assert arr.dtype.numpy_dtype == nparr.dtype
# ----------------------------------------------------------------------------
# Conversion
def test_to_numpy():
arr = NumpyExtensionArray(np.array([1, 2, 3]))
result = arr.to_numpy()
assert result is arr._ndarray
result = arr.to_numpy(copy=True)
assert result is not arr._ndarray
result = arr.to_numpy(dtype="f8")
expected = np.array([1, 2, 3], dtype="f8")
tm.assert_numpy_array_equal(result, expected)
# ----------------------------------------------------------------------------
# Setitem
def test_setitem_series():
    """Assigning through ``Series.array`` is visible in the Series."""
    numbers = pd.Series([1, 2, 3])
    numbers.array[0] = 10
    tm.assert_series_equal(numbers, pd.Series([10, 2, 3]))
def test_setitem(any_numpy_array):
nparr = any_numpy_array
arr = NumpyExtensionArray(nparr, copy=True)
arr[0] = arr[1]
nparr[0] = nparr[1]
tm.assert_numpy_array_equal(arr.to_numpy(), nparr)
# ----------------------------------------------------------------------------
# Reductions
def test_bad_reduce_raises():
arr = np.array([1, 2, 3], dtype="int64")
arr = NumpyExtensionArray(arr)
msg = "cannot perform not_a_method with type int"
with pytest.raises(TypeError, match=msg):
arr._reduce(msg)
def test_validate_reduction_keyword_args():
arr = NumpyExtensionArray(np.array([1, 2, 3]))
msg = "the 'keepdims' parameter is not supported .*all"
with pytest.raises(ValueError, match=msg):
arr.all(keepdims=True)
def test_np_max_nested_tuples():
    """Max over nested-tuple elements returns the third element."""
    # case where checking in ufunc.nout works while checking for tuples
    # does not
    nested = [
        (("j", "k"), ("l", "m")),
        (("l", "m"), ("o", "p")),
        (("o", "p"), ("j", "k")),
    ]
    ser = pd.Series(nested)
    arr = ser.array
    largest = arr[2]

    # the methods hand back the very same tuple object
    assert arr.max() is largest
    assert ser.max() is largest

    # the ufunc reductions only need to compare equal
    assert np.maximum.reduce(arr) == largest
    assert np.maximum.reduce(ser) == largest
def test_np_reduce_2d():
raw = np.arange(12).reshape(4, 3)
arr = NumpyExtensionArray(raw)
res = np.maximum.reduce(arr, axis=0)
tm.assert_extension_array_equal(res, arr[-1])
alt = arr.max(axis=0)
tm.assert_extension_array_equal(alt, arr[-1])
# ----------------------------------------------------------------------------
# Ops
@pytest.mark.parametrize("ufunc", [np.abs, np.negative, np.positive])
def test_ufunc_unary(ufunc):
arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0]))
result = ufunc(arr)
expected = NumpyExtensionArray(ufunc(arr._ndarray))
tm.assert_extension_array_equal(result, expected)
# same thing but with the 'out' keyword
out = NumpyExtensionArray(np.array([-9.0, -9.0, -9.0]))
ufunc(arr, out=out)
tm.assert_extension_array_equal(out, expected)
def test_ufunc():
arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0]))
r1, r2 = np.divmod(arr, np.add(arr, 2))
e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2))
e1 = NumpyExtensionArray(e1)
e2 = NumpyExtensionArray(e2)
tm.assert_extension_array_equal(r1, e1)
tm.assert_extension_array_equal(r2, e2)
def test_basic_binop():
# Just a basic smoke test. The EA interface tests exercise this
# more thoroughly.
x = NumpyExtensionArray(np.array([1, 2, 3]))
result = x + x
expected = NumpyExtensionArray(np.array([2, 4, 6]))
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, object])
def test_setitem_object_typecode(dtype):
arr = NumpyExtensionArray(np.array(["a", "b", "c"], dtype=dtype))
arr[0] = "t"
expected = NumpyExtensionArray(np.array(["t", "b", "c"], dtype=dtype))
tm.assert_extension_array_equal(arr, expected)
def test_setitem_no_coercion():
# https://github.com/pandas-dev/pandas/issues/28150
arr = NumpyExtensionArray(np.array([1, 2, 3]))
with pytest.raises(ValueError, match="int"):
arr[0] = "a"
# With a value that we do coerce, check that we coerce the value
# and not the underlying array.
arr[0] = 2.5
assert isinstance(arr[0], (int, np.integer)), type(arr[0])
def test_setitem_preserves_views():
# GH#28150, see also extension test of the same name
arr = NumpyExtensionArray(np.array([1, 2, 3]))
view1 = arr.view()
view2 = arr[:]
view3 = np.asarray(arr)
arr[0] = 9
assert view1[0] == 9
assert view2[0] == 9
assert view3[0] == 9
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
def test_quantile_empty(dtype):
# we should get back np.nans, not -1s
arr = NumpyExtensionArray(np.array([], dtype=dtype))
idx = pd.Index([0.0, 0.5])
result = arr._quantile(idx, interpolation="linear")
expected = NumpyExtensionArray(np.array([np.nan, np.nan]))
tm.assert_extension_array_equal(result, expected)
def test_factorize_unsigned():
# don't raise when calling factorize on unsigned int NumpyExtensionArray
arr = np.array([1, 2, 3], dtype=np.uint64)
obj = NumpyExtensionArray(arr)
res_codes, res_unique = obj.factorize()
exp_codes, exp_unique = pd.factorize(arr)
tm.assert_numpy_array_equal(res_codes, exp_codes)
tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique))

View File

@ -0,0 +1,130 @@
import pytest
from pandas.compat.pyarrow import pa_version_under10p1
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
# Module-level marks: pytest reads the name ``pytestmark`` and applies the
# mark to the tests in this module. Here it filters out the
# "Passing a BlockManager to DataFrame" DeprecationWarning.
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
# Skip this module when pyarrow cannot be imported; otherwise bind it as ``pa``.
pa = pytest.importorskip("pyarrow")
def test_arrow_extension_type():
    """``ArrowPeriodType`` exposes ``freq``; equality and hash follow it."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    daily = ArrowPeriodType("D")
    daily_again = ArrowPeriodType("D")
    monthly = ArrowPeriodType("M")

    assert daily.freq == "D"

    assert daily == daily_again
    assert daily != monthly

    assert hash(daily) == hash(daily_again)
    assert hash(daily) != hash(monthly)
@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10")
@pytest.mark.parametrize(
    "data, freq",
    [
        (pd.date_range("2017", periods=3), "D"),
        (pd.date_range("2017", periods=3, freq="YE"), "Y-DEC"),
    ],
)
def test_arrow_array(data, freq):
    """Convert a PeriodArray with ``pa.array``: default, storage and bad types."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = period_array(data, freq=freq)
    ordinals = pa.array(periods.asi8, type="int64")

    # default conversion: period extension type over the int64 ordinals
    as_extension = pa.array(periods)
    assert isinstance(as_extension.type, ArrowPeriodType)
    assert as_extension.type.freq == freq
    assert as_extension.storage.equals(ordinals)

    # convert to its storage type
    as_storage = pa.array(periods, type=pa.int64())
    assert as_storage.equals(ordinals)

    # unsupported conversions
    with pytest.raises(
        TypeError, match="Not supported to convert PeriodArray to 'double' type"
    ):
        pa.array(periods, type="float64")

    with pytest.raises(TypeError, match="different 'freq'"):
        pa.array(periods, type=ArrowPeriodType("T"))
def test_arrow_array_missing():
    """A NaT entry becomes a null in the pyarrow storage array."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = PeriodArray([1, 2, 3], dtype="period[D]")
    periods[1] = pd.NaT

    converted = pa.array(periods)
    assert isinstance(converted.type, ArrowPeriodType)
    assert converted.type.freq == "D"
    assert converted.storage.equals(pa.array([1, None, 3], type="int64"))
def test_arrow_table_roundtrip():
    """A period column survives DataFrame -> pyarrow table -> DataFrame."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    periods = PeriodArray([1, 2, 3], dtype="period[D]")
    periods[1] = pd.NaT
    frame = pd.DataFrame({"a": periods})

    table = pa.table(frame)
    assert isinstance(table.field("a").type, ArrowPeriodType)

    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(roundtripped, frame)

    # the same must hold for a table concatenated with itself
    doubled = pa.concat_tables([table, table]).to_pandas()
    tm.assert_frame_equal(doubled, pd.concat([frame, frame], ignore_index=True))
def test_arrow_load_from_zero_chunks():
    """GH-41040: a table whose period column has zero chunks converts back."""
    from pandas.core.arrays.arrow.extension_types import ArrowPeriodType

    frame = pd.DataFrame({"a": PeriodArray([], dtype="period[D]")})

    table = pa.table(frame)
    assert isinstance(table.field("a").type, ArrowPeriodType)

    # rebuild the table around a chunked array that holds no chunks
    no_chunks = pa.chunked_array([], type=table.column(0).type)
    table = pa.table([no_chunks], schema=table.schema)

    roundtripped = table.to_pandas()
    assert isinstance(roundtripped["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(roundtripped, frame)
def test_arrow_table_roundtrip_without_metadata():
    """The period dtype is restored even after the schema metadata is dropped."""
    periods = PeriodArray([1, 2, 3], dtype="period[h]")
    periods[1] = pd.NaT
    frame = pd.DataFrame({"a": periods})

    # remove the metadata
    stripped = pa.table(frame).replace_schema_metadata()
    assert stripped.schema.metadata is None

    roundtripped = stripped.to_pandas()
    assert isinstance(roundtripped["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(roundtripped, frame)

View File

@ -0,0 +1,67 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import period_array
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(dtype):
    """
    ``astype`` to int64 returns the i8 view of the data; any other integer
    dtype raises a TypeError that points the caller at ``astype('int64')``.
    """
    periods = period_array(["2000", "2001", None], freq="D")

    if np.dtype(dtype) == np.int64:
        converted = periods.astype(dtype)
        tm.assert_numpy_array_equal(converted, periods._ndarray.view("i8"))
    else:
        with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
            periods.astype(dtype)
def test_astype_copies():
    """``copy=False`` returns a view on the data; ``copy=True`` a new object."""
    periods = period_array(["2000", "2001", None], freq="D")

    # Check `.base`, since we now use `.asi8` which returns a view.
    # We could maybe override it in PeriodArray to return ._ndarray directly.
    no_copy = periods.astype(np.int64, copy=False)
    assert no_copy.base is periods._ndarray

    copied = periods.astype(np.int64, copy=True)
    assert copied is not periods._ndarray
    tm.assert_numpy_array_equal(copied, periods._ndarray.view("i8"))
def test_astype_categorical():
    """``astype("category")`` gives unique periods as categories and -1 for None."""
    periods = period_array(["2000", "2001", "2001", None], freq="D")

    wanted = pd.Categorical.from_codes(
        [0, 1, 1, -1],
        categories=pd.PeriodIndex(["2000", "2001"], freq="D"),
    )
    tm.assert_categorical_equal(periods.astype("category"), wanted)
def test_astype_period():
    """Casting daily periods to ``PeriodDtype("M")`` matches building with freq "M"."""
    labels = ["2000", "2001", None]
    daily = period_array(labels, freq="D")
    monthly = period_array(labels, freq="M")
    tm.assert_period_array_equal(daily.astype(PeriodDtype("M")), monthly)
@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
def test_astype_datetime(dtype):
    """Period -> datetime64 is allowed; period -> timedelta64 raises TypeError."""
    periods = period_array(["2000", "2001", None], freq="D")

    if dtype != "timedelta64[ns]":
        # GH#45038 allow period->dt64 because we allow dt64->period
        converted = periods.astype(dtype)
        wanted = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data
        tm.assert_datetime_array_equal(converted, wanted)
        return

    # slice off the [ns] so that the regex matches.
    with pytest.raises(TypeError, match=dtype[:-4]):
        periods.astype(dtype)

View File

@ -0,0 +1,156 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.offsets import MonthEnd
from pandas._libs.tslibs.period import IncompatibleFrequency
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
@pytest.mark.parametrize(
    "data, freq, expected",
    [
        ([pd.Period("2017", "D")], None, [17167]),
        ([pd.Period("2017", "D")], "D", [17167]),
        ([2017], "D", [17167]),
        (["2017"], "D", [17167]),
        ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]),
        ([pd.Period("2017", "D"), None], None, [17167, iNaT]),
        (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]),
        (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]),
        (pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]),
    ],
)
def test_period_array_ok(data, freq, expected):
    """``period_array`` accepts each input kind and yields the expected ordinals."""
    ordinals = period_array(data, freq=freq).asi8
    tm.assert_numpy_array_equal(ordinals, np.asarray(expected, dtype=np.int64))
def test_period_array_readonly_object():
    """Read-only object ndarrays of Periods are accepted by the constructors."""
    # https://github.com/pandas-dev/pandas/issues/25403
    periods = period_array([pd.Period("2019-01-01")])
    frozen = np.asarray(periods, dtype="object")
    frozen.flags.writeable = False

    tm.assert_period_array_equal(period_array(frozen), periods)
    tm.assert_series_equal(pd.Series(frozen), pd.Series(periods))
    tm.assert_frame_equal(
        pd.DataFrame({"A": frozen}), pd.DataFrame({"A": periods})
    )
def test_from_datetime64_freq_changes():
    """Daily datetimes converted with freq "M" all land in the same month."""
    # https://github.com/pandas-dev/pandas/issues/23438
    days = pd.date_range("2017", periods=3, freq="D")
    wanted = period_array(["2017-01-01"] * 3, freq="M")
    tm.assert_period_array_equal(
        PeriodArray._from_datetime64(days, freq="M"), wanted
    )
@pytest.mark.parametrize("freq", ["2M", MonthEnd(2)])
def test_from_datetime64_freq_2M(freq):
    """``_from_datetime64`` with a 2-month freq matches ``period_array``."""
    stamps = np.array(
        ["2020-01-01T00:00:00", "2020-01-02T00:00:00"], dtype="datetime64[ns]"
    )
    converted = PeriodArray._from_datetime64(stamps, freq)
    tm.assert_period_array_equal(
        converted, period_array(["2020-01", "2020-01"], freq=freq)
    )
@pytest.mark.parametrize(
    "data, freq, msg",
    [
        (
            [pd.Period("2017", "D"), pd.Period("2017", "Y")],
            None,
            "Input has different freq",
        ),
        ([pd.Period("2017", "D")], "Y", "Input has different freq"),
    ],
)
def test_period_array_raises(data, freq, msg):
    """Mixed or conflicting frequencies raise ``IncompatibleFrequency``."""
    expect_failure = pytest.raises(IncompatibleFrequency, match=msg)
    with expect_failure:
        period_array(data, freq)
def test_period_array_non_period_series_raies():
    """An integer Series passed to ``PeriodArray`` raises a TypeError about dtype."""
    int_series = pd.Series([1, 2, 3])
    with pytest.raises(TypeError, match="dtype"):
        PeriodArray(int_series, dtype="period[D]")
def test_period_array_freq_mismatch():
    """A daily PeriodArray cannot be re-wrapped with a monthly dtype."""
    daily = period_array(["2000", "2001"], freq="D")

    # the monthly dtype spelled as a string, then as a PeriodDtype object
    monthly_dtypes = (
        "period[M]",
        pd.PeriodDtype(pd.tseries.offsets.MonthEnd()),
    )
    for monthly in monthly_dtypes:
        with pytest.raises(IncompatibleFrequency, match="freq"):
            PeriodArray(daily, dtype=monthly)
def test_from_sequence_disallows_i8():
    """``_from_sequence`` rejects raw int64 ordinals, as an ndarray or a list."""
    periods = period_array(["2000", "2001"], freq="D")
    # the error is expected to mention the first ordinal
    first_ordinal = str(periods[0].ordinal)

    for raw in (periods.asi8, list(periods.asi8)):
        with pytest.raises(TypeError, match=first_ordinal):
            PeriodArray._from_sequence(raw, dtype=periods.dtype)
def test_from_td64nat_sequence_raises():
    """GH#44507: a timedelta64 NaT is rejected by every period constructor."""
    period_dtype = pd.period_range("2005-01-01", periods=3, freq="D").dtype

    # object array whose only element is a timedelta64 NaT
    boxed = np.array([None], dtype=object)
    boxed[0] = pd.NaT.to_numpy("m8[ns]")

    msg = "Value must be Period, string, integer, or datetime"
    constructors = (
        PeriodArray._from_sequence,
        pd.PeriodIndex,
        pd.Index,
        pd.array,
        pd.Series,
        pd.DataFrame,
    )
    for construct in constructors:
        with pytest.raises(ValueError, match=msg):
            construct(boxed, dtype=period_dtype)
def test_freq_deprecated():
# GH#52462
data = np.arange(5).astype(np.int64)
msg = "The 'freq' keyword in the PeriodArray constructor is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = PeriodArray(data, freq="M")
expected = PeriodArray(data, dtype="period[M]")
tm.assert_equal(res, expected)
def test_period_array_from_datetime64():
    """``_from_datetime64`` with ``MonthEnd(2)`` matches ``period_array``."""
    two_months = MonthEnd(2)
    stamps = np.array(
        ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]"
    )

    converted = PeriodArray._from_datetime64(stamps, freq=two_months)
    wanted = period_array(["2020-01-01", "2020-02-01"], freq=MonthEnd(2))
    tm.assert_period_array_equal(converted, wanted)

Some files were not shown because too many files have changed in this diff Show More