Updated script that can be controlled by Node.js web app

Author: mac OS
Date: 2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions


@@ -0,0 +1,7 @@
import pytest
@pytest.fixture(params=[True, False])
def sort(request):
"""Boolean sort keyword for concat and DataFrame.append."""
return request.param
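For orientation, a minimal sketch of how pytest injects this boolean fixture (the fixture name `sort` is the real one above; the test itself is hypothetical), running the body once per parameter:

import pandas as pd

def test_concat_sort_demo(sort):
    df1 = pd.DataFrame({"b": [1], "a": [2]})
    df2 = pd.DataFrame({"a": [3], "c": [4]})
    result = pd.concat([df1, df2], sort=sort)
    # sort=True sorts the union of the column labels; sort=False keeps
    # them in order of appearance across the inputs
    assert list(result.columns) == (["a", "b", "c"] if sort else ["b", "a", "c"])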


@@ -0,0 +1,389 @@
import datetime as dt
from itertools import combinations
import dateutil
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
Timestamp,
concat,
isna,
)
import pandas._testing as tm
class TestAppend:
def test_append(self, sort, float_frame):
mixed_frame = float_frame.copy()
mixed_frame["foo"] = "bar"
begin_index = float_frame.index[:5]
end_index = float_frame.index[5:]
begin_frame = float_frame.reindex(begin_index)
end_frame = float_frame.reindex(end_index)
appended = begin_frame._append(end_frame)
tm.assert_almost_equal(appended["A"], float_frame["A"])
del end_frame["A"]
partial_appended = begin_frame._append(end_frame, sort=sort)
assert "A" in partial_appended
partial_appended = end_frame._append(begin_frame, sort=sort)
assert "A" in partial_appended
# mixed type handling
appended = mixed_frame[:5]._append(mixed_frame[5:])
tm.assert_frame_equal(appended, mixed_frame)
# what to test here
mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)
# all equal except 'foo' column
tm.assert_frame_equal(
mixed_appended.reindex(columns=["A", "B", "C", "D"]),
mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
)
def test_append_empty(self, float_frame):
empty = DataFrame()
appended = float_frame._append(empty)
tm.assert_frame_equal(float_frame, appended)
assert appended is not float_frame
appended = empty._append(float_frame)
tm.assert_frame_equal(float_frame, appended)
assert appended is not float_frame
def test_append_overlap_raises(self, float_frame):
msg = "Indexes have overlapping values"
with pytest.raises(ValueError, match=msg):
float_frame._append(float_frame, verify_integrity=True)
def test_append_new_columns(self):
# see gh-6129: new columns
df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
expected = DataFrame(
{
"a": {"x": 1, "y": 2, "z": 5},
"b": {"x": 3, "y": 4, "z": 6},
"c": {"z": 7},
}
)
result = df._append(row)
tm.assert_frame_equal(result, expected)
def test_append_length0_frame(self, sort):
df = DataFrame(columns=["A", "B", "C"])
df3 = DataFrame(index=[0, 1], columns=["A", "B"])
df5 = df._append(df3, sort=sort)
expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
tm.assert_frame_equal(df5, expected)
def test_append_records(self):
arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
df1 = DataFrame(arr1)
df2 = DataFrame(arr2)
result = df1._append(df2, ignore_index=True)
expected = DataFrame(np.concatenate((arr1, arr2)))
tm.assert_frame_equal(result, expected)
# rewrite sort fixture, since we also want to test default of None
def test_append_sorts(self, sort):
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
result = df1._append(df2, sort=sort)
# for None / True
expected = DataFrame(
{"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
columns=["a", "b", "c"],
)
if sort is False:
expected = expected[["b", "a", "c"]]
tm.assert_frame_equal(result, expected)
def test_append_different_columns(self, sort):
df = DataFrame(
{
"bools": np.random.default_rng(2).standard_normal(10) > 0,
"ints": np.random.default_rng(2).integers(0, 10, 10),
"floats": np.random.default_rng(2).standard_normal(10),
"strings": ["foo", "bar"] * 5,
}
)
a = df[:5].loc[:, ["bools", "ints", "floats"]]
b = df[5:].loc[:, ["strings", "ints", "floats"]]
appended = a._append(b, sort=sort)
assert isna(appended["strings"][0:4]).all()
assert isna(appended["bools"][5:]).all()
def test_append_many(self, sort, float_frame):
chunks = [
float_frame[:5],
float_frame[5:10],
float_frame[10:15],
float_frame[15:],
]
result = chunks[0]._append(chunks[1:])
tm.assert_frame_equal(result, float_frame)
chunks[-1] = chunks[-1].copy()
chunks[-1]["foo"] = "bar"
result = chunks[0]._append(chunks[1:], sort=sort)
tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
assert (result["foo"][15:] == "bar").all()
assert result["foo"][:15].isna().all()
def test_append_preserve_index_name(self):
# #980
df1 = DataFrame(columns=["A", "B", "C"])
df1 = df1.set_index(["A"])
df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
df2 = df2.set_index(["A"])
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df1._append(df2)
assert result.index.name == "A"
indexes_can_append = [
pd.RangeIndex(3),
Index([4, 5, 6]),
Index([4.5, 5.5, 6.5]),
Index(list("abc")),
pd.CategoricalIndex("A B C".split()),
pd.CategoricalIndex("D E F".split(), ordered=True),
pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
pd.DatetimeIndex(
[
dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 3, 6, 10),
dt.datetime(2013, 1, 3, 7, 12),
]
),
pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
]
@pytest.mark.parametrize(
"index", indexes_can_append, ids=lambda x: type(x).__name__
)
def test_append_same_columns_type(self, index):
# GH18359
# df wider than ser
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
ser_index = index[:2]
ser = Series([7, 8], index=ser_index, name=2)
result = df._append(ser)
expected = DataFrame(
[[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
)
# integer dtype is preserved for columns present in ser.index
assert expected.dtypes.iloc[0].kind == "i"
assert expected.dtypes.iloc[1].kind == "i"
tm.assert_frame_equal(result, expected)
# ser wider than df
ser_index = index
index = index[:2]
df = DataFrame([[1, 2], [4, 5]], columns=index)
ser = Series([7, 8, 9], index=ser_index, name=2)
result = df._append(ser)
expected = DataFrame(
[[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
index=[0, 1, 2],
columns=ser_index,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"df_columns, series_index",
combinations(indexes_can_append, r=2),
ids=lambda x: type(x).__name__,
)
def test_append_different_columns_types(self, df_columns, series_index):
# GH18359
# See also test 'test_append_different_columns_types_raises' below
# for errors raised when appending
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
ser = Series([7, 8, 9], index=series_index, name=2)
result = df._append(ser)
idx_diff = ser.index.difference(df_columns)
combined_columns = Index(df_columns.tolist()).append(idx_diff)
expected = DataFrame(
[
[1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
[4, 5, 6, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, 7, 8, 9],
],
index=[0, 1, 2],
columns=combined_columns,
)
tm.assert_frame_equal(result, expected)
def test_append_dtype_coerce(self, sort):
# GH 4993
# appending with datetime will incorrectly convert datetime64
df1 = DataFrame(
index=[1, 2],
data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
columns=["start_time"],
)
df2 = DataFrame(
index=[4, 5],
data=[
[dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
[dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
],
columns=["start_time", "end_time"],
)
expected = concat(
[
Series(
[
pd.NaT,
pd.NaT,
dt.datetime(2013, 1, 3, 6, 10),
dt.datetime(2013, 1, 4, 7, 10),
],
name="end_time",
),
Series(
[
dt.datetime(2013, 1, 1, 0, 0),
dt.datetime(2013, 1, 2, 0, 0),
dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 4, 0, 0),
],
name="start_time",
),
],
axis=1,
sort=sort,
)
result = df1._append(df2, ignore_index=True, sort=sort)
if sort:
expected = expected[["end_time", "start_time"]]
else:
expected = expected[["start_time", "end_time"]]
tm.assert_frame_equal(result, expected)
def test_append_missing_column_proper_upcast(self, sort):
df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
appended = df1._append(df2, ignore_index=True, sort=sort)
assert appended["A"].dtype == "f8"
assert appended["B"].dtype == "O"
def test_append_empty_frame_to_series_with_dateutil_tz(self):
# GH 23682
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
ser = Series({"a": 1.0, "b": 2.0, "date": date})
df = DataFrame(columns=["c", "d"])
result_a = df._append(ser, ignore_index=True)
expected = DataFrame(
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
)
# These columns get cast to object after append
expected["c"] = expected["c"].astype(object)
expected["d"] = expected["d"].astype(object)
tm.assert_frame_equal(result_a, expected)
expected = DataFrame(
[[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
)
expected["c"] = expected["c"].astype(object)
expected["d"] = expected["d"].astype(object)
result_b = result_a._append(ser, ignore_index=True)
tm.assert_frame_equal(result_b, expected)
result = df._append([ser, ser], ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
# pd.NaT gets inferred as tz-naive, so append result is tz-naive
result = df._append({"a": pd.NaT}, ignore_index=True)
if using_array_manager:
expected = DataFrame({"a": [pd.NaT]}, dtype=object)
else:
expected = DataFrame({"a": [np.nan]}, dtype=object)
tm.assert_frame_equal(result, expected)
# also test with typed value to append
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
result = df._append(other, ignore_index=True)
tm.assert_frame_equal(result, expected)
# mismatched tz
other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
result = df._append(other, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_empty_frame_with_timedelta64ns_nat(
self, dtype_str, val, using_array_manager
):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype(dtype_str)
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
result = df._append(other, ignore_index=True)
expected = other.astype(object)
if isinstance(val, str) and dtype_str != "int64" and not using_array_manager:
# TODO: expected used to be `other.astype(object)` which is a more
# reasonable result. This was changed when tightening
# assert_frame_equal's treatment of mismatched NAs to match the
# existing behavior.
expected = DataFrame({"a": [np.nan]}, dtype=object)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
result = df._append(other, ignore_index=True)
expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
tm.assert_frame_equal(result, expected)
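A note on the _append calls throughout this file: the public DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so these tests exercise the private DataFrame._append, while user code is expected to use pd.concat. A minimal sketch of the equivalent public call:

import pandas as pd

df1 = pd.DataFrame({"A": [1, 2]})
df2 = pd.DataFrame({"A": [3], "B": [4.0]})
out = pd.concat([df1, df2], ignore_index=True)
# "B" is missing from df1, so its first two rows become NaN (float64),
# while "A" is present in both inputs and stays int64
assert out["B"].isna().sum() == 2 and out["A"].dtype == "int64"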


@@ -0,0 +1,753 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.fixture(
params=list(
{
"bool": [True, False, True],
"int64": [1, 2, 3],
"float64": [1.1, np.nan, 3.3],
"category": Categorical(["X", "Y", "Z"]),
"object": ["a", "b", "c"],
"datetime64[ns]": [
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-03"),
],
"datetime64[ns, US/Eastern]": [
pd.Timestamp("2011-01-01", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timestamp("2011-01-03", tz="US/Eastern"),
],
"timedelta64[ns]": [
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
],
"period[M]": [
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
pd.Period("2011-03", freq="M"),
],
}.items()
)
)
def item(request):
key, data = request.param
return key, data
@pytest.fixture
def item2(item):
return item
class TestConcatAppendCommon:
"""
Test common dtype coercion rules between concat and append.
"""
def test_dtypes(self, item, index_or_series, using_infer_string):
# to confirm test case covers intended dtypes
typ, vals = item
obj = index_or_series(vals)
if typ == "object" and using_infer_string:
typ = "string"
if isinstance(obj, Index):
assert obj.dtype == typ
elif isinstance(obj, Series):
if typ.startswith("period"):
assert obj.dtype == "Period[M]"
else:
assert obj.dtype == typ
def test_concatlike_same_dtypes(self, item):
# GH 13660
typ1, vals1 = item
vals2 = vals1
vals3 = vals1
if typ1 == "category":
exp_data = Categorical(list(vals1) + list(vals2))
exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3))
else:
exp_data = vals1 + vals2
exp_data3 = vals1 + vals2 + vals3
# ----- Index ----- #
# index.append
res = Index(vals1).append(Index(vals2))
exp = Index(exp_data)
tm.assert_index_equal(res, exp)
# 3 elements
res = Index(vals1).append([Index(vals2), Index(vals3)])
exp = Index(exp_data3)
tm.assert_index_equal(res, exp)
# index.append name mismatch
i1 = Index(vals1, name="x")
i2 = Index(vals2, name="y")
res = i1.append(i2)
exp = Index(exp_data)
tm.assert_index_equal(res, exp)
# index.append name match
i1 = Index(vals1, name="x")
i2 = Index(vals2, name="x")
res = i1.append(i2)
exp = Index(exp_data, name="x")
tm.assert_index_equal(res, exp)
# cannot append non-index
with pytest.raises(TypeError, match="all inputs must be Index"):
Index(vals1).append(vals2)
with pytest.raises(TypeError, match="all inputs must be Index"):
Index(vals1).append([Index(vals2), vals3])
# ----- Series ----- #
# series.append
res = Series(vals1)._append(Series(vals2), ignore_index=True)
exp = Series(exp_data)
tm.assert_series_equal(res, exp, check_index_type=True)
# concat
res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
tm.assert_series_equal(res, exp, check_index_type=True)
# 3 elements
res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
exp = Series(exp_data3)
tm.assert_series_equal(res, exp)
res = pd.concat(
[Series(vals1), Series(vals2), Series(vals3)],
ignore_index=True,
)
tm.assert_series_equal(res, exp)
# name mismatch
s1 = Series(vals1, name="x")
s2 = Series(vals2, name="y")
res = s1._append(s2, ignore_index=True)
exp = Series(exp_data)
tm.assert_series_equal(res, exp, check_index_type=True)
res = pd.concat([s1, s2], ignore_index=True)
tm.assert_series_equal(res, exp, check_index_type=True)
# name match
s1 = Series(vals1, name="x")
s2 = Series(vals2, name="x")
res = s1._append(s2, ignore_index=True)
exp = Series(exp_data, name="x")
tm.assert_series_equal(res, exp, check_index_type=True)
res = pd.concat([s1, s2], ignore_index=True)
tm.assert_series_equal(res, exp, check_index_type=True)
# cannot append non-index
msg = (
r"cannot concatenate object of type '.+'; "
"only Series and DataFrame objs are valid"
)
with pytest.raises(TypeError, match=msg):
Series(vals1)._append(vals2)
with pytest.raises(TypeError, match=msg):
Series(vals1)._append([Series(vals2), vals3])
with pytest.raises(TypeError, match=msg):
pd.concat([Series(vals1), vals2])
with pytest.raises(TypeError, match=msg):
pd.concat([Series(vals1), Series(vals2), vals3])
def test_concatlike_dtypes_coercion(self, item, item2, request):
# GH 13660
typ1, vals1 = item
typ2, vals2 = item2
vals3 = vals2
# basically infer
exp_index_dtype = None
exp_series_dtype = None
if typ1 == typ2:
pytest.skip("same dtype is tested in test_concatlike_same_dtypes")
elif typ1 == "category" or typ2 == "category":
pytest.skip("categorical type tested elsewhere")
# specify expected dtype
if typ1 == "bool" and typ2 in ("int64", "float64"):
# series coerces to numeric based on numpy rule
# index doesn't because bool is object dtype
exp_series_dtype = typ2
mark = pytest.mark.xfail(reason="GH#39187 casting to object")
request.applymarker(mark)
elif typ2 == "bool" and typ1 in ("int64", "float64"):
exp_series_dtype = typ1
mark = pytest.mark.xfail(reason="GH#39187 casting to object")
request.applymarker(mark)
elif typ1 in {"datetime64[ns, US/Eastern]", "timedelta64[ns]"} or typ2 in {
"datetime64[ns, US/Eastern]",
"timedelta64[ns]",
}:
exp_index_dtype = object
exp_series_dtype = object
exp_data = vals1 + vals2
exp_data3 = vals1 + vals2 + vals3
# ----- Index ----- #
# index.append
# GH#39817
res = Index(vals1).append(Index(vals2))
exp = Index(exp_data, dtype=exp_index_dtype)
tm.assert_index_equal(res, exp)
# 3 elements
res = Index(vals1).append([Index(vals2), Index(vals3)])
exp = Index(exp_data3, dtype=exp_index_dtype)
tm.assert_index_equal(res, exp)
# ----- Series ----- #
# series._append
# GH#39817
res = Series(vals1)._append(Series(vals2), ignore_index=True)
exp = Series(exp_data, dtype=exp_series_dtype)
tm.assert_series_equal(res, exp, check_index_type=True)
# concat
# GH#39817
res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
tm.assert_series_equal(res, exp, check_index_type=True)
# 3 elements
# GH#39817
res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
exp = Series(exp_data3, dtype=exp_series_dtype)
tm.assert_series_equal(res, exp)
# GH#39817
res = pd.concat(
[Series(vals1), Series(vals2), Series(vals3)],
ignore_index=True,
)
tm.assert_series_equal(res, exp)
def test_concatlike_common_coerce_to_pandas_object(self):
# GH 13626
# result must be Timestamp/Timedelta, not datetime.datetime/timedelta
dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"])
tdi = pd.TimedeltaIndex(["1 days", "2 days"])
exp = Index(
[
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-01-02"),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
]
)
res = dti.append(tdi)
tm.assert_index_equal(res, exp)
assert isinstance(res[0], pd.Timestamp)
assert isinstance(res[-1], pd.Timedelta)
dts = Series(dti)
tds = Series(tdi)
res = dts._append(tds)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
assert isinstance(res.iloc[0], pd.Timestamp)
assert isinstance(res.iloc[-1], pd.Timedelta)
res = pd.concat([dts, tds])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
assert isinstance(res.iloc[0], pd.Timestamp)
assert isinstance(res.iloc[-1], pd.Timedelta)
def test_concatlike_datetimetz(self, tz_aware_fixture):
tz = tz_aware_fixture
# GH 7795
dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz)
exp = pd.DatetimeIndex(
["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz
)
res = dti1.append(dti2)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts2 = Series(dti2)
res = dts1._append(dts2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
@pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"])
def test_concatlike_datetimetz_short(self, tz):
# GH#7795
ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz)
ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz)
df1 = DataFrame(0, index=ix1, columns=["A", "B"])
df2 = DataFrame(0, index=ix2, columns=["A", "B"])
exp_idx = pd.DatetimeIndex(
["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"],
tz=tz,
).as_unit("ns")
exp = DataFrame(0, index=exp_idx, columns=["A", "B"])
tm.assert_frame_equal(df1._append(df2), exp)
tm.assert_frame_equal(pd.concat([df1, df2]), exp)
def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
tz = tz_aware_fixture
# GH 13660
# different tz coerces to object
dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"])
exp = Index(
[
pd.Timestamp("2011-01-01", tz=tz),
pd.Timestamp("2011-01-02", tz=tz),
pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-02"),
],
dtype=object,
)
res = dti1.append(dti2)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts2 = Series(dti2)
res = dts1._append(dts2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
# different tz
dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific")
exp = Index(
[
pd.Timestamp("2011-01-01", tz=tz),
pd.Timestamp("2011-01-02", tz=tz),
pd.Timestamp("2012-01-01", tz="US/Pacific"),
pd.Timestamp("2012-01-02", tz="US/Pacific"),
],
dtype=object,
)
res = dti1.append(dti3)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts3 = Series(dti3)
res = dts1._append(dts3)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts3])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period(self):
# GH 13660
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M")
exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M")
res = pi1.append(pi2)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
ps2 = Series(pi2)
res = ps1._append(ps2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, ps2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period_diff_freq_to_object(self):
# GH 13221
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D")
exp = Index(
[
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
pd.Period("2012-01-01", freq="D"),
pd.Period("2012-02-01", freq="D"),
],
dtype=object,
)
res = pi1.append(pi2)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
ps2 = Series(pi2)
res = ps1._append(ps2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, ps2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period_mixed_dt_to_object(self):
# GH 13221
# different datetimelike
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
tdi = pd.TimedeltaIndex(["1 days", "2 days"])
exp = Index(
[
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
],
dtype=object,
)
res = pi1.append(tdi)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
tds = Series(tdi)
res = ps1._append(tds)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, tds])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
# inverse
exp = Index(
[
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
],
dtype=object,
)
res = tdi.append(pi1)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
tds = Series(tdi)
res = tds._append(ps1)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([tds, ps1])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concat_categorical(self):
# GH 13524
# same categories -> category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2], dtype="category")
exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# partially different categories => not-category
s1 = Series([3, 2], dtype="category")
s2 = Series([2, 1], dtype="category")
exp = Series([3, 2, 2, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# completely different categories (same dtype) => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series([np.nan, 1, 3, 2], dtype="category")
exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
def test_union_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/19096
a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"]))
b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"]))
result = pd.concat([a, b], ignore_index=True)
expected = Series(
Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"])
)
tm.assert_series_equal(result, expected)
def test_concat_categorical_coercion(self):
# GH 13524
# category + not-category => not-category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2])
exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# result shouldn't be affected by 1st elem dtype
exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# all values are not in category => not-category
s1 = Series([3, 2], dtype="category")
s2 = Series([2, 1])
exp = Series([3, 2, 2, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([2, 1, 3, 2])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# completely different categories => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series([1, 3, 2])
exp = Series([10, 11, np.nan, 1, 3, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([1, 3, 2, 10, 11, np.nan], dtype=np.float64)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# different dtype => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series(["a", "b", "c"])
exp = Series([10, 11, np.nan, "a", "b", "c"])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series(["a", "b", "c", 10, 11, np.nan])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# if normal series only contains NaN-likes => not-category
s1 = Series([10, 11], dtype="category")
s2 = Series([np.nan, np.nan, np.nan])
exp = Series([10, 11, np.nan, np.nan, np.nan])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([np.nan, np.nan, np.nan, 10, 11])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
def test_concat_categorical_3elem_coercion(self):
# GH 13524
# mixed dtypes => not-category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2], dtype="category")
s3 = Series([1, 2, 1, 2, np.nan])
exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
# values are all in either category => not-category
s1 = Series([4, 5, 6], dtype="category")
s2 = Series([1, 2, 3], dtype="category")
s3 = Series([1, 3, 4])
exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
# values are all in either category => not-category
s1 = Series([4, 5, 6], dtype="category")
s2 = Series([1, 2, 3], dtype="category")
s3 = Series([10, 11, 12])
exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
def test_concat_categorical_multi_coercion(self):
# GH 13524
s1 = Series([1, 3], dtype="category")
s2 = Series([3, 4], dtype="category")
s3 = Series([2, 3])
s4 = Series([2, 2], dtype="category")
s5 = Series([1, np.nan])
s6 = Series([1, 3, 2], dtype="category")
# mixed dtype, values are all in categories => not-category
exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
tm.assert_series_equal(res, exp)
res = s1._append([s2, s3, s4, s5, s6], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
tm.assert_series_equal(res, exp)
res = s6._append([s5, s4, s3, s2, s1], ignore_index=True)
tm.assert_series_equal(res, exp)
def test_concat_categorical_ordered(self):
# GH 13524
s1 = Series(Categorical([1, 2, np.nan], ordered=True))
s2 = Series(Categorical([2, 1, 2], ordered=True))
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True))
tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s1], ignore_index=True), exp)
def test_concat_categorical_coercion_nan(self):
# GH 13524
# some edge cases
# category + not-category => not category
s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category")
s2 = Series([np.nan, 1])
exp = Series([np.nan, np.nan, np.nan, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
s1 = Series([1, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])
exp = Series([1, np.nan, np.nan, np.nan], dtype="float")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# mixed dtype, all nan-likes => not-category
s1 = Series([np.nan, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])
exp = Series([np.nan, np.nan, np.nan, np.nan])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# all category nan-likes => category
s1 = Series([np.nan, np.nan], dtype="category")
s2 = Series([np.nan, np.nan], dtype="category")
exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
def test_concat_categorical_empty(self):
# GH 13524
s1 = Series([], dtype="category")
s2 = Series([1, 2], dtype="category")
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([], dtype="object")
# different dtype => not-category
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([np.nan, np.nan])
# empty Series is ignored
exp = Series([np.nan, np.nan])
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
def test_categorical_concat_append(self):
cat = Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
vals2 = [1, 2, 1, 2]
exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1]))
tm.assert_frame_equal(pd.concat([df, df]), exp)
tm.assert_frame_equal(df._append(df), exp)
# GH 13524 can concat different categories
cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
vals3 = [1, 2]
df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
res = pd.concat([df, df_different_categories], ignore_index=True)
exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]})
tm.assert_frame_equal(res, exp)
res = df._append(df_different_categories, ignore_index=True)
tm.assert_frame_equal(res, exp)
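The dtype rules exercised above reduce to: concatenating categoricals with identical categories preserves the categorical dtype, while any mismatch falls back to the inferred common dtype. A minimal sketch:

import pandas as pd

s1 = pd.Series(["a", "b"], dtype="category")
s2 = pd.Series(["a", "b"], dtype="category")
s3 = pd.Series(["b", "c"], dtype="category")
# identical categories on both sides -> result stays categorical
assert pd.concat([s1, s2], ignore_index=True).dtype == "category"
# mismatched categories -> result falls back to object
assert pd.concat([s1, s3], ignore_index=True).dtype == object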


@@ -0,0 +1,273 @@
from datetime import datetime
import numpy as np
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
)
import pandas._testing as tm
class TestCategoricalConcat:
def test_categorical_concat(self, sort):
# See GH 10177
df1 = DataFrame(
np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
)
df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
df2["h"] = Series(Categorical(cat_values))
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
exp = DataFrame(
{
"a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
"b": [
1,
4,
7,
10,
13,
16,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
],
"c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
"h": [None] * 6 + cat_values,
}
)
exp["h"] = exp["h"].astype(df2["h"].dtype)
tm.assert_frame_equal(res, exp)
def test_categorical_concat_dtypes(self, using_infer_string):
# GH8143
index = ["cat", "obj", "num"]
cat = Categorical(["a", "b", "c"])
obj = Series(["a", "b", "c"])
num = Series([1, 2, 3])
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
result = df.dtypes == (
object if not using_infer_string else "string[pyarrow_numpy]"
)
expected = Series([False, True, False], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == "int64"
expected = Series([False, False, True], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == "category"
expected = Series([True, False, False], index=index)
tm.assert_series_equal(result, expected)
def test_concat_categoricalindex(self):
# GH 16111, categories that aren't lexsorted
categories = [9, 0, 1, 2, 3]
a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
result = pd.concat([a, b, c], axis=1)
exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
exp = DataFrame(
{
0: [1, 1, np.nan, np.nan],
1: [np.nan, 2, 2, np.nan],
2: [np.nan, np.nan, 3, 3],
},
columns=[0, 1, 2],
index=exp_idx,
)
tm.assert_frame_equal(result, exp)
def test_categorical_concat_preserve(self):
# GH 8641 series concat not preserving category dtype
# GH 13524 can concat different categories
s = Series(list("abc"), dtype="category")
s2 = Series(list("abd"), dtype="category")
exp = Series(list("abcabd"))
res = pd.concat([s, s2], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list("abcabc"), dtype="category")
res = pd.concat([s, s], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
res = pd.concat([s, s])
tm.assert_series_equal(res, exp)
a = Series(np.arange(6, dtype="int64"))
b = Series(list("aabbca"))
df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
res = pd.concat([df2, df2])
exp = DataFrame(
{
"A": pd.concat([a, a]),
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
}
)
tm.assert_frame_equal(res, exp)
def test_categorical_index_preserver(self):
a = Series(np.arange(6, dtype="int64"))
b = Series(list("aabbca"))
df2 = DataFrame(
{"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
).set_index("B")
result = pd.concat([df2, df2])
expected = DataFrame(
{
"A": pd.concat([a, a]),
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
}
).set_index("B")
tm.assert_frame_equal(result, expected)
# wrong categories -> uses concat_compat, which casts to object
df3 = DataFrame(
{"A": a, "B": Categorical(b, categories=list("abe"))}
).set_index("B")
result = pd.concat([df2, df3])
expected = pd.concat(
[
df2.set_axis(df2.index.astype(object), axis=0),
df3.set_axis(df3.index.astype(object), axis=0),
]
)
tm.assert_frame_equal(result, expected)
def test_concat_categorical_tz(self):
# GH-23816
a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
b = Series(["a", "b"], dtype="category")
result = pd.concat([a, b], ignore_index=True)
expected = Series(
[
pd.Timestamp("2017-01-01", tz="US/Pacific"),
pd.Timestamp("2017-01-02", tz="US/Pacific"),
"a",
"b",
]
)
tm.assert_series_equal(result, expected)
def test_concat_categorical_datetime(self):
# GH-39443
df1 = DataFrame(
{"x": Series(datetime(2021, 1, 1), index=[0], dtype="category")}
)
df2 = DataFrame(
{"x": Series(datetime(2021, 1, 2), index=[1], dtype="category")}
)
result = pd.concat([df1, df2])
expected = DataFrame(
{"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}
)
tm.assert_equal(result, expected)
def test_concat_categorical_unchanged(self):
# GH-12007
# test fix for when concat on categorical and float
# coerces dtype categorical -> float
df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
result = pd.concat([df, ser], axis=1)
expected = DataFrame(
{
"A": Series(["a", "b", "c", np.nan], dtype="category"),
"B": Series([0, 1, np.nan, 2], dtype="float"),
}
)
tm.assert_equal(result, expected)
def test_categorical_concat_gh7864(self):
# GH 7864
# make sure ordering is preserved
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
df["grade"] = Categorical(df["raw_grade"])
df["grade"].cat.set_categories(["e", "a", "b"])
df1 = df[0:3]
df2 = df[3:]
tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
dfx = pd.concat([df1, df2])
tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
dfa = df1._append(df2)
tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)
def test_categorical_index_upcast(self):
# GH 17629
# test upcasting to object when concatenating on categorical indexes
# with non-identical categories
a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))
res = pd.concat([a, b])
exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])
tm.assert_equal(res, exp)
a = Series([1, 2], index=Categorical(["foo", "bar"]))
b = Series([4, 3], index=Categorical(["baz", "bar"]))
res = pd.concat([a, b])
exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])
tm.assert_equal(res, exp)
def test_categorical_missing_from_one_frame(self):
# GH 25412
df1 = DataFrame({"f1": [1, 2, 3]})
df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
result = pd.concat([df1, df2], sort=True)
dtype = CategoricalDtype([4])
expected = DataFrame(
{
"f1": [1, 2, 3, 2, 3, 1],
"f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
},
index=[0, 1, 2, 0, 1, 2],
)
tm.assert_frame_equal(result, expected)
def test_concat_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/24845
c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
c3 = pd.CategoricalIndex(
["a", "a", "b", "b"], categories=["a", "b"], ordered=False
)
df1 = DataFrame({"A": [1, 2]}, index=c1)
df2 = DataFrame({"A": [3, 4]}, index=c2)
result = pd.concat((df1, df2))
expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
tm.assert_frame_equal(result, expected)
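The same object-upcast rule applies along the index axis, as test_categorical_index_upcast above shows: concatenating on CategoricalIndexes with non-identical categories yields a plain Index. A minimal sketch:

import pandas as pd

a = pd.Series([1, 2], index=pd.CategoricalIndex(["x", "y"]))
b = pd.Series([3, 4], index=pd.CategoricalIndex(["y", "z"]))
res = pd.concat([a, b])
# categories differ ({"x","y"} vs {"y","z"}), so the result index is
# upcast to a plain object Index rather than a CategoricalIndex
assert not isinstance(res.index, pd.CategoricalIndex)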


@@ -0,0 +1,912 @@
from collections import (
abc,
deque,
)
from collections.abc import Iterator
from datetime import datetime
from decimal import Decimal
import numpy as np
import pytest
from pandas.errors import InvalidIndexError
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
from pandas.tests.extension.decimal import to_decimal
class TestConcatenate:
def test_append_concat(self):
# GH#1815
d1 = date_range("12/31/1990", "12/31/1999", freq="YE-DEC")
d2 = date_range("12/31/2000", "12/31/2009", freq="YE-DEC")
s1 = Series(np.random.default_rng(2).standard_normal(10), d1)
s2 = Series(np.random.default_rng(2).standard_normal(10), d2)
s1 = s1.to_period()
s2 = s2.to_period()
# drops index
result = concat([s1, s2])
assert isinstance(result.index, PeriodIndex)
assert result.index[0] == s1.index[0]
def test_concat_copy(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
df3 = DataFrame({5: "foo"}, index=range(4))
# These are actual copies.
result = concat([df, df2, df3], axis=1, copy=True)
if not using_copy_on_write:
for arr in result._mgr.arrays:
assert not any(
np.shares_memory(arr, y)
for x in [df, df2, df3]
for y in x._mgr.arrays
)
else:
for arr in result._mgr.arrays:
assert arr.base is not None
# These are the same.
result = concat([df, df2, df3], axis=1, copy=False)
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
assert arr.base is df._mgr.arrays[0].base
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
if using_array_manager:
# we get the same array object, which has no base
assert arr is df3._mgr.arrays[0]
else:
assert arr.base is not None
# Float block was consolidated.
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
result = concat([df, df2, df3, df4], axis=1, copy=False)
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
if using_array_manager or using_copy_on_write:
# this is a view on some array in either df or df4
assert any(
np.shares_memory(arr, other)
for other in df._mgr.arrays + df4._mgr.arrays
)
else:
# the block was consolidated, so we got a copy anyway
assert arr.base is None
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
# this is a view on df3
assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
def test_concat_with_group_keys(self):
# axis=0
df = DataFrame(np.random.default_rng(2).standard_normal((3, 4)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
result = concat([df, df2], keys=[0, 1])
exp_index = MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]]
)
expected = DataFrame(np.r_[df.values, df2.values], index=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1])
exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
expected = DataFrame(np.r_[df.values, df.values], index=exp_index2)
tm.assert_frame_equal(result, expected)
# axis=1
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
result = concat([df, df2], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2)
tm.assert_frame_equal(result, expected)
def test_concat_keys_specific_levels(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
level = ["three", "two", "one", "zero"]
result = concat(
pieces,
axis=1,
keys=["one", "two", "three"],
levels=[level],
names=["group_key"],
)
tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key"))
tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3]))
assert result.columns.names == ["group_key", None]
@pytest.mark.parametrize("mapping", ["mapping", "dict"])
def test_concat_mapping(self, mapping, non_dict_mapping_subclass):
constructor = dict if mapping == "dict" else non_dict_mapping_subclass
frames = constructor(
{
"foo": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"bar": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"baz": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"qux": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
}
)
sorted_keys = list(frames.keys())
result = concat(frames)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
tm.assert_frame_equal(result, expected)
result = concat(frames, axis=1)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1)
tm.assert_frame_equal(result, expected)
keys = ["baz", "foo", "bar"]
result = concat(frames, keys=keys)
expected = concat([frames[k] for k in keys], keys=keys)
tm.assert_frame_equal(result, expected)
def test_concat_keys_and_levels(self):
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)))
levels = [["foo", "baz"], ["one", "two"]]
names = ["first", "second"]
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
levels=levels,
names=names,
)
expected = concat([df, df2, df, df2])
exp_index = MultiIndex(
levels=levels + [[0]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]],
names=names + [None],
)
expected.index = exp_index
tm.assert_frame_equal(result, expected)
# no names
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
levels=levels,
)
assert result.index.names == (None,) * 3
# no levels
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
names=["first", "second"],
)
assert result.index.names == ("first", "second", None)
tm.assert_index_equal(
result.index.levels[0], Index(["baz", "foo"], name="first")
)
def test_concat_keys_levels_no_overlap(self):
# GH #1406
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
msg = "Values not found in passed level"
with pytest.raises(ValueError, match=msg):
concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
msg = "Key one not in level"
with pytest.raises(ValueError, match=msg):
concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
def test_crossed_dtypes_weird_corner(self):
columns = ["A", "B", "C", "D"]
df1 = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="f8"),
"B": np.array([1, 2, 3, 4], dtype="i8"),
"C": np.array([1, 2, 3, 4], dtype="f8"),
"D": np.array([1, 2, 3, 4], dtype="i8"),
},
columns=columns,
)
df2 = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="i8"),
"B": np.array([1, 2, 3, 4], dtype="f8"),
"C": np.array([1, 2, 3, 4], dtype="i8"),
"D": np.array([1, 2, 3, 4], dtype="f8"),
},
columns=columns,
)
appended = concat([df1, df2], ignore_index=True)
expected = DataFrame(
np.concatenate([df1.values, df2.values], axis=0), columns=columns
)
tm.assert_frame_equal(appended, expected)
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
result = concat([df, df2], keys=["one", "two"], names=["first", "second"])
assert result.index.names == ("first", "second")
def test_with_mixed_tuples(self, sort):
# 10697
# columns have mixed tuples, so handle properly
df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2))
df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2))
# it works
concat([df1, df2], sort=sort)
def test_concat_mixed_objs_columns(self):
# Test column-wise concat for mixed series/frames (axis=1)
# GH2385
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0]
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1]
)
result = concat([s1, s2], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
)
result = concat([s1, s2, s1], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3]
)
result = concat([s1, df, s2, s2, s1], axis=1)
tm.assert_frame_equal(result, expected)
# with names
s1.name = "foo"
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0]
)
result = concat([s1, df, s2], axis=1)
tm.assert_frame_equal(result, expected)
s2.name = "bar"
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"]
)
result = concat([s1, df, s2], axis=1)
tm.assert_frame_equal(result, expected)
# ignore index
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
)
result = concat([s1, df, s2], axis=1, ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_concat_mixed_objs_index(self):
# Test row-wise concat for mixed series/frames with a common name
# GH2385, GH15047
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0]
)
result = concat([s1, df, s2])
tm.assert_frame_equal(result, expected)
def test_concat_mixed_objs_index_names(self):
# Test row-wise concat for mixed series/frames with distinct names
# GH2385, GH15047
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index, name="foo")
s2 = Series(arr, index=index, name="bar")
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T,
index=index.tolist() * 3,
columns=["foo", 0, "bar"],
)
result = concat([s1, df, s2])
tm.assert_frame_equal(result, expected)
# Rename all series to 0 when ignore_index=True
expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
result = concat([s1, df, s2], ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_dtype_coercion(self):
# 12411
df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
# 12045
df = DataFrame({"date": [datetime(2012, 1, 1), datetime(1012, 1, 2)]})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
# 11594
df = DataFrame({"text": ["some words"] + [None] * 9})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
def test_concat_single_with_key(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
result = concat([df], keys=["foo"])
expected = concat([df, df], keys=["foo", "bar"])
tm.assert_frame_equal(result, expected[:10])
def test_concat_no_items_raises(self):
with pytest.raises(ValueError, match="No objects to concatenate"):
concat([])
def test_concat_exclude_none(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
pieces = [df[:5], None, None, df[5:]]
result = concat(pieces)
tm.assert_frame_equal(result, df)
with pytest.raises(ValueError, match="All objects passed were None"):
concat([None, None])
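As test_concat_exclude_none just above demonstrates, concat drops None entries but refuses an input that is entirely None. A minimal sketch of the permissive case:

import pandas as pd

pieces = [pd.DataFrame({"a": [1]}), None, pd.DataFrame({"a": [2]})]
# None entries are silently skipped as long as one real object remains
out = pd.concat(pieces, ignore_index=True)
assert out["a"].tolist() == [1, 2]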
def test_concat_keys_with_none(self):
# #1649
df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
result = concat({"a": None, "b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
expected = concat({"b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
tm.assert_frame_equal(result, expected)
result = concat(
[None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"]
)
expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"])
tm.assert_frame_equal(result, expected)
def test_concat_bug_1719(self):
ts1 = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
ts2 = ts1.copy()[::2]
# to join with union
# these two are of different length!
left = concat([ts1, ts2], join="outer", axis=1)
right = concat([ts2, ts1], join="outer", axis=1)
assert len(left) == len(right)
def test_concat_bug_2972(self):
ts0 = Series(np.zeros(5))
ts1 = Series(np.ones(5))
ts0.name = ts1.name = "same name"
result = concat([ts0, ts1], axis=1)
expected = DataFrame({0: ts0, 1: ts1})
expected.columns = ["same name", "same name"]
tm.assert_frame_equal(result, expected)
def test_concat_bug_3602(self):
# GH 3602, duplicate columns
df1 = DataFrame(
{
"firmNo": [0, 0, 0, 0],
"prc": [6, 6, 6, 6],
"stringvar": ["rrr", "rrr", "rrr", "rrr"],
}
)
df2 = DataFrame(
{"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]}
)
expected = DataFrame(
[
[0, 6, "rrr", 9, 1, 6],
[0, 6, "rrr", 10, 2, 6],
[0, 6, "rrr", 11, 3, 6],
[0, 6, "rrr", 12, 4, 6],
]
)
expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"]
result = concat([df1, df2], axis=1)
tm.assert_frame_equal(result, expected)
def test_concat_iterables(self):
# GH8645 check concat works with tuples, list, generators, and weird
# stuff like deque and custom iterables
df1 = DataFrame([1, 2, 3])
df2 = DataFrame([4, 5, 6])
expected = DataFrame([1, 2, 3, 4, 5, 6])
tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
tm.assert_frame_equal(
concat((df for df in (df1, df2)), ignore_index=True), expected
)
tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
class CustomIterator1:
def __len__(self) -> int:
return 2
def __getitem__(self, index):
try:
return {0: df1, 1: df2}[index]
except KeyError as err:
raise IndexError from err
tm.assert_frame_equal(concat(CustomIterator1(), ignore_index=True), expected)
class CustomIterator2(abc.Iterable):
def __iter__(self) -> Iterator:
yield df1
yield df2
tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected)
def test_concat_order(self):
# GH 17344, GH#47331
dfs = [DataFrame(index=range(3), columns=["a", 1, None])]
dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)]
result = concat(dfs, sort=True).columns
expected = Index([1, "a", None])
tm.assert_index_equal(result, expected)
def test_concat_different_extension_dtypes_upcasts(self):
a = Series(pd.array([1, 2], dtype="Int64"))
b = Series(to_decimal([1, 2]))
result = concat([a, b], ignore_index=True)
expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object)
tm.assert_series_equal(result, expected)
def test_concat_ordered_dict(self):
# GH 21510
expected = concat(
[Series(range(3)), Series(range(4))], keys=["First", "Another"]
)
result = concat({"First": Series(range(3)), "Another": Series(range(4))})
tm.assert_series_equal(result, expected)
def test_concat_duplicate_indices_raise(self):
# GH 45888: test raise for concat DataFrames with duplicate indices
# https://github.com/pandas-dev/pandas/issues/36263
df1 = DataFrame(
np.random.default_rng(2).standard_normal(5),
index=[0, 1, 2, 3, 3],
columns=["a"],
)
df2 = DataFrame(
np.random.default_rng(2).standard_normal(5),
index=[0, 1, 2, 2, 4],
columns=["b"],
)
msg = "Reindexing only valid with uniquely valued Index objects"
with pytest.raises(InvalidIndexError, match=msg):
concat([df1, df2], axis=1)
def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series):
# GH 13247
dims = frame_or_series(dtype=object).ndim
dt = float_numpy_dtype
dfs = [
frame_or_series(np.array([1], dtype=dt, ndmin=dims)),
frame_or_series(np.array([np.nan], dtype=dt, ndmin=dims)),
frame_or_series(np.array([5], dtype=dt, ndmin=dims)),
]
x = concat(dfs)
assert x.values.dtype == dt
@pytest.mark.parametrize("pdt", [Series, DataFrame])
def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype):
dt = any_signed_int_numpy_dtype
dims = pdt().ndim
dfs = [
pdt(np.array([1], dtype=dt, ndmin=dims)),
pdt(np.array([np.nan], ndmin=dims)),
pdt(np.array([5], dtype=dt, ndmin=dims)),
]
x = concat(dfs)
assert x.values.dtype == "float64"
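# For illustration: the upcast above mirrors plain numpy promotion, where NaN
# forces a float result. A minimal sketch:
#   np.concatenate([np.array([1], dtype="int64"), np.array([np.nan])]).dtype
#   # -> dtype('float64')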
def test_concat_empty_and_non_empty_frame_regression():
# GH 18178 regression test
df1 = DataFrame({"foo": [1]})
df2 = DataFrame({"foo": []})
expected = DataFrame({"foo": [1.0]})
result = concat([df1, df2])
tm.assert_frame_equal(result, expected)
def test_concat_sparse():
# GH 23557
a = Series(SparseArray([0, 1, 2]))
expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
pd.SparseDtype(np.int64, 0)
)
result = concat([a, a], axis=1)
tm.assert_frame_equal(result, expected)
def test_concat_dense_sparse():
# GH 30668
dtype = pd.SparseDtype(np.float64, None)
a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
b = Series([1], dtype=float)
expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
result = concat([a, b], axis=0)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]])
def test_duplicate_keys(keys):
# GH 33654
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
s1 = Series([7, 8, 9], name="c")
s2 = Series([10, 11, 12], name="d")
result = concat([df, s1, s2], axis=1, keys=keys)
expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
expected_columns = MultiIndex.from_tuples(
[(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")]
)
expected = DataFrame(expected_values, columns=expected_columns)
tm.assert_frame_equal(result, expected)
def test_duplicate_keys_same_frame():
# GH 43595
keys = ["e", "e"]
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result = concat([df, df], axis=1, keys=keys)
expected_values = [[1, 4, 1, 4], [2, 5, 2, 5], [3, 6, 3, 6]]
expected_columns = MultiIndex.from_tuples(
[(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")]
)
expected = DataFrame(expected_values, columns=expected_columns)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
@pytest.mark.parametrize(
"obj",
[
tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
tm.SubclassedSeries(np.arange(0, 10), name="A"),
],
)
def test_concat_preserves_subclass(obj):
# GH28330 -- preserve subclass
result = concat([obj, obj])
assert isinstance(result, type(obj))
def test_concat_frame_axis0_extension_dtypes():
# preserve extension dtype (through common_dtype mechanism)
df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
df2 = DataFrame({"a": np.array([4, 5, 6])})
result = concat([df1, df2], ignore_index=True)
expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
tm.assert_frame_equal(result, expected)
result = concat([df2, df1], ignore_index=True)
expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
tm.assert_frame_equal(result, expected)
def test_concat_preserves_extension_int64_dtype():
# GH 24768
df_a = DataFrame({"a": [-1]}, dtype="Int64")
df_b = DataFrame({"b": [1]}, dtype="Int64")
result = concat([df_a, df_b], ignore_index=True)
expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
tm.assert_frame_equal(result, expected)
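# For illustration: under the masked "Int64" dtype the holes created by the
# non-overlapping columns are pd.NA rather than np.nan, so both columns stay
# nullable integer instead of upcasting to float.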
@pytest.mark.parametrize(
"dtype1,dtype2,expected_dtype",
[
("bool", "bool", "bool"),
("boolean", "bool", "boolean"),
("bool", "boolean", "boolean"),
("boolean", "boolean", "boolean"),
],
)
def test_concat_bool_types(dtype1, dtype2, expected_dtype):
# GH 42800
ser1 = Series([True, False], dtype=dtype1)
ser2 = Series([False, True], dtype=dtype2)
result = concat([ser1, ser2], ignore_index=True)
expected = Series([True, False, False, True], dtype=expected_dtype)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
("keys", "integrity"),
[
(["red"] * 3, True),
(["red"] * 3, False),
(["red", "blue", "red"], False),
(["red", "blue", "red"], True),
],
)
def test_concat_repeated_keys(keys, integrity):
# GH: 20816
series_list = [Series({"a": 1}), Series({"b": 2}), Series({"c": 3})]
result = concat(series_list, keys=keys, verify_integrity=integrity)
tuples = list(zip(keys, ["a", "b", "c"]))
expected = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples))
tm.assert_series_equal(result, expected)
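# For illustration: repeated keys pass even with verify_integrity=True because
# integrity is checked on the resulting (key, label) MultiIndex, and the inner
# labels "a"/"b"/"c" keep every tuple unique.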
def test_concat_null_object_with_dti():
# GH#40841
dti = pd.DatetimeIndex(
["2021-04-08 21:21:14+00:00"], dtype="datetime64[ns, UTC]", name="Time (UTC)"
)
right = DataFrame(data={"C": [0.5274]}, index=dti)
idx = Index([None], dtype="object", name="Maybe Time (UTC)")
left = DataFrame(data={"A": [None], "B": [np.nan]}, index=idx)
result = concat([left, right], axis="columns")
exp_index = Index([None, dti[0]], dtype=object)
expected = DataFrame(
{
"A": np.array([None, np.nan], dtype=object),
"B": [np.nan, np.nan],
"C": [np.nan, 0.5274],
},
index=exp_index,
)
tm.assert_frame_equal(result, expected)
def test_concat_multiindex_with_empty_rangeindex():
# GH#41234
mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
df1 = DataFrame([[1, 2]], columns=mi)
df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))
result = concat([df1, df2])
expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
Series(data=[1, 2]),
DataFrame(
data={
"col1": [1, 2],
}
),
DataFrame(dtype=float),
Series(dtype=float),
],
)
def test_concat_drop_attrs(data):
# GH#41828
df1 = data.copy()
df1.attrs = {1: 1}
df2 = data.copy()
df2.attrs = {1: 2}
df = concat([df1, df2])
assert len(df.attrs) == 0
@pytest.mark.parametrize(
"data",
[
Series(data=[1, 2]),
DataFrame(
data={
"col1": [1, 2],
}
),
DataFrame(dtype=float),
Series(dtype=float),
],
)
def test_concat_retain_attrs(data):
# GH#41828
df1 = data.copy()
df1.attrs = {1: 1}
df2 = data.copy()
df2.attrs = {1: 1}
df = concat([df1, df2])
assert df.attrs[1] == 1
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
# https://github.com/pandas-dev/pandas/issues/45637
df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
warn = None
if df_dtype == "datetime64[ns]" or (
df_dtype == "float64" and empty_dtype != "float64"
):
warn = FutureWarning
with tm.assert_produces_warning(warn, match=msg):
result = concat([empty, df])
expected = df
if df_dtype == "int64":
# TODO what exact behaviour do we want for integer eventually?
if empty_dtype == "float64":
expected = df.astype("float64")
else:
expected = df.astype("object")
tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)
if df_dtype == "int64":
# TODO what exact behaviour do we want for integer eventually?
if empty_dtype == "object":
df_dtype = "object"
else:
df_dtype = "float64"
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
warn = None
if empty_dtype != df_dtype and empty_dtype is not None:
warn = FutureWarning
elif df_dtype == "datetime64[ns]":
warn = FutureWarning
with tm.assert_produces_warning(warn, match=msg):
result = concat([empty, df], ignore_index=True)
expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test
def test_concat_ignore_empty_from_reindex():
# https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
df2 = DataFrame({"a": [2]})
aligned = df2.reindex(columns=df1.columns)
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df1, aligned], ignore_index=True)
    expected = DataFrame(
        {"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]}
    )
tm.assert_frame_equal(result, expected)
def test_concat_mismatched_keys_length():
# GH#43485
ser = Series(range(5))
sers = [ser + n for n in range(4)]
keys = ["A", "B", "C"]
msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
concat(sers, keys=keys, axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat(sers, keys=keys, axis=0)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat((x for x in sers), keys=(y for y in keys), axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat((x for x in sers), keys=(y for y in keys), axis=0)
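# For illustration: the deprecation above only covers mismatched lengths;
# passing keys of the same length as objs is fine and does not warn, e.g.
#   concat(sers, keys=["A", "B", "C", "D"], axis=1)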
def test_concat_multiindex_with_category():
df1 = DataFrame(
{
"c1": Series(list("abc"), dtype="category"),
"c2": Series(list("eee"), dtype="category"),
"i2": Series([1, 2, 3]),
}
)
df1 = df1.set_index(["c1", "c2"])
df2 = DataFrame(
{
"c1": Series(list("abc"), dtype="category"),
"c2": Series(list("eee"), dtype="category"),
"i2": Series([4, 5, 6]),
}
)
df2 = df2.set_index(["c1", "c2"])
result = concat([df1, df2])
expected = DataFrame(
{
"c1": Series(list("abcabc"), dtype="category"),
"c2": Series(list("eeeeee"), dtype="category"),
"i2": Series([1, 2, 3, 4, 5, 6]),
}
)
expected = expected.set_index(["c1", "c2"])
tm.assert_frame_equal(result, expected)
def test_concat_ea_upcast():
# GH#54848
df1 = DataFrame(["a"], dtype="string")
df2 = DataFrame([1], dtype="Int64")
result = concat([df1, df2])
expected = DataFrame(["a", 1], index=[0, 0])
tm.assert_frame_equal(result, expected)
def test_concat_none_with_timezone_timestamp():
# GH#52093
df1 = DataFrame([{"A": None}])
df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df1, df2], ignore_index=True)
expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,230 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
concat,
)
import pandas._testing as tm
class TestDataFrameConcat:
def test_concat_multiple_frames_dtypes(self):
# GH#2759
        df1 = DataFrame(
            data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64
        )
df2 = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
results = concat((df1, df2), axis=1).dtypes
expected = Series(
[np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
index=["foo", "bar", 0, 1],
)
tm.assert_series_equal(results, expected)
def test_concat_tuple_keys(self):
# GH#14438
df1 = DataFrame(np.ones((2, 2)), columns=list("AB"))
df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
results = concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
expected = DataFrame(
{
"A": {
("bee", "bah", 0): 1.0,
("bee", "bah", 1): 1.0,
("bee", "boo", 0): 2.0,
("bee", "boo", 1): 2.0,
("bee", "boo", 2): 2.0,
},
"B": {
("bee", "bah", 0): 1.0,
("bee", "bah", 1): 1.0,
("bee", "boo", 0): 2.0,
("bee", "boo", 1): 2.0,
("bee", "boo", 2): 2.0,
},
}
)
tm.assert_frame_equal(results, expected)
def test_concat_named_keys(self):
# GH#14252
df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
index = Index(["a", "b"], name="baz")
concatted_named_from_keys = concat([df, df], keys=index)
expected_named = DataFrame(
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
)
tm.assert_frame_equal(concatted_named_from_keys, expected_named)
index_no_name = Index(["a", "b"], name=None)
concatted_named_from_names = concat([df, df], keys=index_no_name, names=["baz"])
tm.assert_frame_equal(concatted_named_from_names, expected_named)
concatted_unnamed = concat([df, df], keys=index_no_name)
expected_unnamed = DataFrame(
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
)
tm.assert_frame_equal(concatted_unnamed, expected_unnamed)
def test_concat_axis_parameter(self):
# GH#14369
df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2))
df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2))
# Index/row/0 DataFrame
expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
concatted_index = concat([df1, df2], axis="index")
tm.assert_frame_equal(concatted_index, expected_index)
concatted_row = concat([df1, df2], axis="rows")
tm.assert_frame_equal(concatted_row, expected_index)
concatted_0 = concat([df1, df2], axis=0)
tm.assert_frame_equal(concatted_0, expected_index)
# Columns/1 DataFrame
expected_columns = DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
)
concatted_columns = concat([df1, df2], axis="columns")
tm.assert_frame_equal(concatted_columns, expected_columns)
concatted_1 = concat([df1, df2], axis=1)
tm.assert_frame_equal(concatted_1, expected_columns)
series1 = Series([0.1, 0.2])
series2 = Series([0.3, 0.4])
# Index/row/0 Series
expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
concatted_index_series = concat([series1, series2], axis="index")
tm.assert_series_equal(concatted_index_series, expected_index_series)
concatted_row_series = concat([series1, series2], axis="rows")
tm.assert_series_equal(concatted_row_series, expected_index_series)
concatted_0_series = concat([series1, series2], axis=0)
tm.assert_series_equal(concatted_0_series, expected_index_series)
# Columns/1 Series
expected_columns_series = DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
)
concatted_columns_series = concat([series1, series2], axis="columns")
tm.assert_frame_equal(concatted_columns_series, expected_columns_series)
concatted_1_series = concat([series1, series2], axis=1)
tm.assert_frame_equal(concatted_1_series, expected_columns_series)
# Testing ValueError
with pytest.raises(ValueError, match="No axis named"):
concat([series1, series2], axis="something")
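    # For illustration: axis accepts the aliases 0 / "index" / "rows" for
    # row-wise concatenation and 1 / "columns" for column-wise; any other
    # value raises ValueError (matched above via "No axis named").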
def test_concat_numerical_names(self):
# GH#15262, GH#12223
df = DataFrame(
{"col": range(9)},
dtype="int32",
index=(
pd.MultiIndex.from_product(
[["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
)
),
)
result = concat((df.iloc[:2, :], df.iloc[-2:, :]))
expected = DataFrame(
{"col": [0, 1, 7, 8]},
dtype="int32",
index=pd.MultiIndex.from_tuples(
[("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
),
)
tm.assert_frame_equal(result, expected)
def test_concat_astype_dup_col(self):
# GH#23049
df = DataFrame([{"a": "b"}])
df = concat([df, df], axis=1)
result = df.astype("category")
expected = DataFrame(
np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
).astype("category")
tm.assert_frame_equal(result, expected)
def test_concat_dataframe_keys_bug(self, sort):
t1 = DataFrame(
{"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))}
)
t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))})
# it works
result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
assert list(result.columns) == [("t1", "value"), ("t2", "value")]
def test_concat_bool_with_int(self):
# GH#42092 we may want to change this to return object, but that
# would need a deprecation
df1 = DataFrame(Series([True, False, True, True], dtype="bool"))
df2 = DataFrame(Series([1, 0, 1], dtype="int64"))
result = concat([df1, df2])
expected = concat([df1.astype("int64"), df2])
tm.assert_frame_equal(result, expected)
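    # For illustration: bool currently participates in numeric promotion, so
    # the combined column is int64 with True/False mapped to 1/0. A sketch:
    #   concat([Series([True]), Series([2])]).tolist()  # -> [1, 2]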
def test_concat_duplicates_in_index_with_keys(self):
# GH#42651
index = [1, 1, 3]
data = [1, 2, 3]
df = DataFrame(data=data, index=index)
result = concat([df], keys=["A"], names=["ID", "date"])
mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
expected = DataFrame(data=data, index=mi)
tm.assert_frame_equal(result, expected)
tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("axis", [0, 1])
def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
# based on asv ConcatDataFrames
df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order))
res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)
if not using_copy_on_write:
for arr in res._iter_column_arrays():
for arr2 in df._iter_column_arrays():
assert not np.shares_memory(arr, arr2)
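    # For illustration: with copy=True (and Copy-on-Write disabled) every
    # result column is backed by freshly allocated memory, which is what the
    # np.shares_memory checks above assert; under CoW the copy is deferred
    # until a write actually happens.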
def test_outer_sort_columns(self):
# GH#47127
df1 = DataFrame({"A": [0], "B": [1], 0: 1})
df2 = DataFrame({"A": [100]})
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
tm.assert_frame_equal(result, expected)
def test_inner_sort_columns(self):
# GH#47127
df1 = DataFrame({"A": [0], "B": [1], 0: 1})
df2 = DataFrame({"A": [100], 0: 2})
result = concat([df1, df2], ignore_index=True, join="inner", sort=True)
expected = DataFrame({0: [1, 2], "A": [0, 100]})
tm.assert_frame_equal(result, expected)
def test_sort_columns_one_df(self):
# GH#47127
df1 = DataFrame({"A": [100], 0: 2})
result = concat([df1], ignore_index=True, join="inner", sort=True)
expected = DataFrame({0: [2], "A": [100]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,606 @@
import datetime as dt
from datetime import datetime
import dateutil
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
concat,
date_range,
to_timedelta,
)
import pandas._testing as tm
class TestDatetimeConcat:
def test_concat_datetime64_block(self):
rng = date_range("1/1/2000", periods=10)
df = DataFrame({"time": rng})
result = concat([df, df])
assert (result.iloc[:10]["time"] == rng).all()
assert (result.iloc[10:]["time"] == rng).all()
def test_concat_datetime_datetime64_frame(self):
# GH#2624
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), "hi"])
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
ind = date_range(start="2000/1/1", freq="D", periods=10)
df1 = DataFrame({"date": ind, "test": range(10)})
# it works!
concat([df1, df2_obj])
def test_concat_datetime_timezone(self):
# GH 18523
idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris")
idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h")
df1 = DataFrame({"a": [1, 2, 3]}, index=idx1)
df2 = DataFrame({"b": [1, 2, 3]}, index=idx2)
result = concat([df1, df2], axis=1)
exp_idx = DatetimeIndex(
[
"2011-01-01 00:00:00+01:00",
"2011-01-01 01:00:00+01:00",
"2011-01-01 02:00:00+01:00",
],
dtype="M8[ns, Europe/Paris]",
freq="h",
)
expected = DataFrame(
[[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"]
)
tm.assert_frame_equal(result, expected)
idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo")
df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
result = concat([df1, df3], axis=1)
exp_idx = DatetimeIndex(
[
"2010-12-31 15:00:00+00:00",
"2010-12-31 16:00:00+00:00",
"2010-12-31 17:00:00+00:00",
"2010-12-31 23:00:00+00:00",
"2011-01-01 00:00:00+00:00",
"2011-01-01 01:00:00+00:00",
]
).as_unit("ns")
expected = DataFrame(
[
[np.nan, 1],
[np.nan, 2],
[np.nan, 3],
[1, np.nan],
[2, np.nan],
[3, np.nan],
],
index=exp_idx,
columns=["a", "b"],
)
tm.assert_frame_equal(result, expected)
# GH 13783: Concat after resample
        result = concat(
            [df1.resample("h").mean(), df2.resample("h").mean()], sort=True
        )
expected = DataFrame(
{"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
index=idx1.append(idx1),
)
tm.assert_frame_equal(result, expected)
def test_concat_datetimeindex_freq(self):
# GH 3232
# Monotonic index result
dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC")
data = list(range(100))
expected = DataFrame(data, index=dr)
result = concat([expected[:50], expected[50:]])
tm.assert_frame_equal(result, expected)
# Non-monotonic index result
result = concat([expected[50:], expected[:50]])
expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
expected.index._data.freq = None
tm.assert_frame_equal(result, expected)
def test_concat_multiindex_datetime_object_index(self):
# https://github.com/pandas-dev/pandas/issues/11058
idx = Index(
[dt.date(2013, 1, 1), dt.date(2014, 1, 1), dt.date(2015, 1, 1)],
dtype="object",
)
s = Series(
["a", "b"],
index=MultiIndex.from_arrays(
[
[1, 2],
idx[:-1],
],
names=["first", "second"],
),
)
s2 = Series(
["a", "b"],
index=MultiIndex.from_arrays(
[[1, 2], idx[::2]],
names=["first", "second"],
),
)
mi = MultiIndex.from_arrays(
[[1, 2, 2], idx],
names=["first", "second"],
)
assert mi.levels[1].dtype == object
expected = DataFrame(
[["a", "a"], ["b", np.nan], [np.nan, "b"]],
index=mi,
)
result = concat([s, s2], axis=1)
tm.assert_frame_equal(result, expected)
def test_concat_NaT_series(self):
# GH 11693
# test for merging NaT series with datetime series.
x = Series(
date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern")
)
y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
expected = Series([x[0], x[1], pd.NaT, pd.NaT])
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
# all NaT with tz
expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
result = concat([y, y], ignore_index=True)
tm.assert_series_equal(result, expected)
def test_concat_NaT_series2(self):
# without tz
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h"))
y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h"))
y[:] = pd.NaT
expected = Series([x[0], x[1], pd.NaT, pd.NaT])
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
# all NaT without tz
x[:] = pd.NaT
expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_concat_NaT_dataframes(self, tz):
# GH 12396
dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz)
first = DataFrame({0: dti})
second = DataFrame(
[[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]],
index=[2, 3],
)
expected = DataFrame(
[
pd.NaT,
pd.NaT,
Timestamp("2015/01/01", tz=tz),
Timestamp("2016/01/01", tz=tz),
]
)
result = concat([first, second], axis=0)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
@pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")])
def test_concat_NaT_dataframes_all_NaT_axis_0(
self, tz1, tz2, item, using_array_manager
):
# GH 12396
# tz-naive
first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1))
second = DataFrame([item]).apply(lambda x: x.dt.tz_localize(tz2))
result = concat([first, second], axis=0)
expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0]))
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
if tz1 != tz2:
expected = expected.astype(object)
if item is pd.NaT and not using_array_manager:
# GH#18463
# TODO: setting nan here is to keep the test passing as we
# make assert_frame_equal stricter, but is nan really the
# ideal behavior here?
if tz1 is not None:
expected.iloc[-1, 0] = np.nan
else:
expected.iloc[:-1, 0] = np.nan
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
# GH 12396
first = DataFrame(Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
second = DataFrame(Series([pd.NaT]).dt.tz_localize(tz2), columns=[1])
expected = DataFrame(
{
0: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2),
}
)
result = concat([first, second], axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz1", [None, "UTC"])
@pytest.mark.parametrize("tz2", [None, "UTC"])
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
# GH 12396
# tz-naive
first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
second = DataFrame(
[
[Timestamp("2015/01/01", tz=tz2)],
[Timestamp("2016/01/01", tz=tz2)],
],
index=[2, 3],
)
expected = DataFrame(
[
pd.NaT,
pd.NaT,
Timestamp("2015/01/01", tz=tz2),
Timestamp("2016/01/01", tz=tz2),
]
)
if tz1 != tz2:
expected = expected.astype(object)
result = concat([first, second])
tm.assert_frame_equal(result, expected)
class TestTimezoneConcat:
def test_concat_tz_series(self):
# gh-11755: tz and no tz
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
y = Series(date_range("2012-01-01", "2012-01-02"))
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
def test_concat_tz_series2(self):
# gh-11887: concat tz and object
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
y = Series(["a", "b"])
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
def test_concat_tz_series3(self, unit, unit2):
# see gh-12217 and gh-12306
# Concatenating two UTC times
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
first[0] = first[0].dt.tz_localize("UTC")
second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
second[0] = second[0].dt.tz_localize("UTC")
result = concat([first, second])
exp_unit = tm.get_finest_unit(unit, unit2)
assert result[0].dtype == f"datetime64[{exp_unit}, UTC]"
def test_concat_tz_series4(self, unit, unit2):
# Concatenating two London times
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
first[0] = first[0].dt.tz_localize("Europe/London")
second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
second[0] = second[0].dt.tz_localize("Europe/London")
result = concat([first, second])
exp_unit = tm.get_finest_unit(unit, unit2)
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
def test_concat_tz_series5(self, unit, unit2):
# Concatenating 2+1 London times
first = DataFrame(
[[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]"
)
first[0] = first[0].dt.tz_localize("Europe/London")
second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]")
second[0] = second[0].dt.tz_localize("Europe/London")
result = concat([first, second])
exp_unit = tm.get_finest_unit(unit, unit2)
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
def test_concat_tz_series6(self, unit, unit2):
# Concatenating 1+2 London times
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
first[0] = first[0].dt.tz_localize("Europe/London")
second = DataFrame(
[[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]"
)
second[0] = second[0].dt.tz_localize("Europe/London")
result = concat([first, second])
exp_unit = tm.get_finest_unit(unit, unit2)
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
def test_concat_tz_series_tzlocal(self):
# see gh-13583
x = [
Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()),
Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()),
]
y = [
Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()),
Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()),
]
result = concat([Series(x), Series(y)], ignore_index=True)
tm.assert_series_equal(result, Series(x + y))
assert result.dtype == "datetime64[ns, tzlocal()]"
def test_concat_tz_series_with_datetimelike(self):
# see gh-12620: tz and timedelta
x = [
Timestamp("2011-01-01", tz="US/Eastern"),
Timestamp("2011-02-01", tz="US/Eastern"),
]
y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")]
result = concat([Series(x), Series(y)], ignore_index=True)
tm.assert_series_equal(result, Series(x + y, dtype="object"))
# tz and period
y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")]
result = concat([Series(x), Series(y)], ignore_index=True)
tm.assert_series_equal(result, Series(x + y, dtype="object"))
def test_concat_tz_frame(self):
df2 = DataFrame(
{
"A": Timestamp("20130102", tz="US/Eastern"),
"B": Timestamp("20130603", tz="CET"),
},
index=range(5),
)
# concat
df3 = concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
tm.assert_frame_equal(df2, df3)
def test_concat_multiple_tzs(self):
# GH#12467
# combining datetime tz-aware and naive DataFrames
ts1 = Timestamp("2015-01-01", tz=None)
ts2 = Timestamp("2015-01-01", tz="UTC")
ts3 = Timestamp("2015-01-01", tz="EST")
df1 = DataFrame({"time": [ts1]})
df2 = DataFrame({"time": [ts2]})
df3 = DataFrame({"time": [ts3]})
results = concat([df1, df2]).reset_index(drop=True)
expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
tm.assert_frame_equal(results, expected)
results = concat([df1, df3]).reset_index(drop=True)
expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
tm.assert_frame_equal(results, expected)
results = concat([df2, df3]).reset_index(drop=True)
expected = DataFrame({"time": [ts2, ts3]})
tm.assert_frame_equal(results, expected)
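    # For illustration: a tz-aware timestamp shares no common datetime dtype
    # with a tz-naive one, so the first two combinations fall back to object;
    # the last expected frame leaves the dtype to constructor inference for
    # the mixed-zone case.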
def test_concat_multiindex_with_tz(self):
# GH 6606
df = DataFrame(
{
"dt": DatetimeIndex(
[
datetime(2014, 1, 1),
datetime(2014, 1, 2),
datetime(2014, 1, 3),
],
dtype="M8[ns, US/Pacific]",
),
"b": ["A", "B", "C"],
"c": [1, 2, 3],
"d": [4, 5, 6],
}
)
df = df.set_index(["dt", "b"])
exp_idx1 = DatetimeIndex(
["2014-01-01", "2014-01-02", "2014-01-03"] * 2,
dtype="M8[ns, US/Pacific]",
name="dt",
)
exp_idx2 = Index(["A", "B", "C"] * 2, name="b")
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame(
{"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"]
)
result = concat([df, df])
tm.assert_frame_equal(result, expected)
def test_concat_tz_not_aligned(self):
# GH#22796
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
a = DataFrame({"A": ts})
b = DataFrame({"A": ts, "B": ts})
result = concat([a, b], sort=True, ignore_index=True)
expected = DataFrame(
{"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"t1",
[
"2015-01-01",
pytest.param(
pd.NaT,
marks=pytest.mark.xfail(
reason="GH23037 incorrect dtype when concatenating"
),
),
],
)
def test_concat_tz_NaT(self, t1):
# GH#22796
# Concatenating tz-aware multicolumn DataFrames
ts1 = Timestamp(t1, tz="UTC")
ts2 = Timestamp("2015-01-01", tz="UTC")
ts3 = Timestamp("2015-01-01", tz="UTC")
df1 = DataFrame([[ts1, ts2]])
df2 = DataFrame([[ts3]])
result = concat([df1, df2])
expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
tm.assert_frame_equal(result, expected)
def test_concat_tz_with_empty(self):
# GH 9188
result = concat(
[DataFrame(date_range("2000", periods=1, tz="UTC")), DataFrame()]
)
expected = DataFrame(date_range("2000", periods=1, tz="UTC"))
tm.assert_frame_equal(result, expected)
class TestPeriodConcat:
def test_concat_period_series(self):
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D"))
expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
def test_concat_period_multiple_freq_series(self):
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M"))
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
assert result.dtype == "object"
def test_concat_period_other_series(self):
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M"))
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
assert result.dtype == "object"
def test_concat_period_other_series2(self):
# non-period
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
y = Series(DatetimeIndex(["2015-11-01", "2015-12-01"]))
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
assert result.dtype == "object"
def test_concat_period_other_series3(self):
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
y = Series(["A", "B"])
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
result = concat([x, y], ignore_index=True)
tm.assert_series_equal(result, expected)
assert result.dtype == "object"
def test_concat_timedelta64_block():
rng = to_timedelta(np.arange(10), unit="s")
df = DataFrame({"time": rng})
result = concat([df, df])
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
def test_concat_multiindex_datetime_nat():
# GH#44900
left = DataFrame({"a": 1}, index=MultiIndex.from_tuples([(1, pd.NaT)]))
right = DataFrame(
{"b": 2}, index=MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)])
)
result = concat([left, right], axis="columns")
expected = DataFrame(
{"a": [1.0, np.nan], "b": 2}, MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)])
)
tm.assert_frame_equal(result, expected)
def test_concat_float_datetime64(using_array_manager):
# GH#32934
df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
df_float = DataFrame({"A": pd.array([1.0], dtype="float64")})
expected = DataFrame(
{
"A": [
pd.array(["2000"], dtype="datetime64[ns]")[0],
pd.array([1.0], dtype="float64")[0],
]
},
index=[0, 0],
)
result = concat([df_time, df_float])
tm.assert_frame_equal(result, expected)
expected = DataFrame({"A": pd.array([], dtype="object")})
result = concat([df_time.iloc[:0], df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)
expected = DataFrame({"A": pd.array([1.0], dtype="object")})
result = concat([df_time.iloc[:0], df_float])
tm.assert_frame_equal(result, expected)
if not using_array_manager:
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df_time, df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)
else:
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
{"A": "object"}
)
result = concat([df_time, df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,295 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
RangeIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
class TestEmptyConcat:
def test_handle_empty_objects(self, sort, using_infer_string):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
)
dfcopy = df[:5].copy()
dfcopy["foo"] = "bar"
empty = df[5:5]
frames = [dfcopy, empty, empty, df[5:]]
concatted = concat(frames, axis=0, sort=sort)
expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
expected["foo"] = expected["foo"].astype(
object if not using_infer_string else "string[pyarrow_numpy]"
)
expected.loc[0:4, "foo"] = "bar"
tm.assert_frame_equal(concatted, expected)
# empty as first element with time series
# GH3259
df = DataFrame(
{"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
)
empty = DataFrame()
result = concat([df, empty], axis=1)
tm.assert_frame_equal(result, df)
result = concat([empty, df], axis=1)
tm.assert_frame_equal(result, df)
result = concat([df, empty])
tm.assert_frame_equal(result, df)
result = concat([empty, df])
tm.assert_frame_equal(result, df)
def test_concat_empty_series(self):
# GH 11082
s1 = Series([1, 2, 3], name="x")
s2 = Series(name="y", dtype="float64")
res = concat([s1, s2], axis=1)
exp = DataFrame(
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
index=RangeIndex(3),
)
tm.assert_frame_equal(res, exp)
s1 = Series([1, 2, 3], name="x")
s2 = Series(name="y", dtype="float64")
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = concat([s1, s2], axis=0)
# name will be reset
exp = Series([1, 2, 3])
tm.assert_series_equal(res, exp)
# empty Series with no name
s1 = Series([1, 2, 3], name="x")
s2 = Series(name=None, dtype="float64")
res = concat([s1, s2], axis=1)
exp = DataFrame(
{"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
columns=["x", 0],
index=RangeIndex(3),
)
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize("tz", [None, "UTC"])
@pytest.mark.parametrize("values", [[], [1, 2, 3]])
def test_concat_empty_series_timelike(self, tz, values):
# GH 18447
first = Series([], dtype="M8[ns]").dt.tz_localize(tz)
dtype = None if values else np.float64
second = Series(values, dtype=dtype)
expected = DataFrame(
{
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
1: values,
}
)
result = concat([first, second], axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"left,right,expected",
[
# booleans
(np.bool_, np.int32, np.object_), # changed from int32 in 2.0 GH#39817
(np.bool_, np.float32, np.object_),
# datetime-like
("m8[ns]", np.bool_, np.object_),
("m8[ns]", np.int64, np.object_),
("M8[ns]", np.bool_, np.object_),
("M8[ns]", np.int64, np.object_),
# categorical
("category", "category", "category"),
("category", "object", "object"),
],
)
def test_concat_empty_series_dtypes(self, left, right, expected):
# GH#39817, GH#45101
result = concat([Series(dtype=left), Series(dtype=right)])
assert result.dtype == expected
@pytest.mark.parametrize(
"dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
)
def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
dtype = np.dtype(dtype)
result = concat([Series(dtype=dtype)])
assert result.dtype == dtype
result = concat([Series(dtype=dtype), Series(dtype=dtype)])
assert result.dtype == dtype
@pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
@pytest.mark.parametrize(
"dtype2",
["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
)
def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
# round-tripping with self & like self
if dtype == dtype2:
pytest.skip("same dtype is not applicable for test")
        def int_result_type(dtype, dtype2):
            # integer-like combinations: a signed int wins over unsigned/bool,
            # and an unsigned int wins over bool
            typs = {dtype.kind, dtype2.kind}
            if not len(typs - {"i", "u", "b"}) and (
                dtype.kind == "i" or dtype2.kind == "i"
            ):
                return "i"
            elif not len(typs - {"u", "b"}) and (
                dtype.kind == "u" or dtype2.kind == "u"
            ):
                return "u"
            return None
        def float_result_type(dtype, dtype2):
            # any float mixed with integer kinds yields a float
            typs = {dtype.kind, dtype2.kind}
            if not len(typs - {"f", "i", "u"}) and (
                dtype.kind == "f" or dtype2.kind == "f"
            ):
                return "f"
            return None
        def get_result_type(dtype, dtype2):
            # expected numpy kind: float, else integer-like, else object
            result = float_result_type(dtype, dtype2)
            if result is not None:
                return result
            result = int_result_type(dtype, dtype2)
            if result is not None:
                return result
            return "O"
dtype = np.dtype(dtype)
dtype2 = np.dtype(dtype2)
expected = get_result_type(dtype, dtype2)
result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
assert result.kind == expected
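    # For illustration: for the small integer dtypes used here the expected
    # kinds track numpy promotion (assumption: plain numpy behavior), e.g.
    #   np.promote_types("int8", "uint8")     # -> dtype('int16'), kind "i"
    #   np.promote_types("uint8", "float64")  # -> dtype('float64'), kind "f"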
def test_concat_empty_series_dtypes_triple(self):
assert (
concat(
[Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
).dtype
== np.object_
)
def test_concat_empty_series_dtype_category_with_array(self):
# GH#18515
assert (
concat(
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
).dtype
== "float64"
)
def test_concat_empty_series_dtypes_sparse(self):
result = concat(
[
Series(dtype="float64").astype("Sparse"),
Series(dtype="float64").astype("Sparse"),
]
)
assert result.dtype == "Sparse[float64]"
result = concat(
[Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
)
expected = pd.SparseDtype(np.float64)
assert result.dtype == expected
result = concat(
[Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
)
expected = pd.SparseDtype("object")
assert result.dtype == expected
def test_concat_empty_df_object_dtype(self):
# GH 9149
df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
df_2 = DataFrame(columns=df_1.columns)
result = concat([df_1, df_2], axis=0)
expected = df_1.astype(object)
tm.assert_frame_equal(result, expected)
def test_concat_empty_dataframe_dtypes(self):
df = DataFrame(columns=list("abc"))
df["a"] = df["a"].astype(np.bool_)
df["b"] = df["b"].astype(np.int32)
df["c"] = df["c"].astype(np.float64)
result = concat([df, df])
assert result["a"].dtype == np.bool_
assert result["b"].dtype == np.int32
assert result["c"].dtype == np.float64
result = concat([df, df.astype(np.float64)])
assert result["a"].dtype == np.object_
assert result["b"].dtype == np.float64
assert result["c"].dtype == np.float64
def test_concat_inner_join_empty(self):
# GH 15328
df_empty = DataFrame()
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
result = concat([df_a, df_empty], axis=1, join="inner")
tm.assert_frame_equal(result, df_expected)
result = concat([df_a, df_empty], axis=1, join="outer")
tm.assert_frame_equal(result, df_a)
def test_empty_dtype_coerce(self):
        # xref GH#12411, GH#12045, GH#11594; see also GH#10571
df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
result = concat([df1, df2])
expected = df1.dtypes
tm.assert_series_equal(result.dtypes, expected)
def test_concat_empty_dataframe(self):
# 39037
df1 = DataFrame(columns=["a", "b"])
df2 = DataFrame(columns=["b", "c"])
result = concat([df1, df2, df1])
expected = DataFrame(columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
df3 = DataFrame(columns=["a", "b"])
df4 = DataFrame(columns=["b"])
result = concat([df3, df4])
expected = DataFrame(columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
# 39037
df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
df2 = DataFrame({"a": [1, 2, 3]})
result = concat([df1[:0], df2[:0]])
assert result["a"].dtype == np.int64
assert result["b"].dtype == np.object_ if not using_infer_string else "string"
def test_concat_to_empty_ea(self):
"""48510 `concat` to an empty EA should maintain type EA dtype."""
df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
expected = df_new.copy()
result = concat([df_empty, df_new])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,472 @@
from copy import deepcopy
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
class TestIndexConcat:
def test_concat_ignore_index(self, sort):
frame1 = DataFrame(
{"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
)
frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
frame1.index = Index(["x", "y", "z"])
frame2.index = Index(["x", "y", "q"])
v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
nan = np.nan
expected = DataFrame(
[
[nan, nan, nan, 4.3],
["a", 1, 4.5, 5.2],
["b", 2, 3.2, 2.2],
["c", 3, 1.2, nan],
],
index=Index(["q", "x", "y", "z"]),
)
if not sort:
expected = expected.loc[["x", "y", "z", "q"]]
tm.assert_frame_equal(v1, expected)
@pytest.mark.parametrize(
"name_in1,name_in2,name_in3,name_out",
[
("idx", "idx", "idx", "idx"),
("idx", "idx", None, None),
("idx", None, None, None),
("idx1", "idx2", None, None),
("idx1", "idx1", "idx2", None),
("idx1", "idx2", "idx3", None),
(None, None, None, None),
],
)
def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
# GH13475
indices = [
Index(["a", "b", "c"], name=name_in1),
Index(["b", "c", "d"], name=name_in2),
Index(["c", "d", "e"], name=name_in3),
]
frames = [
DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
]
result = concat(frames, axis=1)
exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
expected = DataFrame(
{
"x": [0, 1, 2, np.nan, np.nan],
"y": [np.nan, 0, 1, 2, np.nan],
"z": [np.nan, np.nan, 0, 1, 2],
},
index=exp_ind,
)
tm.assert_frame_equal(result, expected)
def test_concat_rename_index(self):
a = DataFrame(
np.random.default_rng(2).random((3, 3)),
columns=list("ABC"),
index=Index(list("abc"), name="index_a"),
)
b = DataFrame(
np.random.default_rng(2).random((3, 3)),
columns=list("ABC"),
index=Index(list("abc"), name="index_b"),
)
result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
names = list(exp.index.names)
names[1] = "lvl1"
exp.index.set_names(names, inplace=True)
tm.assert_frame_equal(result, exp)
assert result.index.names == exp.index.names
def test_concat_copy_index_series(self, axis, using_copy_on_write):
# GH 29879
ser = Series([1, 2])
comb = concat([ser, ser], axis=axis, copy=True)
if not using_copy_on_write or axis in [0, "index"]:
assert comb.index is not ser.index
else:
assert comb.index is ser.index
def test_concat_copy_index_frame(self, axis, using_copy_on_write):
# GH 29879
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
comb = concat([df, df], axis=axis, copy=True)
if not using_copy_on_write:
assert not comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)
elif axis in [0, "index"]:
assert not comb.index.is_(df.index)
assert comb.columns.is_(df.columns)
elif axis in [1, "columns"]:
assert comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)
def test_default_index(self):
# is_series and ignore_index
s1 = Series([1, 2, 3], name="x")
s2 = Series([4, 5, 6], name="y")
res = concat([s1, s2], axis=1, ignore_index=True)
assert isinstance(res.columns, pd.RangeIndex)
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
        # use check_index_type=True to check that the result has a
        # RangeIndex (default index)
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
# is_series and all inputs have no names
s1 = Series([1, 2, 3])
s2 = Series([4, 5, 6])
res = concat([s1, s2], axis=1, ignore_index=False)
assert isinstance(res.columns, pd.RangeIndex)
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
exp.columns = pd.RangeIndex(2)
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
# is_dataframe and ignore_index
df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
res = concat([df1, df2], axis=0, ignore_index=True)
exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
res = concat([df1, df2], axis=1, ignore_index=True)
exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
def test_dups_index(self):
# GH 4771
# single dtypes
df = DataFrame(
np.random.default_rng(2).integers(0, 10, size=40).reshape(10, 4),
columns=["A", "A", "C", "C"],
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result.iloc[:, :4], df)
tm.assert_frame_equal(result.iloc[:, 4:], df)
result = concat([df, df], axis=0)
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
# multi dtypes
df = concat(
[
DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=["A", "A", "B", "B"],
),
DataFrame(
np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
columns=["A", "C"],
),
],
axis=1,
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result.iloc[:, :6], df)
tm.assert_frame_equal(result.iloc[:, 6:], df)
result = concat([df, df], axis=0)
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
# append
result = df.iloc[0:8, :]._append(df.iloc[8:])
tm.assert_frame_equal(result, df)
result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
tm.assert_frame_equal(result, df)
expected = concat([df, df], axis=0)
result = df._append(df)
tm.assert_frame_equal(result, expected)
class TestMultiIndexConcat:
def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
index = frame.index
result = concat([frame, frame], keys=[0, 1], names=["iteration"])
assert result.index.names == ("iteration",) + index.names
tm.assert_frame_equal(result.loc[0], frame)
tm.assert_frame_equal(result.loc[1], frame)
assert result.index.nlevels == 3
def test_concat_multiindex_with_none_in_index_names(self):
# GH 15787
index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
result = concat([df, df], keys=[1, 2], names=["level2"])
index = MultiIndex.from_product(
[[1, 2], [1], range(5)], names=["level2", "level1", None]
)
expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
tm.assert_frame_equal(result, expected)
result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
level2 = [1] * 5 + [2] * 2
level1 = [1] * 7
no_name = list(range(5)) + list(range(2))
tuples = list(zip(level2, level1, no_name))
index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
tm.assert_frame_equal(result, expected)
def test_concat_multiindex_rangeindex(self):
# GH13542
        # regression test: concat used to break when the MultiIndex levels
        # are RangeIndex objects and one of the objects has length 1
df = DataFrame(np.random.default_rng(2).standard_normal((9, 2)))
df.index = MultiIndex(
levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
)
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
exp = df.iloc[[2, 3, 4, 5], :]
tm.assert_frame_equal(res, exp)
def test_concat_multiindex_dfs_with_deepcopy(self):
# GH 9967
example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
example_dataframe1 = DataFrame([0], index=example_multiindex1)
example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
example_dataframe2 = DataFrame([1], index=example_multiindex2)
example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
expected_index = MultiIndex(
levels=[["s1", "s2"], ["a"], ["b", "c"]],
codes=[[0, 1], [0, 0], [0, 1]],
names=["testname", None, None],
)
expected = DataFrame([[0], [1]], index=expected_index)
result_copy = concat(deepcopy(example_dict), names=["testname"])
tm.assert_frame_equal(result_copy, expected)
result_no_copy = concat(example_dict, names=["testname"])
tm.assert_frame_equal(result_no_copy, expected)
@pytest.mark.parametrize(
"mi1_list",
[
[["a"], range(2)],
[["b"], np.arange(2.0, 4.0)],
[["c"], ["A", "B"]],
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
],
)
@pytest.mark.parametrize(
"mi2_list",
[
[["a"], range(2)],
[["b"], np.arange(2.0, 4.0)],
[["c"], ["A", "B"]],
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
],
)
def test_concat_with_various_multiindex_dtypes(
self, mi1_list: list, mi2_list: list
):
# GitHub #23478
mi1 = MultiIndex.from_product(mi1_list)
mi2 = MultiIndex.from_product(mi2_list)
df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
if mi1_list[0] == mi2_list[0]:
expected_mi = MultiIndex(
levels=[mi1_list[0], list(mi1_list[1])],
codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
)
else:
expected_mi = MultiIndex(
levels=[
mi1_list[0] + mi2_list[0],
list(mi1_list[1]) + list(mi2_list[1]),
],
codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
)
expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
with tm.assert_produces_warning(None):
result_df = concat((df1, df2), axis=1)
tm.assert_frame_equal(expected_df, result_df)
    def test_concat_multiindex_with_duplicate_index(self):
# GitHub #44786
df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
df = concat([df], keys=["X"])
iterables = [["X"], ["1", "2", "2"]]
result_index = df.index
expected_index = MultiIndex.from_product(iterables)
tm.assert_index_equal(result_index, expected_index)
result_df = df
expected_df = DataFrame(
{"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
)
tm.assert_frame_equal(result_df, expected_df)
def test_concat_with_key_not_unique(self):
# GitHub #46519
df1 = DataFrame({"name": [1]})
df2 = DataFrame({"name": [2]})
df3 = DataFrame({"name": [3]})
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
        # the warning is caused by indexing an unsorted MultiIndex
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_a = df_a.loc[("x", 0), :]
df_b = DataFrame(
{"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
)
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_b = df_b.loc[("x", 0)]
tm.assert_frame_equal(out_a, out_b)
df1 = DataFrame({"name": ["a", "a", "b"]})
df2 = DataFrame({"name": ["a", "b"]})
df3 = DataFrame({"name": ["c", "d"]})
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_a = df_a.loc[("x", 0), :]
df_b = DataFrame(
{
"a": ["x", "x", "x", "y", "y", "x", "x"],
"b": [0, 1, 2, 0, 1, 0, 1],
"name": list("aababcd"),
}
).set_index(["a", "b"])
df_b.index.names = [None, None]
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_b = df_b.loc[("x", 0), :]
tm.assert_frame_equal(out_a, out_b)
def test_concat_with_duplicated_levels(self):
# keyword levels should be unique
df1 = DataFrame({"A": [1]}, index=["x"])
df2 = DataFrame({"A": [1]}, index=["y"])
msg = r"Level values not unique: \['x', 'y', 'y'\]"
with pytest.raises(ValueError, match=msg):
concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
@pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
def test_concat_with_levels_with_none_keys(self, levels):
df1 = DataFrame({"A": [1]}, index=["x"])
df2 = DataFrame({"A": [1]}, index=["y"])
msg = "levels supported only when keys is not None"
with pytest.raises(ValueError, match=msg):
concat([df1, df2], levels=levels)
def test_concat_range_index_result(self):
# GH#47501
df1 = DataFrame({"a": [1, 2]})
df2 = DataFrame({"b": [1, 2]})
result = concat([df1, df2], sort=True, axis=1)
expected = DataFrame({"a": [1, 2], "b": [1, 2]})
tm.assert_frame_equal(result, expected)
expected_index = pd.RangeIndex(0, 2)
tm.assert_index_equal(result.index, expected_index, exact=True)
def test_concat_index_keep_dtype(self):
# GH#47329
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype="object"))
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="object"))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
)
tm.assert_frame_equal(result, expected)
def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
# GH#47329
df1 = DataFrame(
[[0, 1, 1]], columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype)
)
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=any_numeric_ea_dtype))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]],
columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
def test_concat_index_find_common(self, dtype):
# GH#47329
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
)
tm.assert_frame_equal(result, expected)
def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string):
# GH 46675
s1 = Series(["a", "b", "c"])
s2 = Series(["a", "b"])
s3 = Series(["a", "b", "c", "d"])
s4 = Series(
[], dtype=object if not using_infer_string else "string[pyarrow_numpy]"
)
result = concat(
[s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
)
expected = DataFrame(
[
["a"] * 3 + [np.nan],
["b"] * 3 + [np.nan],
["c", np.nan] * 2,
[np.nan] * 2 + ["d"] + [np.nan],
],
dtype=object if not using_infer_string else "string[pyarrow_numpy]",
)
tm.assert_frame_equal(
result, expected, check_index_type=True, check_column_type=True
)
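# Illustrative sketch, not part of the suite above: how the keys argument
# stacks inputs under an outer MultiIndex level, assuming standard pandas
# semantics. A duplicated key leaves the MultiIndex unsorted, which is what
# triggers the "indexing past lexsort depth" PerformanceWarning exercised
# in test_concat_with_key_not_unique.
if __name__ == "__main__":
    import pandas as pd

    parts = [pd.DataFrame({"name": [v]}) for v in (1, 2, 3)]
    stacked = pd.concat(parts, keys=["x", "y", "x"])
    # the outer level repeats "x", so the index is not lexsorted
    assert not stacked.index.is_monotonic_increasing
    print(stacked.loc["x"])  # selects both "x" blocks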

View File

@ -0,0 +1,54 @@
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
concat,
read_csv,
)
import pandas._testing as tm
class TestInvalidConcat:
@pytest.mark.parametrize("obj", [1, {}, [1, 2], (1, 2)])
def test_concat_invalid(self, obj):
# trying to concat a ndframe with a non-ndframe
df1 = DataFrame(range(2))
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
"only Series and DataFrame objs are valid"
)
with pytest.raises(TypeError, match=msg):
concat([df1, obj])
def test_concat_invalid_first_argument(self):
df1 = DataFrame(range(2))
msg = (
"first argument must be an iterable of pandas "
'objects, you passed an object of type "DataFrame"'
)
with pytest.raises(TypeError, match=msg):
concat(df1)
def test_concat_generator_obj(self):
# generator ok though
concat(DataFrame(np.random.default_rng(2).random((5, 5))) for _ in range(3))
def test_concat_textreader_obj(self):
# text reader ok
# GH6583
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
with read_csv(StringIO(data), chunksize=1) as reader:
result = concat(reader, ignore_index=True)
expected = read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
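# Minimal sketch of the contract tested above, assuming current pandas
# behavior: concat accepts any iterable of Series/DataFrame (generators
# included), while a bare DataFrame or scalar first argument raises TypeError.
if __name__ == "__main__":
    import pandas as pd

    frames = (pd.DataFrame({"x": [i]}) for i in range(3))  # generator is fine
    print(pd.concat(frames, ignore_index=True))
    try:
        pd.concat(pd.DataFrame({"x": [0]}))  # not an iterable of pandas objs
    except TypeError as err:
        print(err)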

View File

@ -0,0 +1,175 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
class TestSeriesConcat:
def test_concat_series(self):
ts = Series(
np.arange(20, dtype=np.float64),
index=date_range("2020-01-01", periods=20),
name="foo",
)
pieces = [ts[:5], ts[5:15], ts[15:]]
result = concat(pieces)
tm.assert_series_equal(result, ts)
assert result.name == ts.name
result = concat(pieces, keys=[0, 1, 2])
expected = ts.copy()
ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]"))
exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))]
exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes)
expected.index = exp_index
tm.assert_series_equal(result, expected)
def test_concat_empty_and_non_empty_series_regression(self):
# GH 18187 regression test
s1 = Series([1])
s2 = Series([], dtype=object)
expected = s1
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([s1, s2])
tm.assert_series_equal(result, expected)
def test_concat_series_axis1(self):
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
pieces = [ts[:-2], ts[2:], ts[2:-2]]
result = concat(pieces, axis=1)
expected = DataFrame(pieces).T
tm.assert_frame_equal(result, expected)
result = concat(pieces, keys=["A", "B", "C"], axis=1)
expected = DataFrame(pieces, index=["A", "B", "C"]).T
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_preserves_series_names(self):
# preserve series names, #2489
s = Series(np.random.default_rng(2).standard_normal(5), name="A")
s2 = Series(np.random.default_rng(2).standard_normal(5), name="B")
result = concat([s, s2], axis=1)
expected = DataFrame({"A": s, "B": s2})
tm.assert_frame_equal(result, expected)
s2.name = None
result = concat([s, s2], axis=1)
tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object"))
def test_concat_series_axis1_with_reindex(self, sort):
# must reindex, #2603
s = Series(
np.random.default_rng(2).standard_normal(3), index=["c", "a", "b"], name="A"
)
s2 = Series(
np.random.default_rng(2).standard_normal(4),
index=["d", "a", "b", "c"],
name="B",
)
result = concat([s, s2], axis=1, sort=sort)
expected = DataFrame({"A": s, "B": s2}, index=["c", "a", "b", "d"])
if sort:
expected = expected.sort_index()
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_names_applied(self):
# ensure names argument is not ignored on axis=1, #23490
s = Series([1, 2, 3])
s2 = Series([4, 5, 6])
result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"])
expected = DataFrame(
[[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A")
)
tm.assert_frame_equal(result, expected)
result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"])
expected = DataFrame(
[[1, 4], [2, 5], [3, 6]],
columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]),
)
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_same_names_ignore_index(self):
dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1]
s1 = Series(
np.random.default_rng(2).standard_normal(len(dates)),
index=dates,
name="value",
)
s2 = Series(
np.random.default_rng(2).standard_normal(len(dates)),
index=dates,
name="value",
)
result = concat([s1, s2], axis=1, ignore_index=True)
expected = Index(range(2))
tm.assert_index_equal(result.columns, expected, exact=True)
@pytest.mark.parametrize(
"s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]
)
def test_concat_series_name_npscalar_tuple(self, s1name, s2name):
# GH21015
s1 = Series({"a": 1, "b": 2}, name=s1name)
s2 = Series({"c": 5, "d": 6}, name=s2name)
result = concat([s1, s2])
expected = Series({"a": 1, "b": 2, "c": 5, "d": 6})
tm.assert_series_equal(result, expected)
def test_concat_series_partial_columns_names(self):
# GH10698
named_series = Series([1, 2], name="foo")
unnamed_series1 = Series([1, 2])
unnamed_series2 = Series([4, 5])
result = concat([named_series, unnamed_series1, unnamed_series2], axis=1)
expected = DataFrame(
{"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1]
)
tm.assert_frame_equal(result, expected)
result = concat(
[named_series, unnamed_series1, unnamed_series2],
axis=1,
keys=["red", "blue", "yellow"],
)
expected = DataFrame(
{"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]},
columns=["red", "blue", "yellow"],
)
tm.assert_frame_equal(result, expected)
result = concat(
[named_series, unnamed_series1, unnamed_series2], axis=1, ignore_index=True
)
expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
tm.assert_frame_equal(result, expected)
def test_concat_series_length_one_reversed(self, frame_or_series):
# GH39401
obj = frame_or_series([100])
result = concat([obj.iloc[::-1]])
tm.assert_equal(result, obj)
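# Usage sketch for the axis=1 behavior above, assuming plain pandas defaults:
# each Series becomes a column, keys override the Series names, and unnamed
# inputs fall back to positional integer labels.
if __name__ == "__main__":
    import pandas as pd

    s1 = pd.Series([1, 2], name="A")
    s2 = pd.Series([3, 4])  # unnamed -> positional label 0
    print(pd.concat([s1, s2], axis=1).columns.tolist())  # ['A', 0]
    print(pd.concat([s1, s2], axis=1, keys=["x", "y"]).columns.tolist())  # ['x', 'y']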

View File

@ -0,0 +1,118 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
class TestConcatSort:
def test_concat_sorts_columns(self, sort):
# GH-4588
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
df2 = DataFrame({"a": [3, 4], "c": [5, 6]})
# for sort=True/None
expected = DataFrame(
{"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]},
columns=["a", "b", "c"],
)
if sort is False:
expected = expected[["b", "a", "c"]]
# default
with tm.assert_produces_warning(None):
result = pd.concat([df1, df2], ignore_index=True, sort=sort)
tm.assert_frame_equal(result, expected)
def test_concat_sorts_index(self, sort):
df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"])
df2 = DataFrame({"b": [1, 2]}, index=["a", "b"])
# For True/None
expected = DataFrame(
{"a": [2, 3, 1], "b": [1, 2, None]},
index=["a", "b", "c"],
columns=["a", "b"],
)
if sort is False:
expected = expected.loc[["c", "a", "b"]]
        # no warning is emitted for any sort value
with tm.assert_produces_warning(None):
result = pd.concat([df1, df2], axis=1, sort=sort)
tm.assert_frame_equal(result, expected)
def test_concat_inner_sort(self, sort):
# https://github.com/pandas-dev/pandas/pull/20613
df1 = DataFrame(
{"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]
)
df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4])
with tm.assert_produces_warning(None):
# unset sort should *not* warn for inner join
# since that never sorted
result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True)
expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"])
if sort is True:
expected = expected[["a", "b"]]
tm.assert_frame_equal(result, expected)
def test_concat_aligned_sort(self):
# GH-4588
df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"])
result = pd.concat([df, df], sort=True, ignore_index=True)
expected = DataFrame(
{"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]},
columns=["a", "b", "c"],
)
tm.assert_frame_equal(result, expected)
result = pd.concat(
[df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True
)
expected = expected[["b", "c"]]
tm.assert_frame_equal(result, expected)
def test_concat_aligned_sort_does_not_raise(self):
# GH-4588
# We catch TypeErrors from sorting internally and do not re-raise.
df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"])
expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"])
result = pd.concat([df, df], ignore_index=True, sort=True)
tm.assert_frame_equal(result, expected)
def test_concat_frame_with_sort_false(self):
# GH 43375
result = pd.concat(
[DataFrame({i: i}, index=[i]) for i in range(2, 0, -1)], sort=False
)
expected = DataFrame([[2, np.nan], [np.nan, 1]], index=[2, 1], columns=[2, 1])
tm.assert_frame_equal(result, expected)
# GH 37937
df1 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=[3, 1, 6])
result = pd.concat([df2, df1], axis=1, sort=False)
expected = DataFrame(
[
[7.0, 10.0, 3.0, 6.0],
[8.0, 11.0, 1.0, 4.0],
[9.0, 12.0, np.nan, np.nan],
[np.nan, np.nan, 2.0, 5.0],
],
index=[3, 1, 6, 2],
columns=["c", "d", "a", "b"],
)
tm.assert_frame_equal(result, expected)
def test_concat_sort_none_raises(self):
# GH#41518
df = DataFrame({1: [1, 2], "a": [3, 4]})
msg = "The 'sort' keyword only accepts boolean values; None was passed."
with pytest.raises(ValueError, match=msg):
pd.concat([df, df], sort=None)
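# Quick sketch of the sort keyword semantics exercised above, assuming a
# pandas version where sort=None raises: sort=True orders the
# non-concatenation axis, sort=False keeps the order of appearance.
if __name__ == "__main__":
    import pandas as pd

    df1 = pd.DataFrame({"b": [1], "a": [2]})
    df2 = pd.DataFrame({"a": [3], "c": [4]})
    print(pd.concat([df1, df2], sort=False).columns.tolist())  # ['b', 'a', 'c']
    print(pd.concat([df1, df2], sort=True).columns.tolist())   # ['a', 'b', 'c']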

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,111 @@
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.reshape.merge import (
MergeError,
merge,
)
@pytest.mark.parametrize(
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({input_col: [3, 4]})
left_copy = left.copy()
right_copy = right.copy()
result = merge(left, right, how="cross")
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(left, left_copy)
tm.assert_frame_equal(right, right_copy)
@pytest.mark.parametrize(
"kwargs",
[
{"left_index": True},
{"right_index": True},
{"on": "a"},
{"left_on": "a"},
{"right_on": "b"},
],
)
def test_merge_cross_error_reporting(kwargs):
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"b": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
merge(left, right, how="cross", **kwargs)
def test_merge_cross_mixed_dtypes():
# GH#5401
left = DataFrame(["a", "b", "c"], columns=["A"])
right = DataFrame(range(2), columns=["B"])
result = merge(left, right, how="cross")
expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
tm.assert_frame_equal(result, expected)
def test_merge_cross_more_than_one_column():
# GH#5401
left = DataFrame({"A": list("ab"), "B": [2, 1]})
right = DataFrame({"C": range(2), "D": range(4, 6)})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"A": ["a", "a", "b", "b"],
"B": [2, 2, 1, 1],
"C": [0, 1, 0, 1],
"D": [4, 5, 4, 5],
}
)
tm.assert_frame_equal(result, expected)
def test_merge_cross_null_values(nulls_fixture):
# GH#5401
left = DataFrame({"a": [1, nulls_fixture]})
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
result = merge(left, right, how="cross")
expected = DataFrame(
{
"a": [1, 1, nulls_fixture, nulls_fixture],
"b": ["a", "b", "a", "b"],
"c": [1.0, 2.0, 1.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)
def test_join_cross_error_reporting():
# GH#5401
left = DataFrame({"a": [1, 3]})
right = DataFrame({"a": [3, 4]})
msg = (
"Can not pass on, right_on, left_on or set right_index=True or "
"left_index=True"
)
with pytest.raises(MergeError, match=msg):
left.join(right, how="cross", on="a")
def test_merge_cross_series():
# GH#54055
ls = Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")
rs = Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")
res = merge(ls, rs, how="cross")
expected = merge(ls.to_frame(), rs.to_frame(), how="cross")
tm.assert_frame_equal(res, expected)
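# Minimal sketch of the cross-merge contract above, assuming a pandas version
# with how="cross": every left row pairs with every right row, so the result
# has len(left) * len(right) rows and no key columns are allowed.
if __name__ == "__main__":
    import pandas as pd

    left = pd.DataFrame({"a": [1, 2]})
    right = pd.DataFrame({"b": ["x", "y", "z"]})
    out = pd.merge(left, right, how="cross")
    assert len(out) == len(left) * len(right)  # 2 * 3 = 6 rows
    print(out)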

View File

@ -0,0 +1,186 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
@pytest.fixture
def df1():
return DataFrame(
{
"outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
"inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
"v1": np.linspace(0, 1, 11),
}
)
@pytest.fixture
def df2():
return DataFrame(
{
"outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
"inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
"v2": np.linspace(10, 11, 12),
}
)
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
"""Construct left test DataFrame with specified levels
(any of 'outer', 'inner', and 'v1')
"""
levels = request.param
if levels:
df1 = df1.set_index(levels)
return df1
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
"""Construct right test DataFrame with specified levels
(any of 'outer', 'inner', and 'v2')
"""
levels = request.param
if levels:
df2 = df2.set_index(levels)
return df2
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
"""
Compute the expected merge result for the test case.
    This function computes the expected result of merging two DataFrames on
a combination of their columns and index levels. It does so by
explicitly dropping/resetting their named index levels, performing a
merge on their columns, and then finally restoring the appropriate
index in the result.
Parameters
----------
df_left : DataFrame
The left DataFrame (may have zero or more named index levels)
df_right : DataFrame
The right DataFrame (may have zero or more named index levels)
on : list of str
The on parameter to the merge operation
left_on : list of str
The left_on parameter to the merge operation
right_on : list of str
The right_on parameter to the merge operation
how : str
The how parameter to the merge operation
Returns
-------
DataFrame
The expected merge result
"""
# Handle on param if specified
if on is not None:
left_on, right_on = on, on
# Compute input named index levels
left_levels = [n for n in df_left.index.names if n is not None]
right_levels = [n for n in df_right.index.names if n is not None]
# Compute output named index levels
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
# Drop index levels that aren't involved in the merge
drop_left = [n for n in left_levels if n not in left_on]
if drop_left:
df_left = df_left.reset_index(drop_left, drop=True)
drop_right = [n for n in right_levels if n not in right_on]
if drop_right:
df_right = df_right.reset_index(drop_right, drop=True)
# Convert remaining index levels to columns
reset_left = [n for n in left_levels if n in left_on]
if reset_left:
df_left = df_left.reset_index(level=reset_left)
reset_right = [n for n in right_levels if n in right_on]
if reset_right:
df_right = df_right.reset_index(level=reset_right)
# Perform merge
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
# Restore index levels
if output_levels:
expected = expected.set_index(output_levels)
return expected
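# Hypothetical usage sketch for the helper above, reusing this module's
# DataFrame import: with plain RangeIndex inputs (no named levels),
# compute_expected reduces to an ordinary column merge.
if __name__ == "__main__":
    _left = DataFrame({"outer": [1, 2], "v1": [0.0, 1.0]})
    _right = DataFrame({"outer": [1, 1], "v2": [10.0, 11.0]})
    print(compute_expected(_left, _right, on=["outer"], how="inner"))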
@pytest.mark.parametrize(
"on,how",
[
(["outer"], "inner"),
(["inner"], "left"),
(["outer", "inner"], "right"),
(["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
# Construct expected result
expected = compute_expected(left_df, right_df, on=on, how=how)
# Perform merge
result = left_df.merge(right_df, on=on, how=how)
tm.assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize(
"left_on,right_on,how",
[
(["outer"], ["outer"], "inner"),
(["inner"], ["inner"], "right"),
(["outer", "inner"], ["outer", "inner"], "left"),
(["inner", "outer"], ["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_lefton_righton(
left_df, right_df, left_on, right_on, how
):
# Construct expected result
expected = compute_expected(
left_df, right_df, left_on=left_on, right_on=right_on, how=how
)
# Perform merge
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
tm.assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
# Construct left_df
left_df = df1.set_index(left_index)
# Construct right_df
right_df = df2.set_index(["outer", "inner"])
# Result
expected = (
left_df.reset_index()
.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
.set_index(left_index)
)
# Perform join
result = left_df.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
tm.assert_frame_equal(result, expected, check_like=True)
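# Sketch of the feature under test, assuming standard pandas key resolution:
# a merge key may name an index level on one side and a column on the other.
# Mirroring compute_expected above, a key that is a level on only one side
# comes back as a regular column in the result.
if __name__ == "__main__":
    import pandas as pd

    lhs = pd.DataFrame({"outer": [1, 1, 2], "v1": [10, 11, 12]}).set_index("outer")
    rhs = pd.DataFrame({"outer": [1, 2, 2], "v2": [20, 21, 22]})
    print(pd.merge(lhs, rhs, on="outer", how="inner"))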

View File

@ -0,0 +1,244 @@
import re
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
merge_ordered,
)
import pandas._testing as tm
@pytest.fixture
def left():
return DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
@pytest.fixture
def right():
return DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
class TestMergeOrdered:
def test_basic(self, left, right):
result = merge_ordered(left, right, on="key")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
"rvalue": [np.nan, 1, 2, 3, np.nan, 4],
}
)
tm.assert_frame_equal(result, expected)
def test_ffill(self, left, right):
result = merge_ordered(left, right, on="key", fill_method="ffill")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
"rvalue": [np.nan, 1, 2, 3, 3, 4],
}
)
tm.assert_frame_equal(result, expected)
def test_multigroup(self, left, right):
left = pd.concat([left, left], ignore_index=True)
left["group"] = ["a"] * 3 + ["b"] * 3
result = merge_ordered(
left, right, on="key", left_by="group", fill_method="ffill"
)
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"] * 2,
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
"rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
}
)
expected["group"] = ["a"] * 6 + ["b"] * 6
tm.assert_frame_equal(result, expected.loc[:, result.columns])
result2 = merge_ordered(
right, left, on="key", right_by="group", fill_method="ffill"
)
tm.assert_frame_equal(result, result2.loc[:, result.columns])
result = merge_ordered(left, right, on="key", left_by="group")
assert result["group"].notna().all()
@pytest.mark.filterwarnings(
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
def test_merge_type(self, left, right):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(left)
result = nad.merge(right, on="key")
assert isinstance(result, NotADataFrame)
@pytest.mark.parametrize(
"df_seq, pattern",
[
((), "[Nn]o objects"),
([], "[Nn]o objects"),
({}, "[Nn]o objects"),
([None], "objects.*None"),
([None, None], "objects.*None"),
],
)
def test_empty_sequence_concat(self, df_seq, pattern):
# GH 9157
with pytest.raises(ValueError, match=pattern):
pd.concat(df_seq)
@pytest.mark.parametrize(
"arg", [[DataFrame()], [None, DataFrame()], [DataFrame(), None]]
)
def test_empty_sequence_concat_ok(self, arg):
pd.concat(arg)
def test_doc_example(self):
left = DataFrame(
{
"group": list("aaabbb"),
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3] * 2,
}
)
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
expected = DataFrame(
{
"group": list("aaaaabbbbb"),
"key": ["a", "b", "c", "d", "e"] * 2,
"lvalue": [1, 1, 2, 2, 3] * 2,
"rvalue": [np.nan, 1, 2, 3, 3] * 2,
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"left, right, on, left_by, right_by, expected",
[
(
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
DataFrame({"T": [2], "E": [1]}),
["T"],
["G", "H"],
None,
DataFrame(
{
"G": ["g"] * 3,
"H": ["h"] * 3,
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
}
),
),
(
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
DataFrame({"T": [2], "E": [1]}),
"T",
["G", "H"],
None,
DataFrame(
{
"G": ["g"] * 3,
"H": ["h"] * 3,
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
}
),
),
(
DataFrame({"T": [2], "E": [1]}),
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
["T"],
None,
["G", "H"],
DataFrame(
{
"T": [1, 2, 3],
"E": [np.nan, 1.0, np.nan],
"G": ["g"] * 3,
"H": ["h"] * 3,
}
),
),
],
)
def test_list_type_by(self, left, right, on, left_by, right_by, expected):
# GH 35269
result = merge_ordered(
left=left,
right=right,
on=on,
left_by=left_by,
right_by=right_by,
)
tm.assert_frame_equal(result, expected)
def test_left_by_length_equals_to_right_shape0(self):
# GH 38166
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
right = DataFrame([[2, 1]], columns=list("ET"))
result = merge_ordered(left, right, on="E", left_by=["G", "H"])
expected = DataFrame(
{"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
)
tm.assert_frame_equal(result, expected)
def test_elements_not_in_by_but_in_df(self):
# GH 38167
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
right = DataFrame([[2, 1]], columns=list("ET"))
msg = r"\{'h'\} not found in left columns"
with pytest.raises(KeyError, match=msg):
merge_ordered(left, right, on="E", left_by=["G", "h"])
@pytest.mark.parametrize("invalid_method", ["linear", "carrot"])
def test_ffill_validate_fill_method(self, left, right, invalid_method):
# GH 55884
with pytest.raises(
ValueError, match=re.escape("fill_method must be 'ffill' or None")
):
merge_ordered(left, right, on="key", fill_method=invalid_method)
def test_ffill_left_merge(self):
# GH 57010
df1 = DataFrame(
{
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3, 1, 2, 3],
"group": ["a", "a", "a", "b", "b", "b"],
}
)
df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(
df1, df2, fill_method="ffill", left_by="group", how="left"
)
expected = DataFrame(
{
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3, 1, 2, 3],
"group": ["a", "a", "a", "b", "b", "b"],
"rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)
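# Usage sketch for merge_ordered, assuming current pandas: it performs an
# ordered outer merge on the key, and fill_method="ffill" forward-fills the
# holes that the alignment introduces (compare test_ffill above).
if __name__ == "__main__":
    import pandas as pd

    left = pd.DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2, 3]})
    right = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    print(pd.merge_ordered(left, right, on="key", fill_method="ffill"))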

View File

@ -0,0 +1,934 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
Timestamp,
option_context,
)
import pandas._testing as tm
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
@pytest.fixture
def left():
"""left dataframe (not multi-indexed) for multi-index join tests"""
    # a small but representative example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
data = np.random.default_rng(2).standard_normal(len(key1))
return DataFrame({"key1": key1, "key2": key2, "data": data})
@pytest.fixture
def right(multiindex_dataframe_random_data):
"""right dataframe (multi-indexed) for multi-index join tests"""
df = multiindex_dataframe_random_data
df.index.names = ["key1", "key2"]
df.columns = ["j_one", "j_two", "j_three"]
return df
@pytest.fixture
def left_multi():
return DataFrame(
{
"Origin": ["A", "A", "B", "B", "C"],
"Destination": ["A", "B", "A", "C", "A"],
"Period": ["AM", "AM", "IP", "AM", "OP"],
"TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
"Trips": [1987, 3647, 2470, 4296, 4444],
},
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
).set_index(["Origin", "Destination", "Period", "TripPurp"])
@pytest.fixture
def right_multi():
return DataFrame(
{
"Origin": ["A", "A", "B", "B", "C", "C", "E"],
"Destination": ["A", "B", "A", "B", "A", "B", "F"],
"Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
"LinkType": ["a", "b", "c", "b", "a", "b", "a"],
"Distance": [100, 80, 90, 80, 75, 35, 55],
},
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
).set_index(["Origin", "Destination", "Period", "LinkType"])
@pytest.fixture
def on_cols_multi():
return ["Origin", "Destination", "Period"]
class TestMergeMulti:
def test_merge_on_multikey(self, left, right, join_type):
on_cols = ["key1", "key2"]
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
drop=True
)
expected = merge(
left, right.reset_index(), on=on_cols, how=join_type, sort=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("sort", [True, False])
def test_left_join_multi_index(self, sort, infer_string):
with option_context("future.infer_string", infer_string):
icols = ["1st", "2nd", "3rd"]
def bind_cols(df):
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord("a")
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
def run_asserts(left, right, sort):
res = left.join(right, on=icols, how="left", sort=sort)
assert len(left) < len(res) + 1
assert not res["4th"].isna().any()
assert not res["5th"].isna().any()
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res["4th"], result, check_names=False)
assert result.name is None
if sort:
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
res.index = RangeIndex(len(res))
tm.assert_frame_equal(out, res)
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
left = DataFrame(
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
)
# Explicit cast to float to avoid implicit cast when setting nan
left.insert(
1,
"2nd",
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i].copy()
left["4th"] = bind_cols(left)
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
# inject some nulls
left.loc[1::4, "1st"] = np.nan
left.loc[2::5, "2nd"] = np.nan
left.loc[3::6, "3rd"] = np.nan
left["4th"] = bind_cols(left)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i, :-1]
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
@pytest.mark.parametrize("sort", [False, True])
def test_merge_right_vs_left(self, left, right, sort):
# compare left vs right merge with multikey
on_cols = ["key1", "key2"]
merged_left_right = left.merge(
right, left_on=on_cols, right_index=True, how="left", sort=sort
)
merge_right_left = right.merge(
left, right_on=on_cols, left_index=True, how="right", sort=sort
)
# Reorder columns
merge_right_left = merge_right_left[merged_left_right.columns]
tm.assert_frame_equal(merged_left_right, merge_right_left)
def test_merge_multiple_cols_with_mixed_cols_index(self):
# GH29522
s = Series(
range(6),
MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
name="Amount",
)
df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
result = merge(df, s.reset_index(), on=["lev1", "lev2"])
expected = DataFrame(
{
"lev1": list("AAABBB"),
"lev2": [1, 2, 3, 1, 2, 3],
"col": [0] * 6,
"Amount": range(6),
}
)
tm.assert_frame_equal(result, expected)
def test_compress_group_combinations(self):
# ~ 40000000 possible unique groups
key1 = [str(i) for i in range(10000)]
key1 = np.tile(key1, 2)
key2 = key1[::-1]
df = DataFrame(
{
"key1": key1,
"key2": key2,
"value1": np.random.default_rng(2).standard_normal(20000),
}
)
df2 = DataFrame(
{
"key1": key1[::2],
"key2": key2[::2],
"value2": np.random.default_rng(2).standard_normal(10000),
}
)
# just to hit the label compression code path
merge(df, df2, how="outer")
def test_left_join_index_preserve_order(self):
on_cols = ["k1", "k2"]
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"v": np.array(np.arange(24), dtype=np.int64),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result.sort_values(on_cols, kind="mergesort", inplace=True)
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
# test join with multi dtypes blocks
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
"v": np.array(np.arange(24), dtype=np.int32),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result = result.sort_values(on_cols, kind="mergesort")
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match_multiindex(self):
left = DataFrame(
[
["X", "Y", "C", "a"],
["W", "Y", "C", "e"],
["V", "Q", "A", "h"],
["V", "R", "D", "i"],
["X", "Y", "D", "b"],
["X", "Y", "A", "c"],
["W", "Q", "B", "f"],
["W", "R", "C", "g"],
["V", "Y", "C", "j"],
["X", "Y", "B", "d"],
],
columns=["cola", "colb", "colc", "tag"],
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
)
right = DataFrame(
[
["W", "R", "C", 0],
["W", "Q", "B", 3],
["W", "Q", "B", 8],
["X", "Y", "A", 1],
["X", "Y", "A", 4],
["X", "Y", "B", 5],
["X", "Y", "C", 6],
["X", "Y", "C", 9],
["X", "Q", "C", -6],
["X", "R", "C", -9],
["V", "Y", "C", 7],
["V", "R", "D", 2],
["V", "R", "D", -1],
["V", "Q", "A", -3],
],
columns=["col1", "col2", "col3", "val"],
).set_index(["col1", "col2", "col3"])
result = left.join(right, on=["cola", "colb", "colc"], how="left")
expected = DataFrame(
[
["X", "Y", "C", "a", 6],
["X", "Y", "C", "a", 9],
["W", "Y", "C", "e", np.nan],
["V", "Q", "A", "h", -3],
["V", "R", "D", "i", 2],
["V", "R", "D", "i", -1],
["X", "Y", "D", "b", np.nan],
["X", "Y", "A", "c", 1],
["X", "Y", "A", "c", 4],
["W", "Q", "B", "f", 3],
["W", "Q", "B", "f", 8],
["W", "R", "C", "g", 0],
["V", "Y", "C", "j", 7],
["X", "Y", "B", "d", 5],
],
columns=["cola", "colb", "colc", "tag", "val"],
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match(self):
left = DataFrame(
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
columns=["tag", "val"],
index=[2, 0, 1, 3],
)
right = DataFrame(
[
["a", "v"],
["c", "w"],
["c", "x"],
["d", "y"],
["a", "z"],
["c", "r"],
["e", "q"],
["c", "s"],
],
columns=["tag", "char"],
).set_index("tag")
result = left.join(right, on="tag", how="left")
expected = DataFrame(
[
["c", 0, "w"],
["c", 0, "x"],
["c", 0, "r"],
["c", 0, "s"],
["b", 1, np.nan],
["a", 2, "v"],
["a", 2, "z"],
["b", 3, np.nan],
],
columns=["tag", "val", "char"],
index=[2, 2, 2, 2, 0, 1, 1, 3],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on="tag", how="left", sort=True)
expected2 = expected.sort_values("tag", kind="mergesort")
tm.assert_frame_equal(result, expected2)
# GH7331 - maintain left frame order in left merge
result = merge(left, right.reset_index(), how="left", on="tag")
expected.index = RangeIndex(len(expected))
tm.assert_frame_equal(result, expected)
def test_left_merge_na_buglet(self):
left = DataFrame(
{
"id": list("abcde"),
"v1": np.random.default_rng(2).standard_normal(5),
"v2": np.random.default_rng(2).standard_normal(5),
"dummy": list("abcde"),
"v3": np.random.default_rng(2).standard_normal(5),
},
columns=["id", "v1", "v2", "dummy", "v3"],
)
right = DataFrame(
{
"id": ["a", "b", np.nan, np.nan, np.nan],
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
}
)
result = merge(left, right, on="id", how="left")
rdf = right.drop(["id"], axis=1)
expected = left.join(rdf)
tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
data = [
[1950, "A", 1.5],
[1950, "B", 1.5],
[1955, "B", 1.5],
[1960, "B", np.nan],
[1970, "B", 4.0],
[1950, "C", 4.0],
[1960, "C", np.nan],
[1965, "C", 3.0],
[1970, "C", 4.0],
]
frame = DataFrame(data, columns=["year", "panel", "data"])
other_data = [
[1960, "A", np.nan],
[1970, "A", np.nan],
[1955, "A", np.nan],
[1965, "A", np.nan],
[1965, "B", np.nan],
[1955, "C", np.nan],
]
other = DataFrame(other_data, columns=["year", "panel", "data"])
result = frame.merge(other, how="outer")
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
expected = expected.replace(-999, np.nan)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if klass is not None:
on_vector = klass(on_vector)
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("merge_type", ["left", "right"])
def test_merge_datetime_multi_index_empty_df(self, merge_type):
# see gh-36895
left = DataFrame(
data={
"data": [1.5, 1.5],
},
index=MultiIndex.from_tuples(
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
names=["date", "panel"],
),
)
right = DataFrame(
index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
)
expected_index = MultiIndex.from_tuples(
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
names=["date", "panel"],
)
if merge_type == "left":
expected = DataFrame(
data={
"data": [1.5, 1.5],
"state": np.array([np.nan, np.nan], dtype=object),
},
index=expected_index,
)
results_merge = left.merge(right, how="left", on=["date", "panel"])
results_join = left.join(right, how="left")
else:
expected = DataFrame(
data={
"state": np.array([np.nan, np.nan], dtype=object),
"data": [1.5, 1.5],
},
index=expected_index,
)
results_merge = right.merge(left, how="right", on=["date", "panel"])
results_join = right.join(left, how="right")
tm.assert_frame_equal(results_merge, expected)
tm.assert_frame_equal(results_join, expected)
@pytest.fixture
def household(self):
household = DataFrame(
{
"household_id": [1, 2, 3],
"male": [0, 1, 0],
"wealth": [196087.3, 316478.7, 294750],
},
columns=["household_id", "male", "wealth"],
).set_index("household_id")
return household
@pytest.fixture
def portfolio(self):
portfolio = DataFrame(
{
"household_id": [1, 2, 2, 3, 3, 3, 4],
"asset_id": [
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
"name": [
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
np.nan,
],
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
},
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
return portfolio
@pytest.fixture
def expected(self):
expected = (
DataFrame(
{
"male": [0, 1, 1, 0, 0, 0],
"wealth": [
196087.3,
316478.7,
316478.7,
294750.0,
294750.0,
294750.0,
],
"name": [
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
],
"share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
"household_id": [1, 2, 2, 3, 3, 3],
"asset_id": [
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
],
}
)
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
)
return expected
def test_join_multi_levels(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
# GH 3662
# merge multi-levels
result = household.join(portfolio, how="inner")
tm.assert_frame_equal(result, expected)
def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
# equivalency
result = merge(
household.reset_index(),
portfolio.reset_index(),
on=["household_id"],
how="inner",
).set_index(["household_id", "asset_id"])
tm.assert_frame_equal(result, expected)
def test_join_multi_levels_outer(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()
result = household.join(portfolio, how="outer")
expected = concat(
[
expected,
(
DataFrame(
{"share": [1.00]},
index=MultiIndex.from_tuples(
[(4, np.nan)], names=["household_id", "asset_id"]
),
)
),
],
axis=0,
sort=True,
).reindex(columns=expected.columns)
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_join_multi_levels_invalid(self, portfolio, household):
portfolio = portfolio.copy()
household = household.copy()
# invalid cases
household.index.name = "foo"
with pytest.raises(
ValueError, match="cannot join with no overlapping index names"
):
household.join(portfolio, how="inner")
portfolio2 = portfolio.copy()
        portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])
with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
portfolio2.join(portfolio, how="inner")
def test_join_multi_levels2(self):
# some more advanced merges
# GH6360
household = DataFrame(
{
"household_id": [1, 2, 2, 3, 3, 3, 4],
"asset_id": [
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
},
columns=["household_id", "asset_id", "share"],
).set_index(["household_id", "asset_id"])
log_return = DataFrame(
{
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
"t": [233, 234, 235, 180, 181],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
}
).set_index(["asset_id", "t"])
expected = (
DataFrame(
{
"household_id": [2, 2, 2, 3, 3, 3, 3, 3],
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
"t": [233, 234, 235, 233, 234, 235, 180, 181],
"share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
}
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
# this is the equivalency
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="inner",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
expected = (
DataFrame(
{
"household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4],
"asset_id": [
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
"nl0000289965",
"nl0000301109",
"nl0000301109",
None,
],
"t": [
233,
234,
235,
233,
234,
235,
180,
181,
None,
None,
None,
None,
],
"share": [
0.6,
0.6,
0.6,
0.15,
0.15,
0.15,
0.6,
0.6,
0.25,
1.0,
0.4,
1.0,
],
"log_return": [
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
None,
None,
None,
None,
],
}
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="outer",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
class TestJoinMultiMulti:
def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
left_names = left_multi.index.names
right_names = right_multi.index.names
if join_type == "right":
level_order = right_names + left_names.difference(right_names)
else:
level_order = left_names + right_names.difference(left_names)
# Multi-index join tests
expected = (
merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(level_order)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
def test_join_multi_empty_frames(
self, left_multi, right_multi, join_type, on_cols_multi
):
left_multi = left_multi.drop(columns=left_multi.columns)
right_multi = right_multi.drop(columns=right_multi.columns)
left_names = left_multi.index.names
right_names = right_multi.index.names
if join_type == "right":
level_order = right_names + left_names.difference(right_names)
else:
level_order = left_names + right_names.difference(left_names)
expected = (
merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(level_order)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if box is not None:
on_vector = box(on_vector)
exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_single_common_level(self):
index_left = MultiIndex.from_tuples(
[("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
)
left = DataFrame(
{"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
)
index_right = MultiIndex.from_tuples(
[("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
)
right = DataFrame(
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
index=index_right,
)
result = left.join(right)
expected = merge(
left.reset_index(), right.reset_index(), on=["key"], how="inner"
).set_index(["key", "X", "Y"])
tm.assert_frame_equal(result, expected)
def test_join_multi_wrong_order(self):
# GH 25760
# GH 28956
midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
result = left.join(right)
expected = DataFrame(
index=midx1,
data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
)
tm.assert_frame_equal(result, expected)
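# Sketch of the level-order behavior covered by test_join_multi_wrong_order,
# assuming standard pandas semantics: join aligns MultiIndex levels by name,
# so a right index with levels ("b", "a") still lines up against a left index
# with levels ("a", "b").
if __name__ == "__main__":
    import pandas as pd

    midx_left = pd.MultiIndex.from_tuples([(1, 3), (1, 4)], names=["a", "b"])
    midx_right = pd.MultiIndex.from_tuples([(3, 1), (4, 1)], names=["b", "a"])
    lhs = pd.DataFrame({"x": [10, 20]}, index=midx_left)
    rhs = pd.DataFrame({"y": ["foo", "bar"]}, index=midx_right)
    print(lhs.join(rhs))  # y aligns to ['foo', 'bar'] despite the level order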

View File

@ -0,0 +1,886 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
CategoricalDtype,
CategoricalIndex,
DataFrame,
Index,
MultiIndex,
Series,
crosstab,
)
import pandas._testing as tm
@pytest.fixture
def df():
df = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
return pd.concat([df, df], ignore_index=True)
class TestCrosstab:
def test_crosstab_single(self, df):
result = crosstab(df["A"], df["C"])
expected = df.groupby(["A", "C"]).size().unstack()
tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
def test_crosstab_multiple(self, df):
result = crosstab(df["A"], [df["B"], df["C"]])
expected = df.groupby(["A", "B", "C"]).size()
expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
tm.assert_frame_equal(result, expected)
result = crosstab([df["B"], df["C"]], df["A"])
expected = df.groupby(["B", "C", "A"]).size()
expected = expected.unstack("A").fillna(0).astype(np.int64)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [np.array, list, tuple])
def test_crosstab_ndarray(self, box):
# GH 44076
a = box(np.random.default_rng(2).integers(0, 5, size=100))
b = box(np.random.default_rng(2).integers(0, 3, size=100))
c = box(np.random.default_rng(2).integers(0, 10, size=100))
df = DataFrame({"a": a, "b": b, "c": c})
result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
expected = crosstab(df["a"], [df["b"], df["c"]])
tm.assert_frame_equal(result, expected)
result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
expected = crosstab([df["b"], df["c"]], df["a"])
tm.assert_frame_equal(result, expected)
# assign arbitrary names
result = crosstab(a, c)
expected = crosstab(df["a"], df["c"])
expected.index.names = ["row_0"]
expected.columns.names = ["col_0"]
tm.assert_frame_equal(result, expected)
def test_crosstab_non_aligned(self):
# GH 17005
a = Series([0, 1, 1], index=["a", "b", "c"])
b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
c = np.array([3, 4, 3], dtype=np.int64)
expected = DataFrame(
[[1, 0], [1, 1]],
index=Index([0, 1], name="row_0"),
columns=Index([3, 4], name="col_0"),
)
result = crosstab(a, b)
tm.assert_frame_equal(result, expected)
result = crosstab(a, c)
tm.assert_frame_equal(result, expected)
def test_crosstab_margins(self):
a = np.random.default_rng(2).integers(0, 7, size=100)
b = np.random.default_rng(2).integers(0, 3, size=100)
c = np.random.default_rng(2).integers(0, 5, size=100)
df = DataFrame({"a": a, "b": b, "c": c})
result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
assert result.index.names == ("a",)
assert result.columns.names == ["b", "c"]
all_cols = result["All", ""]
exp_cols = df.groupby(["a"]).size().astype("i8")
# to keep index.name
exp_margin = Series([len(df)], index=Index(["All"], name="a"))
exp_cols = pd.concat([exp_cols, exp_margin])
exp_cols.name = ("All", "")
tm.assert_series_equal(all_cols, exp_cols)
all_rows = result.loc["All"]
exp_rows = df.groupby(["b", "c"]).size().astype("i8")
exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
exp_rows.name = "All"
exp_rows = exp_rows.reindex(all_rows.index)
exp_rows = exp_rows.fillna(0).astype(np.int64)
tm.assert_series_equal(all_rows, exp_rows)
def test_crosstab_margins_set_margin_name(self):
# GH 15972
a = np.random.default_rng(2).integers(0, 7, size=100)
b = np.random.default_rng(2).integers(0, 3, size=100)
c = np.random.default_rng(2).integers(0, 5, size=100)
df = DataFrame({"a": a, "b": b, "c": c})
result = crosstab(
a,
[b, c],
rownames=["a"],
colnames=("b", "c"),
margins=True,
margins_name="TOTAL",
)
assert result.index.names == ("a",)
assert result.columns.names == ["b", "c"]
all_cols = result["TOTAL", ""]
exp_cols = df.groupby(["a"]).size().astype("i8")
# to keep index.name
exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
exp_cols = pd.concat([exp_cols, exp_margin])
exp_cols.name = ("TOTAL", "")
tm.assert_series_equal(all_cols, exp_cols)
all_rows = result.loc["TOTAL"]
exp_rows = df.groupby(["b", "c"]).size().astype("i8")
exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
exp_rows.name = "TOTAL"
exp_rows = exp_rows.reindex(all_rows.index)
exp_rows = exp_rows.fillna(0).astype(np.int64)
tm.assert_series_equal(all_rows, exp_rows)
msg = "margins_name argument must be a string"
for margins_name in [666, None, ["a", "b"]]:
with pytest.raises(ValueError, match=msg):
crosstab(
a,
[b, c],
rownames=["a"],
colnames=("b", "c"),
margins=True,
margins_name=margins_name,
)
def test_crosstab_pass_values(self):
a = np.random.default_rng(2).integers(0, 7, size=100)
b = np.random.default_rng(2).integers(0, 3, size=100)
c = np.random.default_rng(2).integers(0, 5, size=100)
values = np.random.default_rng(2).standard_normal(100)
table = crosstab(
[a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"]
)
df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
expected = df.pivot_table(
"values", index=["foo", "bar"], columns="baz", aggfunc="sum"
)
tm.assert_frame_equal(table, expected)
def test_crosstab_dropna(self):
# GH 3820
a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
c = np.array(
["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
)
res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
m = MultiIndex.from_tuples(
[("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
names=["b", "c"],
)
tm.assert_index_equal(res.columns, m)
def test_crosstab_no_overlap(self):
        # GH 10291
s1 = Series([1, 2, 3], index=[1, 2, 3])
s2 = Series([4, 5, 6], index=[4, 5, 6])
actual = crosstab(s1, s2)
expected = DataFrame(
index=Index([], dtype="int64", name="row_0"),
columns=Index([], dtype="int64", name="col_0"),
)
tm.assert_frame_equal(actual, expected)
def test_margin_dropna(self):
# GH 12577
        # pivot_table counts null into margin ('All')
        # when margins=True and dropna=True
df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
actual = crosstab(df.a, df.b, margins=True, dropna=True)
expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
expected.index = Index([1.0, 2.0, "All"], name="a")
expected.columns = Index([3, 4, "All"], name="b")
tm.assert_frame_equal(actual, expected)
def test_margin_dropna2(self):
df = DataFrame(
{"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
)
actual = crosstab(df.a, df.b, margins=True, dropna=True)
expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
expected.index = Index([1.0, 2.0, "All"], name="a")
expected.columns = Index([3.0, 4.0, "All"], name="b")
tm.assert_frame_equal(actual, expected)
def test_margin_dropna3(self):
df = DataFrame(
{"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
)
actual = crosstab(df.a, df.b, margins=True, dropna=True)
expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
expected.index = Index([1.0, 2.0, "All"], name="a")
expected.columns = Index([3, 4, "All"], name="b")
tm.assert_frame_equal(actual, expected)
def test_margin_dropna4(self):
# GH 12642
# _add_margins raises KeyError: Level None not found
# when margins=True and dropna=False
# GH: 10772: Keep np.nan in result with dropna=False
df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
actual = crosstab(df.a, df.b, margins=True, dropna=False)
expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]])
expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
expected.columns = Index([3, 4, "All"], name="b")
tm.assert_frame_equal(actual, expected)
def test_margin_dropna5(self):
# GH: 10772: Keep np.nan in result with dropna=False
df = DataFrame(
{"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
)
actual = crosstab(df.a, df.b, margins=True, dropna=False)
expected = DataFrame(
[[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]]
)
expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b")
tm.assert_frame_equal(actual, expected)
def test_margin_dropna6(self):
# GH: 10772: Keep np.nan in result with dropna=False
a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
c = np.array(
["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
)
actual = crosstab(
a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
)
m = MultiIndex.from_arrays(
[
["one", "one", "two", "two", np.nan, np.nan, "All"],
["dull", "shiny", "dull", "shiny", "dull", "shiny", ""],
],
names=["b", "c"],
)
expected = DataFrame(
[[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]],
columns=m,
)
expected.index = Index(["bar", "foo", "All"], name="a")
tm.assert_frame_equal(actual, expected)
actual = crosstab(
[a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
)
m = MultiIndex.from_arrays(
[
["bar", "bar", "bar", "foo", "foo", "foo", "All"],
["one", "two", np.nan, "one", "two", np.nan, ""],
],
names=["a", "b"],
)
expected = DataFrame(
[
[1, 0, 1.0],
[1, 0, 1.0],
[0, 0, np.nan],
[2, 0, 2.0],
[1, 1, 2.0],
[0, 1, np.nan],
[5, 2, 7.0],
],
index=m,
)
expected.columns = Index(["dull", "shiny", "All"], name="c")
tm.assert_frame_equal(actual, expected)
actual = crosstab(
[a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
)
m = MultiIndex.from_arrays(
[["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
names=["a", "b"],
)
expected = DataFrame(
[[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
)
expected.columns = Index(["dull", "shiny", "All"], name="c")
tm.assert_frame_equal(actual, expected)
def test_crosstab_normalize(self):
# Issue 12578
df = DataFrame(
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
)
rindex = Index([1, 2], name="a")
cindex = Index([3, 4], name="b")
full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
# Check all normalize args
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize=1),
crosstab(df.a, df.b, normalize="columns"),
)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
)
row_normal_margins = DataFrame(
[[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
index=Index([1, 2, "All"], name="a", dtype="object"),
columns=Index([3, 4], name="b", dtype="object"),
)
col_normal_margins = DataFrame(
[[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
index=Index([1, 2], name="a", dtype="object"),
columns=Index([3, 4, "All"], name="b", dtype="object"),
)
all_normal_margins = DataFrame(
[[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
index=Index([1, 2, "All"], name="a", dtype="object"),
columns=Index([3, 4, "All"], name="b", dtype="object"),
)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
)
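    # Worked example of the normalize arithmetic checked above (an editorial
    # sketch, not part of the original suite): the raw counts here are
    # [[1, 0], [1, 3]] with grand total 5, so normalize="all" yields
    # [[0.2, 0.0], [0.2, 0.6]]; normalize="index" divides each row by its sum
    # ([1, 4]) giving [[1.0, 0.0], [0.25, 0.75]]; normalize="columns" divides
    # each column by its sum ([2, 3]) giving [[0.5, 0.0], [0.5, 1.0]].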
def test_crosstab_normalize_arrays(self):
# GH#12578
df = DataFrame(
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
)
# Test arrays
crosstab(
[np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
)
# Test with aggfunc
norm_counts = DataFrame(
[[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
index=Index([1, 2, "All"], name="a", dtype="object"),
columns=Index([3, 4, "All"], name="b"),
)
test_case = crosstab(
df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
)
tm.assert_frame_equal(test_case, norm_counts)
df = DataFrame(
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
)
norm_sum = DataFrame(
[[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
index=Index([1, 2, "All"], name="a", dtype="object"),
columns=Index([3, 4, "All"], name="b", dtype="object"),
)
msg = "using DataFrameGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
test_case = crosstab(
df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
)
tm.assert_frame_equal(test_case, norm_sum)
def test_crosstab_with_empties(self, using_array_manager):
# Check handling of empties
df = DataFrame(
{
"a": [1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4],
"c": [np.nan, np.nan, np.nan, np.nan, np.nan],
}
)
empty = DataFrame(
[[0.0, 0.0], [0.0, 0.0]],
index=Index([1, 2], name="a", dtype="int64"),
columns=Index([3, 4], name="b"),
)
for i in [True, "index", "columns"]:
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
tm.assert_frame_equal(empty, calculated)
nans = DataFrame(
[[0.0, np.nan], [0.0, 0.0]],
index=Index([1, 2], name="a", dtype="int64"),
columns=Index([3, 4], name="b"),
)
if using_array_manager:
# INFO(ArrayManager) column without NaNs can preserve int dtype
nans[3] = nans[3].astype("int64")
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
tm.assert_frame_equal(nans, calculated)
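    # Valid `normalize` arguments are restricted to True/False, "all",
    # "index", "columns", and the axis numbers 0/1; anything else raises
    # ValueError, as does a non-bool `margins`.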
def test_crosstab_errors(self):
# Issue 12578
df = DataFrame(
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
)
error = "values cannot be used without an aggfunc."
with pytest.raises(ValueError, match=error):
crosstab(df.a, df.b, values=df.c)
error = "aggfunc cannot be used without values"
with pytest.raises(ValueError, match=error):
crosstab(df.a, df.b, aggfunc=np.mean)
error = "Not a valid normalize argument"
with pytest.raises(ValueError, match=error):
crosstab(df.a, df.b, normalize="42")
with pytest.raises(ValueError, match=error):
crosstab(df.a, df.b, normalize=42)
error = "Not a valid margins argument"
with pytest.raises(ValueError, match=error):
crosstab(df.a, df.b, normalize="all", margins=42)
    def test_crosstab_with_categorical_columns(self):
# GH 8860
df = DataFrame(
{
"MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
"MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
}
)
categories = ["Sedan", "Electric", "Pickup"]
df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
result = crosstab(df["MAKE"], df["MODEL"])
expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
expected_columns = CategoricalIndex(
categories, categories=categories, ordered=False, name="MODEL"
)
expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
expected = DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
tm.assert_frame_equal(result, expected)
def test_crosstab_with_numpy_size(self):
# GH 4003
df = DataFrame(
{
"A": ["one", "one", "two", "three"] * 6,
"B": ["A", "B", "C"] * 8,
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
"D": np.random.default_rng(2).standard_normal(24),
"E": np.random.default_rng(2).standard_normal(24),
}
)
result = crosstab(
index=[df["A"], df["B"]],
columns=[df["C"]],
margins=True,
aggfunc=np.size,
values=df["D"],
)
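        # With aggfunc=np.size the cell values are group sizes; the "All"
        # margins count every observation, hence 12/12/24 in the last row.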
expected_index = MultiIndex(
levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
names=["A", "B"],
)
expected_column = Index(["bar", "foo", "All"], name="C")
expected_data = np.array(
[
[2.0, 2.0, 4.0],
[2.0, 2.0, 4.0],
[2.0, 2.0, 4.0],
[2.0, np.nan, 2.0],
[np.nan, 2.0, 2.0],
[2.0, np.nan, 2.0],
[np.nan, 2.0, 2.0],
[2.0, np.nan, 2.0],
[np.nan, 2.0, 2.0],
[12.0, 12.0, 24.0],
]
)
expected = DataFrame(
expected_data, index=expected_index, columns=expected_column
)
# aggfunc is np.size, resulting in integers
expected["All"] = expected["All"].astype("int64")
tm.assert_frame_equal(result, expected)
def test_crosstab_duplicate_names(self):
# GH 13279 / 22529
s1 = Series(range(3), name="foo")
s2_foo = Series(range(1, 4), name="foo")
s2_bar = Series(range(1, 4), name="bar")
s3 = Series(range(3), name="waldo")
# check result computed with duplicate labels against
# result computed with unique labels, then relabelled
mapper = {"bar": "foo"}
# duplicate row, column labels
result = crosstab(s1, s2_foo)
expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
tm.assert_frame_equal(result, expected)
# duplicate row, unique column labels
result = crosstab([s1, s2_foo], s3)
expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
tm.assert_frame_equal(result, expected)
# unique row, duplicate column labels
result = crosstab(s3, [s1, s2_foo])
expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
def test_crosstab_tuple_name(self, names):
s1 = Series(range(3), name=names[0])
s2 = Series(range(1, 4), name=names[1])
mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
expected = Series(1, index=mi).unstack(1, fill_value=0)
result = crosstab(s1, s2)
tm.assert_frame_equal(result, expected)
def test_crosstab_both_tuple_names(self):
# GH 18321
s1 = Series(range(3), name=("a", "b"))
s2 = Series(range(3), name=("c", "d"))
expected = DataFrame(
np.eye(3, dtype="int64"),
index=Index(range(3), name=("a", "b")),
columns=Index(range(3), name=("c", "d")),
)
result = crosstab(s1, s2)
tm.assert_frame_equal(result, expected)
def test_crosstab_unsorted_order(self):
df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
result = crosstab(df.index, [df.b, df.a])
e_idx = Index(["A", "B", "C"], name="row_0")
e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
expected = DataFrame(
[[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
)
tm.assert_frame_equal(result, expected)
def test_crosstab_normalize_multiple_columns(self):
# GH 15150
df = DataFrame(
{
"A": ["one", "one", "two", "three"] * 6,
"B": ["A", "B", "C"] * 8,
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
"D": [0] * 24,
"E": [0] * 24,
}
)
msg = "using DataFrameGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = crosstab(
[df.A, df.B],
df.C,
values=df.D,
aggfunc=np.sum,
normalize=True,
margins=True,
)
expected = DataFrame(
np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
columns=Index(["bar", "foo", "All"], name="C"),
index=MultiIndex.from_tuples(
[
("one", "A"),
("one", "B"),
("one", "C"),
("three", "A"),
("three", "B"),
("three", "C"),
("two", "A"),
("two", "B"),
("two", "C"),
("All", ""),
],
names=["A", "B"],
),
)
tm.assert_frame_equal(result, expected)
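    # When margins=True is combined with normalize, the margin is normalized
    # along with the body: normalize=0 keeps only the margin row, normalize=1
    # keeps only the margin column, and normalize=True keeps both.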
def test_margin_normalize(self):
# GH 27500
df = DataFrame(
{
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
"C": [
"small",
"large",
"large",
"small",
"small",
"large",
"small",
"small",
"large",
],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
}
)
# normalize on index
result = crosstab(
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
)
expected = DataFrame(
[[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
)
expected.index = MultiIndex(
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
names=["A", "B"],
)
expected.columns = Index(["large", "small"], name="C")
tm.assert_frame_equal(result, expected)
# normalize on columns
result = crosstab(
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
)
expected = DataFrame(
[
[0.25, 0.2, 0.222222],
[0.25, 0.2, 0.222222],
[0.5, 0.2, 0.333333],
[0, 0.4, 0.222222],
]
)
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
expected.index = MultiIndex(
levels=[["bar", "foo"], ["one", "two"]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=["A", "B"],
)
tm.assert_frame_equal(result, expected)
# normalize on both index and column
result = crosstab(
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
)
expected = DataFrame(
[
[0.111111, 0.111111, 0.222222],
[0.111111, 0.111111, 0.222222],
[0.222222, 0.111111, 0.333333],
[0.000000, 0.222222, 0.222222],
[0.444444, 0.555555, 1],
]
)
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
expected.index = MultiIndex(
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
names=["A", "B"],
)
tm.assert_frame_equal(result, expected)
def test_margin_normalize_multiple_columns(self):
# GH 35144
# use multiple columns with margins and normalization
df = DataFrame(
{
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
"C": [
"small",
"large",
"large",
"small",
"small",
"large",
"small",
"small",
"large",
],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
}
)
result = crosstab(
index=df.C,
columns=[df.A, df.B],
margins=True,
margins_name="margin",
normalize=True,
)
expected = DataFrame(
[
[0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
[0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
[0.222222, 0.222222, 0.333333, 0.222222, 1.0],
],
index=["large", "small", "margin"],
)
expected.columns = MultiIndex(
levels=[["bar", "foo", "margin"], ["", "one", "two"]],
codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
names=["A", "B"],
)
expected.index.name = "C"
tm.assert_frame_equal(result, expected)
def test_margin_support_Float(self):
# GH 50313
# use Float64 formats and function aggfunc with margins
df = DataFrame(
{"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
dtype="Float64",
)
result = crosstab(
df["A"],
df["B"],
values=df["C"],
aggfunc="sum",
margins=True,
)
expected = DataFrame(
[
[-1.0, pd.NA, 10.0, 9.0],
[10.0, 1.0, pd.NA, 11.0],
[9.0, 1.0, 10.0, 20.0],
],
index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
dtype="Float64",
)
tm.assert_frame_equal(result, expected)
def test_margin_with_ordered_categorical_column(self):
# GH 25278
df = DataFrame(
{
"First": ["B", "B", "C", "A", "B", "C"],
"Second": ["C", "B", "B", "B", "C", "A"],
}
)
df["First"] = df["First"].astype(CategoricalDtype(ordered=True))
customized_categories_order = ["C", "A", "B"]
df["First"] = df["First"].cat.reorder_categories(customized_categories_order)
result = crosstab(df["First"], df["Second"], margins=True)
expected_index = Index(["C", "A", "B", "All"], name="First")
expected_columns = Index(["A", "B", "C", "All"], name="Second")
expected_data = [[1, 1, 0, 2], [0, 1, 0, 1], [0, 1, 2, 3], [1, 3, 2, 6]]
expected = DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("a_dtype", ["category", "int64"])
@pytest.mark.parametrize("b_dtype", ["category", "int64"])
def test_categoricals(a_dtype, b_dtype):
# https://github.com/pandas-dev/pandas/issues/37465
g = np.random.default_rng(2)
a = Series(g.integers(0, 3, size=100)).astype(a_dtype)
b = Series(g.integers(0, 2, size=100)).astype(b_dtype)
result = crosstab(a, b, margins=True, dropna=False)
columns = Index([0, 1, "All"], dtype="object", name="col_0")
index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
values = [[10, 18, 28], [23, 16, 39], [17, 16, 33], [50, 50, 100]]
expected = DataFrame(values, index, columns)
tm.assert_frame_equal(result, expected)
# Verify when categorical does not have all values present
a.loc[a == 1] = 2
a_is_cat = isinstance(a.dtype, CategoricalDtype)
assert not a_is_cat or a.value_counts().loc[1] == 0
result = crosstab(a, b, margins=True, dropna=False)
values = [[10, 18, 28], [0, 0, 0], [40, 32, 72], [50, 50, 100]]
expected = DataFrame(values, index, columns)
if not a_is_cat:
expected = expected.loc[[0, 2, "All"]]
expected["All"] = expected["All"].astype("int64")
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,791 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
Series,
TimedeltaIndex,
Timestamp,
cut,
date_range,
interval_range,
isna,
qcut,
timedelta_range,
to_datetime,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype
import pandas.core.reshape.tile as tmod
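# cut() bins values into discrete intervals of equal width (or user-supplied
# edges), while qcut() bins by sample quantiles. A minimal illustrative
# sketch of the default behaviour exercised throughout this module:
#   cut([1, 7, 5, 4], bins=3)               -> three equal-width categories
#   cut([1, 7, 5, 4], bins=3, labels=False) -> integer bin codes per element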
def test_simple():
data = np.ones(5, dtype="int64")
result = cut(data, 4, labels=False)
expected = np.array([1, 1, 1, 1, 1])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize("func", [list, np.array])
def test_bins(func):
data = func([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
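# right=True (the default) produces right-closed bins (a, b] and nudges the
# leftmost edge down so the minimum is included; right=False produces
# left-closed bins [a, b) and instead nudges the rightmost edge up.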
def test_right():
data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
expected = Categorical(intervals, ordered=True)
expected = expected.take([0, 0, 0, 2, 3, 0, 0])
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
def test_no_right():
data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=False, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
def test_bins_from_interval_index():
c = cut(range(5), 3)
expected = c
result = cut(range(5), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
expected = Categorical.from_codes(
np.append(c.codes, -1), categories=c.categories, ordered=True
)
result = cut(range(6), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
def test_bins_from_interval_index_doc_example():
# Make sure we preserve the bins.
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
c = cut(ages, bins=[0, 18, 35, 70])
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
tm.assert_index_equal(c.categories, expected)
result = cut([25, 20, 50], bins=c.categories)
tm.assert_index_equal(result.categories, expected)
tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))
def test_bins_not_overlapping_from_interval_index():
# see gh-23980
msg = "Overlapping IntervalIndex is not accepted"
ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
with pytest.raises(ValueError, match=msg):
cut([5, 6], bins=ii)
def test_bins_not_monotonic():
msg = "bins must increase monotonically"
data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
with pytest.raises(ValueError, match=msg):
cut(data, [0.1, 1.5, 1, 10])
@pytest.mark.parametrize(
"x, bins, expected",
[
(
date_range("2017-12-31", periods=3),
[Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
IntervalIndex.from_tuples(
[
(Timestamp.min, Timestamp("2018-01-01")),
(Timestamp("2018-01-01"), Timestamp.max),
]
),
),
(
[-1, 0, 1],
np.array(
[np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
),
IntervalIndex.from_tuples(
[(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
),
),
(
[
np.timedelta64(-1, "ns"),
np.timedelta64(0, "ns"),
np.timedelta64(1, "ns"),
],
np.array(
[
np.timedelta64(-np.iinfo(np.int64).max, "ns"),
np.timedelta64(0, "ns"),
np.timedelta64(np.iinfo(np.int64).max, "ns"),
]
),
IntervalIndex.from_tuples(
[
(
np.timedelta64(-np.iinfo(np.int64).max, "ns"),
np.timedelta64(0, "ns"),
),
(
np.timedelta64(0, "ns"),
np.timedelta64(np.iinfo(np.int64).max, "ns"),
),
]
),
),
],
)
def test_bins_monotonic_not_overflowing(x, bins, expected):
# GH 26045
result = cut(x, bins)
tm.assert_index_equal(result.categories, expected)
def test_wrong_num_labels():
msg = "Bin labels must be one fewer than the number of bin edges"
data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
with pytest.raises(ValueError, match=msg):
cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
@pytest.mark.parametrize(
"x,bins,msg",
[
([], 2, "Cannot cut empty array"),
([1, 2, 3], 0.5, "`bins` should be a positive integer"),
],
)
def test_cut_corner(x, bins, msg):
with pytest.raises(ValueError, match=msg):
cut(x, bins)
@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
msg = "Input array must be 1 dimensional"
with pytest.raises(ValueError, match=msg):
cut_func(arg, 2)
@pytest.mark.parametrize(
"data",
[
[0, 1, 2, 3, 4, np.inf],
[-np.inf, 0, 1, 2, 3, 4],
[-np.inf, 0, 1, 2, 3, 4, np.inf],
],
)
def test_int_bins_with_inf(data):
# GH 24314
msg = "cannot specify integer `bins` when input data contains infinity"
with pytest.raises(ValueError, match=msg):
cut(data, bins=3)
def test_cut_out_of_range_more():
# see gh-1511
name = "x"
ser = Series([0, -1, 0, 1, -3], name=name)
ind = cut(ser, [0, 1], labels=False)
exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
tm.assert_series_equal(ind, exp)
@pytest.mark.parametrize(
"right,breaks,closed",
[
(True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
(False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
],
)
def test_labels(right, breaks, closed):
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
result, bins = cut(arr, 4, retbins=True, right=right)
ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
tm.assert_index_equal(result.categories, ex_levels)
def test_cut_pass_series_name_to_factor():
name = "foo"
ser = Series(np.random.default_rng(2).standard_normal(100), name=name)
factor = cut(ser, 4)
assert factor.name == name
def test_label_precision():
arr = np.arange(0, 0.73, 0.01)
result = cut(arr, 4, precision=2)
ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
tm.assert_index_equal(result.categories, ex_levels)
@pytest.mark.parametrize("labels", [None, False])
def test_na_handling(labels):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan
result = cut(arr, 4, labels=labels)
result = np.asarray(result)
expected = np.where(isna(arr), np.nan, result)
tm.assert_almost_equal(result, expected)
def test_inf_handling():
data = np.arange(6)
data_ser = Series(data, dtype="int64")
bins = [-np.inf, 2, 4, np.inf]
result = cut(data, bins)
result_ser = cut(data_ser, bins)
ex_uniques = IntervalIndex.from_breaks(bins)
tm.assert_index_equal(result.categories, ex_uniques)
assert result[5] == Interval(4, np.inf)
assert result[0] == Interval(-np.inf, 2)
assert result_ser[5] == Interval(4, np.inf)
assert result_ser[0] == Interval(-np.inf, 2)
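# Values falling outside the supplied bin edges are not clipped; they come
# back as NaN (no category), which the next test checks via a mask.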
def test_cut_out_of_bounds():
arr = np.random.default_rng(2).standard_normal(100)
result = cut(arr, [-1, 0, 1])
mask = isna(result)
ex_mask = (arr < -1) | (arr > 1)
tm.assert_numpy_array_equal(mask, ex_mask)
@pytest.mark.parametrize(
"get_labels,get_expected",
[
(
lambda labels: labels,
lambda labels: Categorical(
["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
categories=labels,
ordered=True,
),
),
(
lambda labels: Categorical.from_codes([0, 1, 2], labels),
lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
),
],
)
def test_cut_pass_labels(get_labels, get_expected):
bins = [0, 25, 50, 100]
arr = [50, 5, 10, 15, 20, 30, 70]
labels = ["Small", "Medium", "Large"]
result = cut(arr, bins, labels=get_labels(labels))
tm.assert_categorical_equal(result, get_expected(labels))
def test_cut_pass_labels_compat():
# see gh-16459
arr = [50, 5, 10, 15, 20, 30, 70]
labels = ["Good", "Medium", "Bad"]
result = cut(arr, 3, labels=labels)
exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
tm.assert_categorical_equal(result, exp)
@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
def test_round_frac_just_works(x):
    # Smoke test: cut should complete without raising.
cut(x, 2)
@pytest.mark.parametrize(
"val,precision,expected",
[
(-117.9998, 3, -118),
(117.9998, 3, 118),
(117.9998, 2, 118),
(0.000123456, 2, 0.00012),
],
)
def test_round_frac(val, precision, expected):
# see gh-1979
result = tmod._round_frac(val, precision=precision)
assert result == expected
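# _round_frac is a private helper (imported via tmod above): values with a
# nonzero whole part are rounded to `precision` decimal places, while purely
# fractional values are rounded relative to their first significant digit,
# keeping tiny bin edges distinguishable in interval labels.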
def test_cut_return_intervals():
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
result = cut(ser, 3)
exp_bins = np.linspace(0, 8, num=4).round(3)
exp_bins[0] -= 0.008
expected = Series(
IntervalIndex.from_breaks(exp_bins, closed="right").take(
[0, 0, 0, 1, 1, 1, 2, 2, 2]
)
).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
def test_series_ret_bins():
# see gh-8589
ser = Series(np.arange(4))
result, bins = cut(ser, 2, retbins=True)
expected = Series(
IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
({"duplicates": "drop"}, None),
({}, "Bin edges must be unique"),
({"duplicates": "raise"}, "Bin edges must be unique"),
({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
],
)
def test_cut_duplicates_bin(kwargs, msg):
# see gh-20947
bins = [0, 2, 4, 6, 10, 10]
values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
if msg is not None:
with pytest.raises(ValueError, match=msg):
cut(values, bins, **kwargs)
else:
result = cut(values, bins, **kwargs)
expected = cut(values, pd.unique(np.asarray(bins)))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
@pytest.mark.parametrize("length", [1, 2])
def test_single_bin(data, length):
# see gh-14652, gh-15428
ser = Series([data] * length)
result = cut(ser, 1, labels=False)
expected = Series([0] * length, dtype=np.intp)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
)
def test_cut_read_only(array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable
array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable
hundred_elements = np.arange(100)
tm.assert_categorical_equal(
cut(hundred_elements, array_1), cut(hundred_elements, array_2)
)
@pytest.mark.parametrize(
"conv",
[
lambda v: Timestamp(v),
lambda v: to_datetime(v),
lambda v: np.datetime64(v),
lambda v: Timestamp(v).to_pydatetime(),
],
)
def test_datetime_bin(conv):
data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
expected = Series(
IntervalIndex(
[
Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
]
)
).astype(CategoricalDtype(ordered=True))
bins = [conv(v) for v in bin_data]
result = Series(cut(data, bins=bins))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("box", [Series, Index, np.array, list])
def test_datetime_cut(unit, box):
# see gh-14714
#
# Testing time data when it comes in various collection types.
data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
data = box(data)
result, _ = cut(data, 3, retbins=True)
if box is list:
# We don't (yet) do inference on these, so get nanos
unit = "ns"
if unit == "s":
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
left = DatetimeIndex(
["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
dtype=f"M8[{unit}]",
)
else:
left = DatetimeIndex(
[
"2012-12-31 23:57:07.200000",
"2013-01-01 16:00:00",
"2013-01-02 08:00:00",
],
dtype=f"M8[{unit}]",
)
right = DatetimeIndex(
["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)
exp_intervals = IntervalIndex.from_arrays(left, right)
expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(Series(result), expected)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut_mismatched_tzawareness(box):
# GH#54964
bins = box(
[
Timestamp("2013-01-01 04:57:07.200000"),
Timestamp("2013-01-01 21:00:00"),
Timestamp("2013-01-02 13:00:00"),
Timestamp("2013-01-03 05:00:00"),
]
)
ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
msg = "Cannot use timezone-naive bins with timezone-aware values"
with pytest.raises(ValueError, match=msg):
cut(ser, bins)
@pytest.mark.parametrize(
"bins",
[
3,
[
Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
],
],
)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
# see gh-19872
tz = "US/Eastern"
ser = Series(date_range("20130101", periods=3, tz=tz))
if not isinstance(bins, int):
bins = box(bins)
result = cut(ser, bins)
expected = Series(
IntervalIndex(
[
Interval(
Timestamp("2012-12-31 23:57:07.200000", tz=tz),
Timestamp("2013-01-01 16:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-01 16:00:00", tz=tz),
Timestamp("2013-01-02 08:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-02 08:00:00", tz=tz),
Timestamp("2013-01-03 00:00:00", tz=tz),
),
]
)
).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
def test_datetime_nan_error():
msg = "bins must be of datetime64 dtype"
with pytest.raises(ValueError, match=msg):
cut(date_range("20130101", periods=3), bins=[0, 2, 4])
def test_datetime_nan_mask():
result = cut(
date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
)
mask = result.categories.isna()
tm.assert_numpy_array_equal(mask, np.array([False]))
mask = result.isna()
tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))
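# Round-trip property: bins returned via retbins=True can be passed straight
# back into cut() and must reproduce the original categorisation, including
# for tz-aware datetime and timedelta data below.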
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz, unit):
# see gh-19891
ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
result, result_bins = cut(ser, 2, retbins=True)
expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)
if unit == "s":
# TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
# the first entry here raises in array_to_datetime. Should truncate
# instead of raising?
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
expected_bins = DatetimeIndex(
["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)
else:
expected_bins = DatetimeIndex(
[
"2017-12-31 23:57:07.200000",
"2018-01-02 00:00:00",
"2018-01-03 00:00:00",
],
dtype=f"M8[{unit}]",
)
expected_bins = expected_bins.tz_localize(tz)
tm.assert_index_equal(result_bins, expected_bins)
def test_timedelta_cut_roundtrip():
# see gh-19891
ser = Series(timedelta_range("1day", periods=3))
result, result_bins = cut(ser, 2, retbins=True)
expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)
expected_bins = TimedeltaIndex(
["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
)
tm.assert_index_equal(result_bins, expected_bins)
@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
"box, compare",
[
(Series, tm.assert_series_equal),
(np.array, tm.assert_categorical_equal),
(list, tm.assert_equal),
],
)
def test_cut_bool_coercion_to_int(bins, box, compare):
# issue 20303
data_expected = box([0, 1, 1, 0, 1] * 10)
data_result = box([False, True, True, False, True] * 10)
expected = cut(data_expected, bins, duplicates="drop")
result = cut(data_result, bins, duplicates="drop")
compare(result, expected)
@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_cut_incorrect_labels(labels):
# GH 13318
values = range(5)
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
cut(values, 4, labels=labels)
@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
@pytest.mark.parametrize("right", [True, False])
@pytest.mark.parametrize("include_lowest", [True, False])
def test_cut_nullable_integer(bins, right, include_lowest):
a = np.random.default_rng(2).integers(0, 10, size=50).astype(float)
a[::2] = np.nan
result = cut(
pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest
)
expected = cut(a, bins, right=right, include_lowest=include_lowest)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize(
"data, bins, labels, expected_codes, expected_labels",
[
([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
],
)
def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
# GH 33141
result = cut(data, bins=bins, labels=labels, ordered=False)
expected = Categorical.from_codes(
expected_codes, categories=expected_labels, ordered=False
)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize(
"data, bins, labels, expected_codes, expected_labels",
[
([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
],
)
def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
# GH 33141
result = cut(data, bins=bins, labels=labels, ordered=False)
expected = Categorical.from_codes(
expected_codes, categories=expected_labels, ordered=False
)
tm.assert_categorical_equal(result, expected)
def test_cut_unordered_with_missing_labels_raises_error():
# GH 33141
msg = "'labels' must be provided if 'ordered = False'"
with pytest.raises(ValueError, match=msg):
cut([0.5, 3], bins=[0, 1, 2], ordered=False)
def test_cut_unordered_with_series_labels():
# https://github.com/pandas-dev/pandas/issues/36603
ser = Series([1, 2, 3, 4, 5])
bins = Series([0, 2, 4, 6])
labels = Series(["a", "b", "c"])
result = cut(ser, bins=bins, labels=labels, ordered=False)
expected = Series(["a", "a", "b", "b", "c"], dtype="category")
tm.assert_series_equal(result, expected)
def test_cut_no_warnings():
df = DataFrame({"value": np.random.default_rng(2).integers(0, 100, 20)})
labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
with tm.assert_produces_warning(False):
df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
def test_cut_with_duplicated_index_lowest_included():
# GH 42185
expected = Series(
[Interval(-0.001, 2, closed="right")] * 3
+ [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")],
index=[0, 1, 2, 3, 0],
dtype="category",
).cat.as_ordered()
ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
result = cut(ser, bins=[0, 2, 4], include_lowest=True)
tm.assert_series_equal(result, expected)
def test_cut_with_nonexact_categorical_indices():
# GH 42424
ser = Series(range(100))
ser1 = cut(ser, 10).value_counts().head(5)
ser2 = cut(ser, 10).value_counts().tail(5)
result = DataFrame({"1": ser1, "2": ser2})
index = pd.CategoricalIndex(
[
Interval(-0.099, 9.9, closed="right"),
Interval(9.9, 19.8, closed="right"),
Interval(19.8, 29.7, closed="right"),
Interval(29.7, 39.6, closed="right"),
Interval(39.6, 49.5, closed="right"),
Interval(49.5, 59.4, closed="right"),
Interval(59.4, 69.3, closed="right"),
Interval(69.3, 79.2, closed="right"),
Interval(79.2, 89.1, closed="right"),
Interval(89.1, 99, closed="right"),
],
ordered=True,
)
expected = DataFrame(
{"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
)
tm.assert_frame_equal(expected, result)
def test_cut_with_timestamp_tuple_labels():
# GH 40661
labels = [(Timestamp(10),), (Timestamp(20),), (Timestamp(30),)]
result = cut([2, 4, 6], bins=[1, 3, 5, 7], labels=labels)
expected = Categorical.from_codes([0, 1, 2], labels, ordered=True)
tm.assert_categorical_equal(result, expected)
def test_cut_bins_datetime_intervalindex():
# https://github.com/pandas-dev/pandas/issues/46218
bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
    # passing a Series instead of a list is important to trigger the bug
result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
expected = Categorical.from_codes([0], bins, ordered=True)
tm.assert_categorical_equal(result.array, expected)
def test_cut_with_nullable_int64():
# GH 30787
series = Series([0, 1, 2, 3, 4, pd.NA, 6, 7], dtype="Int64")
bins = [0, 2, 4, 6, 8]
intervals = IntervalIndex.from_breaks(bins)
expected = Series(
Categorical.from_codes([-1, 0, 0, 1, 1, -1, 2, 3], intervals, ordered=True)
)
result = cut(series, bins=bins)
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,447 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
from_dummies,
get_dummies,
)
import pandas._testing as tm
@pytest.fixture
def dummies_basic():
return DataFrame(
{
"col1_a": [1, 0, 1],
"col1_b": [0, 1, 0],
"col2_a": [0, 1, 0],
"col2_b": [1, 0, 0],
"col2_c": [0, 0, 1],
},
)
@pytest.fixture
def dummies_with_unassigned():
return DataFrame(
{
"col1_a": [1, 0, 0],
"col1_b": [0, 1, 0],
"col2_a": [0, 1, 0],
"col2_b": [0, 0, 0],
"col2_c": [0, 0, 1],
},
)
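# from_dummies() inverts get_dummies(): each group of indicator columns
# (optionally named "prefix<sep>category") collapses back into one column of
# category values. A minimal sketch:
#   from_dummies(DataFrame({"a": [1, 0], "b": [0, 1]}))
#   -> one-column DataFrame (named "") holding ["a", "b"]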
def test_error_wrong_data_type():
dummies = [0, 1, 0]
with pytest.raises(
TypeError,
match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list",
):
from_dummies(dummies)
def test_error_no_prefix_contains_unassigned():
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
with pytest.raises(
ValueError,
match=(
r"Dummy DataFrame contains unassigned value\(s\); "
r"First instance in row: 2"
),
):
from_dummies(dummies)
def test_error_no_prefix_wrong_default_category_type():
dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
with pytest.raises(
TypeError,
match=(
r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
r"Received 'default_category' of type: list"
),
):
from_dummies(dummies, default_category=["c", "d"])
def test_error_no_prefix_multi_assignment():
dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
with pytest.raises(
ValueError,
match=(
r"Dummy DataFrame contains multi-assignment\(s\); "
r"First instance in row: 2"
),
):
from_dummies(dummies)
def test_error_no_prefix_contains_nan():
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
with pytest.raises(
ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'"
):
from_dummies(dummies)
def test_error_contains_non_dummies():
dummies = DataFrame(
{"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
)
with pytest.raises(
TypeError,
match=r"Passed DataFrame contains non-dummy data",
):
from_dummies(dummies)
def test_error_with_prefix_multiple_separators():
dummies = DataFrame(
{
"col1_a": [1, 0, 1],
"col1_b": [0, 1, 0],
"col2-a": [0, 1, 0],
"col2-b": [1, 0, 1],
},
)
with pytest.raises(
ValueError,
match=(r"Separator not specified for column: col2-a"),
):
from_dummies(dummies, sep="_")
def test_error_with_prefix_sep_wrong_type(dummies_basic):
with pytest.raises(
TypeError,
match=(
r"Expected 'sep' to be of type 'str' or 'None'; "
r"Received 'sep' of type: list"
),
):
from_dummies(dummies_basic, sep=["_"])
def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
with pytest.raises(
ValueError,
match=(
r"Dummy DataFrame contains unassigned value\(s\); "
r"First instance in row: 2"
),
):
from_dummies(dummies_with_unassigned, sep="_")
def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
with pytest.raises(
TypeError,
match=(
r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
r"Received 'default_category' of type: list"
),
):
from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
def test_error_with_prefix_default_category_dict_not_complete(
dummies_with_unassigned,
):
with pytest.raises(
ValueError,
match=(
r"Length of 'default_category' \(1\) did not match "
r"the length of the columns being encoded \(2\)"
),
):
from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"})
def test_error_with_prefix_contains_nan(dummies_basic):
# Set float64 dtype to avoid upcast when setting np.nan
dummies_basic["col2_c"] = dummies_basic["col2_c"].astype("float64")
dummies_basic.loc[2, "col2_c"] = np.nan
with pytest.raises(
ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
):
from_dummies(dummies_basic, sep="_")
def test_error_with_prefix_contains_non_dummies(dummies_basic):
# Set object dtype to avoid upcast when setting "str"
dummies_basic["col2_c"] = dummies_basic["col2_c"].astype(object)
dummies_basic.loc[2, "col2_c"] = "str"
with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
from_dummies(dummies_basic, sep="_")
def test_error_with_prefix_double_assignment():
dummies = DataFrame(
{
"col1_a": [1, 0, 1],
"col1_b": [1, 1, 0],
"col2_a": [0, 1, 0],
"col2_b": [1, 0, 0],
"col2_c": [0, 0, 1],
},
)
with pytest.raises(
ValueError,
match=(
r"Dummy DataFrame contains multi-assignment\(s\); "
r"First instance in row: 0"
),
):
from_dummies(dummies, sep="_")
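# The round-trip tests below check the core contract: for clean indicator
# data, from_dummies(get_dummies(x)) recovers x, using sep="_" whenever the
# dummies carry column prefixes.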
def test_roundtrip_series_to_dataframe():
categories = Series(["a", "b", "c", "a"])
dummies = get_dummies(categories)
result = from_dummies(dummies)
expected = DataFrame({"": ["a", "b", "c", "a"]})
tm.assert_frame_equal(result, expected)
def test_roundtrip_single_column_dataframe():
categories = DataFrame({"": ["a", "b", "c", "a"]})
dummies = get_dummies(categories)
result = from_dummies(dummies, sep="_")
expected = categories
tm.assert_frame_equal(result, expected)
def test_roundtrip_with_prefixes():
categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
dummies = get_dummies(categories)
result = from_dummies(dummies, sep="_")
expected = categories
tm.assert_frame_equal(result, expected)
def test_no_prefix_string_cats_basic():
dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
expected = DataFrame({"": ["a", "b", "c", "a"]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_string_cats_basic_bool_values():
dummies = DataFrame(
{
"a": [True, False, False, True],
"b": [False, True, False, False],
"c": [False, False, True, False],
}
)
expected = DataFrame({"": ["a", "b", "c", "a"]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_string_cats_basic_mixed_bool_values():
dummies = DataFrame(
{"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]}
)
expected = DataFrame({"": ["a", "b", "c", "a"]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_int_cats_basic():
dummies = DataFrame(
{1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
)
expected = DataFrame({"": [1, 25, 2, 5]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_float_cats_basic():
dummies = DataFrame(
{1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
)
expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_mixed_cats_basic():
dummies = DataFrame(
{
1.23: [1, 0, 0, 0, 0],
"c": [0, 1, 0, 0, 0],
2: [0, 0, 1, 0, 0],
False: [0, 0, 0, 1, 0],
None: [0, 0, 0, 0, 1],
}
)
expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
expected = DataFrame({"": ["a", "b", "NaN"]})
result = from_dummies(dummies)
tm.assert_frame_equal(result, expected)
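# default_category fills rows in which no indicator is set (all zeros): a
# scalar applies to every output column, while a dict must supply one value
# per encoded prefix.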
@pytest.mark.parametrize(
"default_category, expected",
[
pytest.param(
"c",
DataFrame({"": ["a", "b", "c"]}),
id="default_category is a str",
),
pytest.param(
1,
DataFrame({"": ["a", "b", 1]}),
id="default_category is a int",
),
pytest.param(
1.25,
DataFrame({"": ["a", "b", 1.25]}),
id="default_category is a float",
),
pytest.param(
0,
DataFrame({"": ["a", "b", 0]}),
id="default_category is a 0",
),
pytest.param(
False,
DataFrame({"": ["a", "b", False]}),
id="default_category is a bool",
),
pytest.param(
(1, 2),
DataFrame({"": ["a", "b", (1, 2)]}),
id="default_category is a tuple",
),
],
)
def test_no_prefix_string_cats_default_category(
default_category, expected, using_infer_string
):
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
result = from_dummies(dummies, default_category=default_category)
if using_infer_string:
expected[""] = expected[""].astype("string[pyarrow_numpy]")
tm.assert_frame_equal(result, expected)
def test_with_prefix_basic(dummies_basic):
expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
result = from_dummies(dummies_basic, sep="_")
tm.assert_frame_equal(result, expected)
def test_with_prefix_contains_get_dummies_NaN_column():
dummies = DataFrame(
{
"col1_a": [1, 0, 0],
"col1_b": [0, 1, 0],
"col1_NaN": [0, 0, 1],
"col2_a": [0, 1, 0],
"col2_b": [0, 0, 0],
"col2_c": [0, 0, 1],
"col2_NaN": [1, 0, 0],
},
)
expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
result = from_dummies(dummies, sep="_")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"default_category, expected",
[
pytest.param(
"x",
DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}),
id="default_category is a str",
),
pytest.param(
0,
DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}),
id="default_category is a 0",
),
pytest.param(
False,
DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}),
id="default_category is a False",
),
pytest.param(
{"col2": 1, "col1": 2.5},
DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}),
id="default_category is a dict with int and float values",
),
pytest.param(
{"col2": None, "col1": False},
DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}),
id="default_category is a dict with bool and None values",
),
pytest.param(
{"col2": (1, 2), "col1": [1.25, False]},
DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}),
id="default_category is a dict with list and tuple values",
),
],
)
def test_with_prefix_default_category(
dummies_with_unassigned, default_category, expected
):
result = from_dummies(
dummies_with_unassigned, sep="_", default_category=default_category
)
tm.assert_frame_equal(result, expected)
def test_ea_categories():
# GH 54300
df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
df.columns = df.columns.astype("string[python]")
result = from_dummies(df)
expected = DataFrame({"": Series(list("abca"), dtype="string[python]")})
tm.assert_frame_equal(result, expected)
def test_ea_categories_with_sep():
# GH 54300
df = DataFrame(
{
"col1_a": [1, 0, 1],
"col1_b": [0, 1, 0],
"col2_a": [0, 1, 0],
"col2_b": [1, 0, 0],
"col2_c": [0, 0, 1],
}
)
df.columns = df.columns.astype("string[python]")
result = from_dummies(df, sep="_")
expected = DataFrame(
{
"col1": Series(list("aba"), dtype="string[python]"),
"col2": Series(list("bac"), dtype="string[python]"),
}
)
expected.columns = expected.columns.astype("string[python]")
tm.assert_frame_equal(result, expected)
def test_maintain_original_index():
# GH 54300
df = DataFrame(
{"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd")
)
result = from_dummies(df)
expected = DataFrame({"": list("abca")}, index=list("abcd"))
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,743 @@
import re
import unicodedata
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
from pandas import (
ArrowDtype,
Categorical,
CategoricalDtype,
CategoricalIndex,
DataFrame,
Index,
RangeIndex,
Series,
SparseDtype,
get_dummies,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
try:
import pyarrow as pa
except ImportError:
pa = None
class TestGetDummies:
@pytest.fixture
def df(self):
return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})
@pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
def dtype(self, request):
return np.dtype(request.param)
@pytest.fixture(params=["dense", "sparse"])
def sparse(self, request):
# params are strings to simplify reading test results,
# e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
return request.param == "sparse"
def effective_dtype(self, dtype):
if dtype is None:
return np.uint8
return dtype
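    # Note: np.dtype(None) resolves to float64, so the `dtype` fixture never
    # actually yields None; the None branch here is a defensive fallback to
    # the historical uint8 default of get_dummies.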
def test_get_dummies_raises_on_dtype_object(self, df):
msg = "dtype=object is not a valid dtype for get_dummies"
with pytest.raises(ValueError, match=msg):
get_dummies(df, dtype="object")
def test_get_dummies_basic(self, sparse, dtype):
s_list = list("abc")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
dtype=self.effective_dtype(dtype),
)
if sparse:
if dtype.kind == "b":
expected = expected.apply(SparseArray, fill_value=False)
else:
expected = expected.apply(SparseArray, fill_value=0.0)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
expected.index = list("ABC")
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string):
# GH 10531
s_list = list("abc")
s_series = Series(s_list)
s_df = DataFrame(
{"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
)
expected = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
dtype=self.effective_dtype(dtype),
columns=list("abc"),
)
if sparse:
if is_integer_dtype(dtype):
fill_value = 0
elif dtype == bool:
fill_value = False
else:
fill_value = 0.0
expected = expected.apply(SparseArray, fill_value=fill_value)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
if sparse:
dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
else:
dtype_name = self.effective_dtype(dtype).name
expected = Series({dtype_name: 8}, name="count")
result = result.dtypes.value_counts()
result.index = [str(i) for i in result.index]
tm.assert_series_equal(result, expected)
result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
key = "string" if using_infer_string else "object"
expected_counts = {"int64": 1, key: 1}
expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
expected = Series(expected_counts, name="count").sort_index()
result = result.dtypes.value_counts()
result.index = [str(i) for i in result.index]
result = result.sort_index()
tm.assert_series_equal(result, expected)
def test_get_dummies_just_na(self, sparse):
just_na_list = [np.nan]
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index=["A"])
res_list = get_dummies(just_na_list, sparse=sparse)
res_series = get_dummies(just_na_series, sparse=sparse)
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
assert res_list.empty
assert res_series.empty
assert res_series_index.empty
assert res_list.index.tolist() == [0]
assert res_series.index.tolist() == [0]
assert res_series_index.index.tolist() == ["A"]
def test_get_dummies_include_na(self, sparse, dtype):
s = ["a", "b", np.nan]
res = get_dummies(s, sparse=sparse, dtype=dtype)
exp = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
)
if sparse:
if dtype.kind == "b":
exp = exp.apply(SparseArray, fill_value=False)
else:
exp = exp.apply(SparseArray, fill_value=0.0)
tm.assert_frame_equal(res, exp)
        # Sparse DataFrames do not allow NaN-labelled columns; see GH 8822
res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
exp_na = DataFrame(
{np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
dtype=self.effective_dtype(dtype),
)
exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
if sparse:
if dtype.kind == "b":
exp_na = exp_na.apply(SparseArray, fill_value=False)
else:
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
tm.assert_frame_equal(res_na, exp_na)
res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
exp_just_na = DataFrame(
Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
)
tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_get_dummies_unicode(self, sparse):
# See GH 6885 - get_dummies chokes on unicode values
e = "e"
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
exp = DataFrame(
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
)
if sparse:
exp = exp.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(res, exp)
def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=bool,
)
if sparse:
expected = DataFrame(
{
"A_a": SparseArray([1, 0, 1], dtype="bool"),
"A_b": SparseArray([0, 1, 0], dtype="bool"),
"B_b": SparseArray([1, 1, 0], dtype="bool"),
"B_c": SparseArray([0, 0, 1], dtype="bool"),
}
)
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
# GH44965
df = df[["A", "B"]]
df = df.astype({"A": "object", "B": "string"})
result = get_dummies(df)
expected = DataFrame(
{
"A_a": [1, 0, 1],
"A_b": [0, 1, 0],
"B_b": [1, 1, 0],
"B_c": [0, 0, 1],
},
dtype=bool,
)
if not using_infer_string:
# infer_string returns numpy bools
expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
if sparse:
arr = SparseArray
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3],
"A_a": arr([1, 0, 1], dtype=typ),
"A_b": arr([0, 1, 0], dtype=typ),
"B_b": arr([1, 1, 0], dtype=typ),
"B_c": arr([0, 0, 1], dtype=typ),
}
)
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_list(self, df, sparse):
prefixes = ["from_A", "from_B"]
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [True, False, True],
"from_A_b": [False, True, False],
"from_B_b": [True, True, False],
"from_B_c": [False, False, True],
},
)
expected[["C"]] = df[["C"]]
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected = expected[["C"] + cols]
typ = SparseArray if sparse else Series
expected[cols] = expected[cols].apply(lambda x: typ(x))
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_str(self, df, sparse):
# not that you should do this...
result = get_dummies(df, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["C"] + bad_columns,
)
expected = expected.astype({"C": np.int64})
if sparse:
# work around astyping & assigning with duplicate columns
# https://github.com/pandas-dev/pandas/issues/14427
expected = pd.concat(
[
Series([1, 2, 3], name="C"),
Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
],
axis=1,
)
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_subset(self, df, sparse):
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
expected = DataFrame(
{
"B": ["b", "b", "c"],
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
},
)
cols = expected.columns
expected[cols[1:]] = expected[cols[1:]].astype(bool)
expected[["C"]] = df[["C"]]
if sparse:
cols = ["from_A_a", "from_A_b"]
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"A..a": [True, False, True],
"A..b": [False, True, False],
"B..b": [True, True, False],
"B..c": [False, False, True],
},
)
expected[["C"]] = df[["C"]]
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
if sparse:
cols = ["A..a", "A..b", "B..b", "B..c"]
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
tm.assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
tm.assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
msg = re.escape(
"Length of 'prefix' (1) did not match the length of the columns being "
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix=["too few"], sparse=sparse)
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
msg = re.escape(
"Length of 'prefix_sep' (1) did not match the length of the columns being "
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix_sep=["bad"], sparse=sparse)
def test_dataframe_dummies_prefix_dict(self, sparse):
prefixes = {"A": "from_A", "B": "from_B"}
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
"from_B_b": [1, 1, 0],
"from_B_c": [0, 0, 1],
}
)
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected[columns] = expected[columns].astype(bool)
if sparse:
expected[columns] = expected[columns].astype(SparseDtype("bool", False))
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
axis=1
)
if sparse:
arr = SparseArray
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3, np.nan],
"A_a": arr([1, 0, 1, 0], dtype=typ),
"A_b": arr([0, 1, 0, 0], dtype=typ),
"A_nan": arr([0, 0, 0, 1], dtype=typ),
"B_b": arr([1, 1, 0, 0], dtype=typ),
"B_c": arr([0, 0, 1, 0], dtype=typ),
"B_nan": arr([0, 0, 0, 1], dtype=typ),
}
).sort_index(axis=1)
tm.assert_frame_equal(result, expected)
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
df["cat"] = Categorical(["x", "y", "y"])
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
if sparse:
arr = SparseArray
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3],
"A_a": arr([1, 0, 1], dtype=typ),
"A_b": arr([0, 1, 0], dtype=typ),
"B_b": arr([1, 1, 0], dtype=typ),
"B_c": arr([0, 0, 1], dtype=typ),
"cat_x": arr([1, 0, 0], dtype=typ),
"cat_y": arr([0, 1, 1], dtype=typ),
}
).sort_index(axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"get_dummies_kwargs,expected",
[
(
{"data": DataFrame({"ä": ["a"]})},
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["ä"]})},
DataFrame({"x_ä": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
DataFrame({"xäa": [True]}),
),
],
)
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
# GH22084 get_dummies incorrectly encodes unicode characters
# in dataframe column names
result = get_dummies(**get_dummies_kwargs)
tm.assert_frame_equal(result, expected)
def test_get_dummies_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
s_list = list("abc")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
expected = expected.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
expected.index = list("ABC")
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
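# Illustrative sketch (not from the original suite): drop_first=True encodes k
# levels with k-1 indicator columns by dropping the first level, which avoids
# the perfect collinearity ("dummy variable trap") that GH12402 addresses. The
# helper name is hypothetical.
def _sketch_drop_first():
    import pandas as pd

    s = pd.Series(list("abc"))
    full = pd.get_dummies(s)                      # columns a, b, c
    reduced = pd.get_dummies(s, drop_first=True)  # columns b, c
    return full, reduced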
def test_get_dummies_basic_drop_first_one_level(self, sparse):
# Test the case where the categorical variable has only one level.
s_list = list("aaa")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame(index=RangeIndex(3))
result = get_dummies(s_list, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
expected = DataFrame(index=list("ABC"))
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
if sparse:
exp = exp.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(res, exp)
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
["b", np.nan], axis=1
)
if sparse:
exp_na = exp_na.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(res_na, exp_na)
res_just_na = get_dummies(
[np.nan], dummy_na=True, drop_first=True, sparse=sparse
)
exp_just_na = DataFrame(index=RangeIndex(1))
tm.assert_frame_equal(res_just_na, exp_just_na)
def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
if sparse:
expected = expected.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
df["cat"] = Categorical(["x", "y", "y"])
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame(
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
cols = ["A_b", "B_c", "cat_y"]
expected[cols] = expected[cols].astype(bool)
expected = expected[["C", "A_b", "B_c", "cat_y"]]
if sparse:
for col in cols:
expected[col] = SparseArray(expected[col])
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(
df, dummy_na=True, drop_first=True, sparse=sparse
).sort_index(axis=1)
expected = DataFrame(
{
"C": [1, 2, 3, np.nan],
"A_b": [0, 1, 0, 0],
"A_nan": [0, 0, 0, 1],
"B_c": [0, 0, 1, 0],
"B_nan": [0, 0, 0, 1],
}
)
cols = ["A_b", "A_nan", "B_c", "B_nan"]
expected[cols] = expected[cols].astype(bool)
expected = expected.sort_index(axis=1)
if sparse:
for col in cols:
expected[col] = SparseArray(expected[col])
tm.assert_frame_equal(result, expected)
result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
expected = expected[["C", "A_b", "B_c"]]
tm.assert_frame_equal(result, expected)
def test_get_dummies_int_int(self):
data = Series([1, 2, 1])
result = get_dummies(data)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
tm.assert_frame_equal(result, expected)
data = Series(Categorical(["a", "b", "a"]))
result = get_dummies(data)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
)
tm.assert_frame_equal(result, expected)
def test_get_dummies_int_df(self, dtype):
data = DataFrame(
{
"A": [1, 2, 1],
"B": Categorical(["a", "b", "a"]),
"C": [1, 2, 1],
"D": [1.0, 2.0, 1.0],
}
)
columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
expected = DataFrame(
[[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
columns=columns,
)
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
result = get_dummies(data, columns=["A", "B"], dtype=dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
# GH13854
cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
result = get_dummies(cat, dtype=dtype)
data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
cols = CategoricalIndex(
cat.categories, categories=cat.categories, ordered=ordered
)
expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sparse", [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
df = get_dummies(df, columns=["Nation"], sparse=sparse)
df2 = df.reindex(columns=["GDP"])
tm.assert_frame_equal(df[["GDP"]], df2)
def test_get_dummies_duplicate_columns(self, df):
# GH20839
df.columns = ["A", "A", "A"]
result = get_dummies(df).sort_index(axis=1)
expected = DataFrame(
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["A", "A_a", "A_b", "A_b", "A_c"],
).sort_index(axis=1)
expected = expected.astype({"A": np.int64})
tm.assert_frame_equal(result, expected)
def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
dtype = SparseDtype("bool", False)
expected = DataFrame(
{
"A_1": SparseArray([1, 0], dtype=dtype),
"A_2": SparseArray([0, 1], dtype=dtype),
}
)
tm.assert_frame_equal(result, expected)
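# Illustrative sketch (not from the original suite): sparse=True stores each
# dummy column as a SparseArray with fill_value=False, which can cut memory
# substantially when a column has many levels, since most entries of every
# indicator are False. The helper name and sizes are hypothetical.
def _sketch_sparse_dummies():
    import pandas as pd

    s = pd.Series(list("abcde") * 200)
    dense = pd.get_dummies(s).memory_usage(deep=True).sum()
    sparse = pd.get_dummies(s, sparse=True).memory_usage(deep=True).sum()
    return dense, sparse  # the sparse frame should be markedly smaller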
@pytest.mark.parametrize("values", ["baz"])
def test_get_dummies_with_string_values(self, values):
# issue #28383
df = DataFrame(
{
"bar": [1, 2, 3, 4, 5, 6],
"foo": ["one", "one", "one", "two", "two", "two"],
"baz": ["A", "B", "C", "A", "B", "C"],
"zoo": ["x", "y", "z", "q", "w", "t"],
}
)
msg = "Input must be a list-like for parameter `columns`"
with pytest.raises(TypeError, match=msg):
get_dummies(df, columns=values)
def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
# GH#32430
ser = Series(list("abca"))
result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
expected = DataFrame(
{"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
dtype=any_numeric_ea_and_arrow_dtype,
)
tm.assert_frame_equal(result, expected)
def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
# GH#32430
df = DataFrame({"x": list("abca")})
result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
expected = DataFrame(
{"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
dtype=any_numeric_ea_and_arrow_dtype,
)
tm.assert_frame_equal(result, expected)
@td.skip_if_no("pyarrow")
def test_get_dummies_ea_dtype(self):
# GH#56273
for dtype, exp_dtype in [
("string[pyarrow]", "boolean"),
("string[pyarrow_numpy]", "bool"),
(CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"),
(CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"),
]:
df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
result = get_dummies(df)
expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
tm.assert_frame_equal(result, expected)
@td.skip_if_no("pyarrow")
def test_get_dummies_arrow_dtype(self):
# GH#56273
df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1})
result = get_dummies(df)
expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")})
tm.assert_frame_equal(result, expected)
df = DataFrame(
{
"name": Series(
["a"],
dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))),
),
"x": 1,
}
)
result = get_dummies(df)
tm.assert_frame_equal(result, expected)
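# Illustrative sketch (not from the original suite): as the GH#56273 tests
# above assert, pyarrow-backed string input yields pyarrow-backed boolean
# dummies rather than numpy bools. The helper name is hypothetical; requires
# pyarrow.
def _sketch_arrow_dummies():
    import pandas as pd
    import pyarrow as pa

    s = pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
    return pd.get_dummies(s).dtypes  # bool[pyarrow] for each dummy column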

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,254 @@
import numpy as np
import pytest
from pandas._libs import lib
import pandas as pd
from pandas import (
Index,
MultiIndex,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"input_index, input_columns, input_values, "
"expected_values, expected_columns, expected_index",
[
(
["lev4"],
"lev3",
"values",
[
[0.0, np.nan],
[np.nan, 1.0],
[2.0, np.nan],
[np.nan, 3.0],
[4.0, np.nan],
[np.nan, 5.0],
[6.0, np.nan],
[np.nan, 7.0],
],
Index([1, 2], name="lev3"),
Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
),
(
["lev4"],
"lev3",
lib.no_default,
[
[1.0, np.nan, 1.0, np.nan, 0.0, np.nan],
[np.nan, 1.0, np.nan, 1.0, np.nan, 1.0],
[1.0, np.nan, 2.0, np.nan, 2.0, np.nan],
[np.nan, 1.0, np.nan, 2.0, np.nan, 3.0],
[2.0, np.nan, 1.0, np.nan, 4.0, np.nan],
[np.nan, 2.0, np.nan, 1.0, np.nan, 5.0],
[2.0, np.nan, 2.0, np.nan, 6.0, np.nan],
[np.nan, 2.0, np.nan, 2.0, np.nan, 7.0],
],
MultiIndex.from_tuples(
[
("lev1", 1),
("lev1", 2),
("lev2", 1),
("lev2", 2),
("values", 1),
("values", 2),
],
names=[None, "lev3"],
),
Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
),
(
["lev1", "lev2"],
"lev3",
"values",
[[0, 1], [2, 3], [4, 5], [6, 7]],
Index([1, 2], name="lev3"),
MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
),
),
(
["lev1", "lev2"],
"lev3",
lib.no_default,
[[1, 2, 0, 1], [3, 4, 2, 3], [5, 6, 4, 5], [7, 8, 6, 7]],
MultiIndex.from_tuples(
[("lev4", 1), ("lev4", 2), ("values", 1), ("values", 2)],
names=[None, "lev3"],
),
MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
),
),
],
)
def test_pivot_list_like_index(
input_index,
input_columns,
input_values,
expected_values,
expected_columns,
expected_index,
):
# GH 21425, test when index is given a list
df = pd.DataFrame(
{
"lev1": [1, 1, 1, 1, 2, 2, 2, 2],
"lev2": [1, 1, 2, 2, 1, 1, 2, 2],
"lev3": [1, 2, 1, 2, 1, 2, 1, 2],
"lev4": [1, 2, 3, 4, 5, 6, 7, 8],
"values": [0, 1, 2, 3, 4, 5, 6, 7],
}
)
result = df.pivot(index=input_index, columns=input_columns, values=input_values)
expected = pd.DataFrame(
expected_values, columns=expected_columns, index=expected_index
)
tm.assert_frame_equal(result, expected)
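# Illustrative sketch (not from the original suite): passing a list to `index`
# builds a MultiIndex on the rows, while a scalar gives a flat Index; columns
# and values are handled the same either way. The helper name and data are
# hypothetical.
def _sketch_pivot_list_index():
    import pandas as pd

    df = pd.DataFrame(
        {
            "lev1": [1, 1, 2, 2],
            "lev2": [1, 2, 1, 2],
            "lev3": [1, 1, 2, 2],
            "values": [0, 1, 2, 3],
        }
    )
    # Rows become the MultiIndex (lev1, lev2); lev3 spreads across columns.
    return df.pivot(index=["lev1", "lev2"], columns="lev3", values="values")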
@pytest.mark.parametrize(
"input_index, input_columns, input_values, "
"expected_values, expected_columns, expected_index",
[
(
"lev4",
["lev3"],
"values",
[
[0.0, np.nan],
[np.nan, 1.0],
[2.0, np.nan],
[np.nan, 3.0],
[4.0, np.nan],
[np.nan, 5.0],
[6.0, np.nan],
[np.nan, 7.0],
],
Index([1, 2], name="lev3"),
Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
),
(
["lev1", "lev2"],
["lev3"],
"values",
[[0, 1], [2, 3], [4, 5], [6, 7]],
Index([1, 2], name="lev3"),
MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
),
),
(
["lev1"],
["lev2", "lev3"],
"values",
[[0, 1, 2, 3], [4, 5, 6, 7]],
MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev2", "lev3"]
),
Index([1, 2], name="lev1"),
),
(
["lev1", "lev2"],
["lev3", "lev4"],
"values",
[
[0.0, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, 2.0, 3.0, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 6.0, 7.0],
],
MultiIndex.from_tuples(
[(1, 1), (2, 2), (1, 3), (2, 4), (1, 5), (2, 6), (1, 7), (2, 8)],
names=["lev3", "lev4"],
),
MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
),
),
],
)
def test_pivot_list_like_columns(
input_index,
input_columns,
input_values,
expected_values,
expected_columns,
expected_index,
):
# GH 21425, test when columns is given a list
df = pd.DataFrame(
{
"lev1": [1, 1, 1, 1, 2, 2, 2, 2],
"lev2": [1, 1, 2, 2, 1, 1, 2, 2],
"lev3": [1, 2, 1, 2, 1, 2, 1, 2],
"lev4": [1, 2, 3, 4, 5, 6, 7, 8],
"values": [0, 1, 2, 3, 4, 5, 6, 7],
}
)
result = df.pivot(index=input_index, columns=input_columns, values=input_values)
expected = pd.DataFrame(
expected_values, columns=expected_columns, index=expected_index
)
tm.assert_frame_equal(result, expected)
def test_pivot_multiindexed_rows_and_cols(using_array_manager):
# GH 36360
df = pd.DataFrame(
data=np.arange(12).reshape(4, 3),
columns=MultiIndex.from_tuples(
[(0, 0), (0, 1), (0, 2)], names=["col_L0", "col_L1"]
),
index=MultiIndex.from_tuples(
[(0, 0, 0), (0, 0, 1), (1, 1, 1), (1, 0, 0)],
names=["idx_L0", "idx_L1", "idx_L2"],
),
)
res = df.pivot_table(
index=["idx_L0"],
columns=["idx_L1"],
values=[(0, 1)],
aggfunc=lambda col: col.values.sum(),
)
expected = pd.DataFrame(
data=[[5, np.nan], [10, 7.0]],
columns=MultiIndex.from_tuples(
[(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"]
),
index=Index([0, 1], dtype="int64", name="idx_L0"),
)
if not using_array_manager:
# BlockManager does not preserve the dtypes
expected = expected.astype("float64")
tm.assert_frame_equal(res, expected)
def test_pivot_df_multiindex_index_none():
# GH 23955
df = pd.DataFrame(
[
["A", "A1", "label1", 1],
["A", "A2", "label2", 2],
["B", "A1", "label1", 3],
["B", "A2", "label2", 4],
],
columns=["index_1", "index_2", "label", "value"],
)
df = df.set_index(["index_1", "index_2"])
result = df.pivot(columns="label", values="value")
expected = pd.DataFrame(
[[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]],
index=df.index,
columns=Index(["label1", "label2"], name="label"),
)
tm.assert_frame_equal(result, expected)
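# Illustrative sketch (not from the original suite): when `index` is omitted,
# DataFrame.pivot keeps the existing (possibly MultiIndex) row index and only
# spreads `columns` across the top, which is the GH 23955 behavior tested
# above. The helper name and data are hypothetical.
def _sketch_pivot_existing_index():
    import pandas as pd

    df = pd.DataFrame(
        {"k": ["a", "a"], "label": ["x", "y"], "value": [1, 2]}
    ).set_index("k")
    return df.pivot(columns="label", values="value")  # index stays "k"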

View File

@@ -0,0 +1,305 @@
import os
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DatetimeIndex,
Interval,
IntervalIndex,
NaT,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
cut,
date_range,
isna,
qcut,
timedelta_range,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype
from pandas.tseries.offsets import Day
def test_qcut():
arr = np.random.default_rng(2).standard_normal(1000)
# We store the bins as an Index that has been
# rounded, so comparisons are a bit tricky.
labels, _ = qcut(arr, 4, retbins=True)
ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
result = labels.categories.left.values
assert np.allclose(result, ex_bins[:-1], atol=1e-2)
result = labels.categories.right.values
assert np.allclose(result, ex_bins[1:], atol=1e-2)
ex_levels = cut(arr, ex_bins, include_lowest=True)
tm.assert_categorical_equal(labels, ex_levels)
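# Illustrative sketch (not from the original suite): cut() slices the value
# range into equal-width bins, while qcut() slices at sample quantiles so each
# bin holds roughly the same number of observations. The helper name is
# hypothetical.
def _sketch_cut_vs_qcut():
    import numpy as np
    import pandas as pd

    arr = np.random.default_rng(0).standard_normal(1000)
    by_width = pd.Series(pd.cut(arr, 4)).value_counts()   # uneven counts
    by_count = pd.Series(pd.qcut(arr, 4)).value_counts()  # ~250 per bin
    return by_width, by_count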
def test_qcut_bounds():
arr = np.random.default_rng(2).standard_normal(1000)
factor = qcut(arr, 10, labels=False)
assert len(np.unique(factor)) == 10
def test_qcut_specify_quantiles():
arr = np.random.default_rng(2).standard_normal(100)
factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])
expected = qcut(arr, 4)
tm.assert_categorical_equal(factor, expected)
def test_qcut_all_bins_same():
with pytest.raises(ValueError, match="edges.*unique"):
qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
def test_qcut_include_lowest():
values = np.arange(10)
ii = qcut(values, 4)
ex_levels = IntervalIndex(
[
Interval(-0.001, 2.25),
Interval(2.25, 4.5),
Interval(4.5, 6.75),
Interval(6.75, 9),
]
)
tm.assert_index_equal(ii.categories, ex_levels)
def test_qcut_nas():
arr = np.random.default_rng(2).standard_normal(100)
arr[:20] = np.nan
result = qcut(arr, 4)
assert isna(result[:20]).all()
def test_qcut_index():
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
def test_qcut_binning_issues(datapath):
# see gh-1978, gh-1979
cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
arr = np.loadtxt(cut_file)
result = qcut(arr, 20)
starts = []
ends = []
for lev in np.unique(result):
s = lev.left
e = lev.right
assert s != e
starts.append(float(s))
ends.append(float(e))
for (sp, sn), (ep, en) in zip(
zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
):
assert sp < sn
assert ep < en
assert ep <= sn
def test_qcut_return_intervals():
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = qcut(ser, [0, 0.333, 0.666, 1])
exp_levels = np.array(
[Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
)
exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
CategoricalDtype(ordered=True)
)
tm.assert_series_equal(res, exp)
@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_qcut_incorrect_labels(labels):
# GH 13318
values = range(5)
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
qcut(values, 4, labels=labels)
@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
def test_qcut_wrong_length_labels(labels):
# GH 13318
values = range(10)
msg = "Bin labels must be one fewer than the number of bin edges"
with pytest.raises(ValueError, match=msg):
qcut(values, 4, labels=labels)
@pytest.mark.parametrize(
"labels, expected",
[
(["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
(list(range(3)), Categorical([0, 1, 2], ordered=True)),
],
)
def test_qcut_list_like_labels(labels, expected):
# GH 13318
values = range(3)
result = qcut(values, 3, labels=labels)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
({"duplicates": "drop"}, None),
({}, "Bin edges must be unique"),
({"duplicates": "raise"}, "Bin edges must be unique"),
({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
],
)
def test_qcut_duplicates_bin(kwargs, msg):
# see gh-7751
values = [0, 0, 0, 0, 1, 2, 3]
if msg is not None:
with pytest.raises(ValueError, match=msg):
qcut(values, 3, **kwargs)
else:
result = qcut(values, 3, **kwargs)
expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
tm.assert_index_equal(result.categories, expected)
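# Illustrative sketch (not from the original suite): with heavily tied data
# some quantile edges coincide; duplicates="drop" collapses the repeated edges
# into fewer bins instead of raising "Bin edges must be unique", as the
# parametrization above checks. The helper name is hypothetical.
def _sketch_qcut_duplicates():
    import pandas as pd

    values = [0, 0, 0, 0, 1, 2, 3]
    binned = pd.qcut(values, 3, duplicates="drop")
    return binned.categories  # only two bins survive: (-0.001, 1], (1, 3]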
@pytest.mark.parametrize(
"data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
# see gh-15431
ser = Series([data] * length)
result = qcut(ser, 1, labels=labels)
if labels is None:
intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
expected = Series(intervals).astype(CategoricalDtype(ordered=True))
else:
expected = Series([0] * length, dtype=np.intp)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"ser",
[
Series(DatetimeIndex(["20180101", NaT, "20180103"])),
Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
],
ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser, unit):
# see gh-19768
ser = ser.dt.as_unit(unit)
td = Timedelta(1, unit=unit).as_unit(unit)
left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
intervals = IntervalIndex.from_arrays(left, right)
expected = Series(Categorical(intervals, ordered=True))
result = qcut(ser, 2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
# see gh-19872
tz = "US/Eastern"
ser = Series(date_range("20130101", periods=3, tz=tz))
result = qcut(ser, bins)
expected = Series(
IntervalIndex(
[
Interval(
Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
Timestamp("2013-01-01 16:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-01 16:00:00", tz=tz),
Timestamp("2013-01-02 08:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-02 08:00:00", tz=tz),
Timestamp("2013-01-03 00:00:00", tz=tz),
),
]
)
).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"arg,expected_bins",
[
[
timedelta_range("1day", periods=3),
TimedeltaIndex(["1 days", "2 days", "3 days"]),
],
[
date_range("20180101", periods=3),
DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
],
],
)
def test_date_like_qcut_bins(arg, expected_bins):
# see gh-19891
ser = Series(arg)
result, result_bins = qcut(ser, 2, retbins=True)
tm.assert_index_equal(result_bins, expected_bins)
@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
"box, compare",
[
(Series, tm.assert_series_equal),
(np.array, tm.assert_categorical_equal),
(list, tm.assert_equal),
],
)
def test_qcut_bool_coercion_to_int(bins, box, compare):
# issue 20303
data_expected = box([0, 1, 1, 0, 1] * 10)
data_result = box([False, True, True, False, True] * 10)
expected = qcut(data_expected, bins, duplicates="drop")
result = qcut(data_result, bins, duplicates="drop")
compare(result, expected)
@pytest.mark.parametrize("q", [2, 5, 10])
def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
arr = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
arr[::2] = pd.NA
result = qcut(arr, q)
expected = qcut(arr.astype(float), q)
tm.assert_categorical_equal(result, expected)

View File

@@ -0,0 +1,365 @@
import numpy as np
import pytest
from pandas.core.dtypes.concat import union_categoricals
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
Series,
)
import pandas._testing as tm
class TestUnionCategoricals:
@pytest.mark.parametrize(
"a, b, combined",
[
(list("abc"), list("abd"), list("abcabd")),
([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
(
["b", "b", np.nan, "a"],
["a", np.nan, "c"],
["b", "b", np.nan, "a", "a", np.nan, "c"],
),
(
pd.date_range("2014-01-01", "2014-01-05"),
pd.date_range("2014-01-06", "2014-01-07"),
pd.date_range("2014-01-01", "2014-01-07"),
),
(
pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
),
(
pd.period_range("2014-01-01", "2014-01-05"),
pd.period_range("2014-01-06", "2014-01-07"),
pd.period_range("2014-01-01", "2014-01-07"),
),
],
)
@pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
def test_union_categorical(self, a, b, combined, box):
# GH 13361
result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
expected = Categorical(combined)
tm.assert_categorical_equal(result, expected)
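# Illustrative sketch (not from the original suite): union_categoricals keeps
# the result categorical and merges the category sets; naively concatenating
# the underlying codes would be wrong because the same code can stand for a
# different category in each input. The helper name is hypothetical.
def _sketch_union_categoricals():
    import pandas as pd
    from pandas.api.types import union_categoricals

    a = pd.Categorical(["b", "c"])  # codes 0, 1 -> b, c
    b = pd.Categorical(["a", "b"])  # codes 0, 1 -> a, b
    # Values ['b', 'c', 'a', 'b'], categories ['b', 'c', 'a']
    return union_categoricals([a, b])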
def test_union_categorical_ordered_appearance(self):
# new categories ordered by appearance
s = Categorical(["x", "y", "z"])
s2 = Categorical(["a", "b", "c"])
result = union_categoricals([s, s2])
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)
def test_union_categorical_ordered_true(self):
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_union_categorical_match_types(self):
# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
msg = "dtype of categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([s, s2])
def test_union_categorical_empty(self):
msg = "No Categoricals to union"
with pytest.raises(ValueError, match=msg):
union_categoricals([])
def test_union_categoricals_nan(self):
# GH 13759
res = union_categoricals(
[Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
)
exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
tm.assert_categorical_equal(res, exp)
res = union_categoricals(
[Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
)
exp = Categorical(["A", "B", "B", "B", np.nan])
tm.assert_categorical_equal(res, exp)
val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]
res = union_categoricals([Categorical(val1), Categorical(val2)])
exp = Categorical(
val1 + val2,
categories=[
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-03-01"),
pd.Timestamp("2011-02-01"),
],
)
tm.assert_categorical_equal(res, exp)
# all NaN
res = union_categoricals(
[
Categorical(np.array([np.nan, np.nan], dtype=object)),
Categorical(["X"], categories=pd.Index(["X"], dtype=object)),
]
)
exp = Categorical([np.nan, np.nan, "X"])
tm.assert_categorical_equal(res, exp)
res = union_categoricals(
[Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
)
exp = Categorical([np.nan, np.nan, np.nan, np.nan])
tm.assert_categorical_equal(res, exp)
@pytest.mark.parametrize("val", [[], ["1"]])
def test_union_categoricals_empty(self, val, request, using_infer_string):
# GH 13759
if using_infer_string and val == ["1"]:
request.applymarker(pytest.mark.xfail(reason="object and strings don't match"))
res = union_categoricals([Categorical([]), Categorical(val)])
exp = Categorical(val)
tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_category(self):
# check fastpath
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
res = union_categoricals([c1, c2])
exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_category_str(self):
c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
res = union_categoricals([c1, c2])
exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/19096
c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2])
expected = Categorical(
["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_ordered(self):
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], ordered=False)
msg = "Categorical.ordered must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])
res = union_categoricals([c1, c1])
exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
res = union_categoricals([c1, c2])
exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
msg = "to union ordered Categoricals, all categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])
def test_union_categoricals_ignore_order(self):
# GH 15219
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], ordered=False)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
msg = "Categorical.ordered must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2], ignore_order=False)
res = union_categoricals([c1, c1], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([c1, c1], ignore_order=False)
exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, np.nan, 3, 2])
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([4, 5, 6], ordered=True)
result = union_categoricals([c1, c2], ignore_order=True)
expected = Categorical([1, 2, 3, 4, 5, 6])
tm.assert_categorical_equal(result, expected)
msg = "to union ordered Categoricals, all categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2], ignore_order=False)
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])
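# Illustrative sketch (not from the original suite): ignore_order=True treats
# both inputs as unordered for the union, so ordered categoricals with
# different category orders can still be combined; the result is unordered.
# The helper name is hypothetical.
def _sketch_ignore_order():
    import pandas as pd
    from pandas.api.types import union_categoricals

    c1 = pd.Categorical([1, 2], ordered=True)
    c2 = pd.Categorical([2, 1], categories=[2, 1], ordered=True)
    return union_categoricals([c1, c2], ignore_order=True)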
def test_union_categoricals_sort(self):
# GH 13846
c1 = Categorical(["x", "y", "z"])
c2 = Categorical(["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
)
tm.assert_categorical_equal(result, expected)
# fastpath
c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
# fastpath - skip resort
c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["x", np.nan])
c2 = Categorical([np.nan, "b"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([])
c2 = Categorical([])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical([])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
msg = "Cannot use sort_categories=True with ordered Categoricals"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2], sort_categories=True)
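# Illustrative sketch (not from the original suite): sort_categories=True
# lexsorts the union's category list while leaving the values untouched; it is
# rejected for ordered inputs because it would silently change their ordering.
# The helper name is hypothetical.
def _sketch_sort_categories():
    import pandas as pd
    from pandas.api.types import union_categoricals

    c1 = pd.Categorical(["x", "y"])
    c2 = pd.Categorical(["a"])
    out = union_categoricals([c1, c2], sort_categories=True)
    return list(out.categories)  # ['a', 'x', 'y']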
def test_union_categoricals_sort_false(self):
# GH 13846
c1 = Categorical(["x", "y", "z"])
c2 = Categorical(["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_fastpath(self):
# fastpath
c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_skipresort(self):
# fastpath - skip resort
c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_one_nan(self):
c1 = Categorical(["x", np.nan])
c2 = Categorical([np.nan, "b"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_only_nan(self):
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_empty(self):
c1 = Categorical([])
c2 = Categorical([])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_ordered_true(self):
c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(
["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_union_categorical_unwrap(self):
# GH 14173
c1 = Categorical(["a", "b"])
c2 = Series(["b", "c"], dtype="category")
result = union_categoricals([c1, c2])
expected = Categorical(["a", "b", "b", "c"])
tm.assert_categorical_equal(result, expected)
c2 = CategoricalIndex(c2)
result = union_categoricals([c1, c2])
tm.assert_categorical_equal(result, expected)
c1 = Series(c1)
result = union_categoricals([c1, c2])
tm.assert_categorical_equal(result, expected)
msg = "all components to combine must be Categorical"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, ["a", "b", "c"]])

View File

@@ -0,0 +1,79 @@
import numpy as np
import pytest
from pandas import (
Index,
date_range,
)
import pandas._testing as tm
from pandas.core.reshape.util import cartesian_product
class TestCartesianProduct:
def test_simple(self):
x, y = list("ABC"), [1, 22]
result1, result2 = cartesian_product([x, y])
expected1 = np.array(["A", "A", "B", "B", "C", "C"])
expected2 = np.array([1, 22, 1, 22, 1, 22])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)
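# Illustrative sketch (not from the original suite): cartesian_product (an
# internal helper, imported here from the same module the tests use) returns
# one broadcast array per input factor rather than tuples; zipping its outputs
# reproduces itertools.product over the same factors. The helper name is
# hypothetical.
def _sketch_cartesian_product():
    from itertools import product

    from pandas.core.reshape.util import cartesian_product

    x, y = ["A", "B"], [1, 2]
    a, b = cartesian_product([x, y])
    assert list(zip(a, b)) == list(product(x, y))
    return a, b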
def test_datetimeindex(self):
# regression test for GitHub issue #6439
# make sure that the ordering on a DatetimeIndex is consistent
x = date_range("2000-01-01", periods=2)
result1, result2 = (Index(y).day for y in cartesian_product([x, x]))
expected1 = Index([1, 1, 2, 2], dtype=np.int32)
expected2 = Index([1, 2, 1, 2], dtype=np.int32)
tm.assert_index_equal(result1, expected1)
tm.assert_index_equal(result2, expected2)
def test_tzaware_retained(self):
x = date_range("2000-01-01", periods=2, tz="US/Pacific")
y = np.array([3, 4])
result1, result2 = cartesian_product([x, y])
expected = x.repeat(2)
tm.assert_index_equal(result1, expected)
def test_tzaware_retained_categorical(self):
x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category")
y = np.array([3, 4])
result1, result2 = cartesian_product([x, y])
expected = x.repeat(2)
tm.assert_index_equal(result1, expected)
@pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]])
def test_empty(self, x, y):
# product of empty factors
expected1 = np.array([], dtype=np.asarray(x).dtype)
expected2 = np.array([], dtype=np.asarray(y).dtype)
result1, result2 = cartesian_product([x, y])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)
def test_empty_input(self):
# empty product (empty input):
result = cartesian_product([])
expected = []
assert result == expected
@pytest.mark.parametrize(
"X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]]
)
def test_invalid_input(self, X):
msg = "Input must be a list-like of list-likes"
with pytest.raises(TypeError, match=msg):
cartesian_product(X=X)
def test_exceed_product_space(self):
# GH31355: raise a useful error when the product space is too large
msg = "Product space too large to allocate arrays!"
with pytest.raises(ValueError, match=msg):
dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [
(np.arange(15128, dtype=np.int16)),
]
cartesian_product(X=dims)