Updated script so it can be controlled by a Node.js web app
@@ -0,0 +1,7 @@
import pytest


@pytest.fixture(params=[True, False])
def sort(request):
    """Boolean sort keyword for concat and DataFrame.append."""
    return request.param
@@ -0,0 +1,389 @@
import datetime as dt
from itertools import combinations

import dateutil
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    Timestamp,
    concat,
    isna,
)
import pandas._testing as tm


class TestAppend:
    def test_append(self, sort, float_frame):
        mixed_frame = float_frame.copy()
        mixed_frame["foo"] = "bar"

        begin_index = float_frame.index[:5]
        end_index = float_frame.index[5:]

        begin_frame = float_frame.reindex(begin_index)
        end_frame = float_frame.reindex(end_index)

        appended = begin_frame._append(end_frame)
        tm.assert_almost_equal(appended["A"], float_frame["A"])

        del end_frame["A"]
        partial_appended = begin_frame._append(end_frame, sort=sort)
        assert "A" in partial_appended

        partial_appended = end_frame._append(begin_frame, sort=sort)
        assert "A" in partial_appended

        # mixed type handling
        appended = mixed_frame[:5]._append(mixed_frame[5:])
        tm.assert_frame_equal(appended, mixed_frame)

        # what to test here
        mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
        mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)

        # all equal except 'foo' column
        tm.assert_frame_equal(
            mixed_appended.reindex(columns=["A", "B", "C", "D"]),
            mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
        )

    def test_append_empty(self, float_frame):
        empty = DataFrame()

        appended = float_frame._append(empty)
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

        appended = empty._append(float_frame)
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

    def test_append_overlap_raises(self, float_frame):
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            float_frame._append(float_frame, verify_integrity=True)

    def test_append_new_columns(self):
        # see gh-6129: new columns
        df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
        row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
        expected = DataFrame(
            {
                "a": {"x": 1, "y": 2, "z": 5},
                "b": {"x": 3, "y": 4, "z": 6},
                "c": {"z": 7},
            }
        )
        result = df._append(row)
        tm.assert_frame_equal(result, expected)

    def test_append_length0_frame(self, sort):
        df = DataFrame(columns=["A", "B", "C"])
        df3 = DataFrame(index=[0, 1], columns=["A", "B"])
        df5 = df._append(df3, sort=sort)

        expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
        tm.assert_frame_equal(df5, expected)

    def test_append_records(self):
        arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
        arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

        arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
        arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]

        df1 = DataFrame(arr1)
        df2 = DataFrame(arr2)

        result = df1._append(df2, ignore_index=True)
        expected = DataFrame(np.concatenate((arr1, arr2)))
        tm.assert_frame_equal(result, expected)

    # rewrite sort fixture, since we also want to test default of None
    def test_append_sorts(self, sort):
        df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
        df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])

        result = df1._append(df2, sort=sort)

        # for None / True
        expected = DataFrame(
            {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
            columns=["a", "b", "c"],
        )
        if sort is False:
            expected = expected[["b", "a", "c"]]
        tm.assert_frame_equal(result, expected)

    def test_append_different_columns(self, sort):
        df = DataFrame(
            {
                "bools": np.random.default_rng(2).standard_normal(10) > 0,
                "ints": np.random.default_rng(2).integers(0, 10, 10),
                "floats": np.random.default_rng(2).standard_normal(10),
                "strings": ["foo", "bar"] * 5,
            }
        )

        a = df[:5].loc[:, ["bools", "ints", "floats"]]
        b = df[5:].loc[:, ["strings", "ints", "floats"]]

        appended = a._append(b, sort=sort)
        assert isna(appended["strings"][0:4]).all()
        assert isna(appended["bools"][5:]).all()

    def test_append_many(self, sort, float_frame):
        chunks = [
            float_frame[:5],
            float_frame[5:10],
            float_frame[10:15],
            float_frame[15:],
        ]

        result = chunks[0]._append(chunks[1:])
        tm.assert_frame_equal(result, float_frame)

        chunks[-1] = chunks[-1].copy()
        chunks[-1]["foo"] = "bar"
        result = chunks[0]._append(chunks[1:], sort=sort)
        tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
        assert (result["foo"][15:] == "bar").all()
        assert result["foo"][:15].isna().all()

    def test_append_preserve_index_name(self):
        # #980
        df1 = DataFrame(columns=["A", "B", "C"])
        df1 = df1.set_index(["A"])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
        df2 = df2.set_index(["A"])

        msg = "The behavior of array concatenation with empty entries is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df1._append(df2)
        assert result.index.name == "A"

    indexes_can_append = [
        pd.RangeIndex(3),
        Index([4, 5, 6]),
        Index([4.5, 5.5, 6.5]),
        Index(list("abc")),
        pd.CategoricalIndex("A B C".split()),
        pd.CategoricalIndex("D E F".split(), ordered=True),
        pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
        pd.DatetimeIndex(
            [
                dt.datetime(2013, 1, 3, 0, 0),
                dt.datetime(2013, 1, 3, 6, 10),
                dt.datetime(2013, 1, 3, 7, 12),
            ]
        ),
        pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
    ]

    @pytest.mark.parametrize(
        "index", indexes_can_append, ids=lambda x: type(x).__name__
    )
    def test_append_same_columns_type(self, index):
        # GH18359

        # df wider than ser
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
        ser_index = index[:2]
        ser = Series([7, 8], index=ser_index, name=2)
        result = df._append(ser)
        expected = DataFrame(
            [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
        )
        # integer dtype is preserved for columns present in ser.index
        assert expected.dtypes.iloc[0].kind == "i"
        assert expected.dtypes.iloc[1].kind == "i"

        tm.assert_frame_equal(result, expected)

        # ser wider than df
        ser_index = index
        index = index[:2]
        df = DataFrame([[1, 2], [4, 5]], columns=index)
        ser = Series([7, 8, 9], index=ser_index, name=2)
        result = df._append(ser)
        expected = DataFrame(
            [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
            index=[0, 1, 2],
            columns=ser_index,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df_columns, series_index",
        combinations(indexes_can_append, r=2),
        ids=lambda x: type(x).__name__,
    )
    def test_append_different_columns_types(self, df_columns, series_index):
        # GH18359
        # See also test 'test_append_different_columns_types_raises' below
        # for errors raised when appending

        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
        ser = Series([7, 8, 9], index=series_index, name=2)

        result = df._append(ser)
        idx_diff = ser.index.difference(df_columns)
        combined_columns = Index(df_columns.tolist()).append(idx_diff)
        expected = DataFrame(
            [
                [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
                [4, 5, 6, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, 7, 8, 9],
            ],
            index=[0, 1, 2],
            columns=combined_columns,
        )
        tm.assert_frame_equal(result, expected)

    def test_append_dtype_coerce(self, sort):
        # GH 4993
        # appending with datetime will incorrectly convert datetime64

        df1 = DataFrame(
            index=[1, 2],
            data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
            columns=["start_time"],
        )
        df2 = DataFrame(
            index=[4, 5],
            data=[
                [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
                [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
            ],
            columns=["start_time", "end_time"],
        )

        expected = concat(
            [
                Series(
                    [
                        pd.NaT,
                        pd.NaT,
                        dt.datetime(2013, 1, 3, 6, 10),
                        dt.datetime(2013, 1, 4, 7, 10),
                    ],
                    name="end_time",
                ),
                Series(
                    [
                        dt.datetime(2013, 1, 1, 0, 0),
                        dt.datetime(2013, 1, 2, 0, 0),
                        dt.datetime(2013, 1, 3, 0, 0),
                        dt.datetime(2013, 1, 4, 0, 0),
                    ],
                    name="start_time",
                ),
            ],
            axis=1,
            sort=sort,
        )
        result = df1._append(df2, ignore_index=True, sort=sort)
        if sort:
            expected = expected[["end_time", "start_time"]]
        else:
            expected = expected[["start_time", "end_time"]]

        tm.assert_frame_equal(result, expected)

    def test_append_missing_column_proper_upcast(self, sort):
        df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
        df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})

        appended = df1._append(df2, ignore_index=True, sort=sort)
        assert appended["A"].dtype == "f8"
        assert appended["B"].dtype == "O"

    def test_append_empty_frame_to_series_with_dateutil_tz(self):
        # GH 23682
        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
        ser = Series({"a": 1.0, "b": 2.0, "date": date})
        df = DataFrame(columns=["c", "d"])
        result_a = df._append(ser, ignore_index=True)
        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
        )
        # These columns get cast to object after append
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        tm.assert_frame_equal(result_a, expected)

        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
        )
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        result_b = result_a._append(ser, ignore_index=True)
        tm.assert_frame_equal(result_b, expected)

        result = df._append([ser, ser], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")

        # pd.NaT gets inferred as tz-naive, so append result is tz-naive
        result = df._append({"a": pd.NaT}, ignore_index=True)
        if using_array_manager:
            expected = DataFrame({"a": [pd.NaT]}, dtype=object)
        else:
            expected = DataFrame({"a": [np.nan]}, dtype=object)
        tm.assert_frame_equal(result, expected)

        # also test with typed value to append
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
        other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
        result = df._append(other, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # mismatched tz
        other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
        result = df._append(other, ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_empty_frame_with_timedelta64ns_nat(
        self, dtype_str, val, using_array_manager
    ):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype(dtype_str)

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df._append(other, ignore_index=True)

        expected = other.astype(object)
        if isinstance(val, str) and dtype_str != "int64" and not using_array_manager:
            # TODO: expected used to be `other.astype(object)` which is a more
            # reasonable result. This was changed when tightening
            # assert_frame_equal's treatment of mismatched NAs to match the
            # existing behavior.
            expected = DataFrame({"a": [np.nan]}, dtype=object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df._append(other, ignore_index=True)

        expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,753 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.fixture(
    params=list(
        {
            "bool": [True, False, True],
            "int64": [1, 2, 3],
            "float64": [1.1, np.nan, 3.3],
            "category": Categorical(["X", "Y", "Z"]),
            "object": ["a", "b", "c"],
            "datetime64[ns]": [
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-01-02"),
                pd.Timestamp("2011-01-03"),
            ],
            "datetime64[ns, US/Eastern]": [
                pd.Timestamp("2011-01-01", tz="US/Eastern"),
                pd.Timestamp("2011-01-02", tz="US/Eastern"),
                pd.Timestamp("2011-01-03", tz="US/Eastern"),
            ],
            "timedelta64[ns]": [
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
                pd.Timedelta("3 days"),
            ],
            "period[M]": [
                pd.Period("2011-01", freq="M"),
                pd.Period("2011-02", freq="M"),
                pd.Period("2011-03", freq="M"),
            ],
        }.items()
    )
)
def item(request):
    key, data = request.param
    return key, data


@pytest.fixture
def item2(item):
    return item


class TestConcatAppendCommon:
    """
    Test common dtype coercion rules between concat and append.
    """

    def test_dtypes(self, item, index_or_series, using_infer_string):
        # to confirm test case covers intended dtypes
        typ, vals = item
        obj = index_or_series(vals)
        if typ == "object" and using_infer_string:
            typ = "string"
        if isinstance(obj, Index):
            assert obj.dtype == typ
        elif isinstance(obj, Series):
            if typ.startswith("period"):
                assert obj.dtype == "Period[M]"
            else:
                assert obj.dtype == typ

    def test_concatlike_same_dtypes(self, item):
        # GH 13660
        typ1, vals1 = item

        vals2 = vals1
        vals3 = vals1

        if typ1 == "category":
            exp_data = Categorical(list(vals1) + list(vals2))
            exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3))
        else:
            exp_data = vals1 + vals2
            exp_data3 = vals1 + vals2 + vals3

        # ----- Index ----- #

        # index.append
        res = Index(vals1).append(Index(vals2))
        exp = Index(exp_data)
        tm.assert_index_equal(res, exp)

        # 3 elements
        res = Index(vals1).append([Index(vals2), Index(vals3)])
        exp = Index(exp_data3)
        tm.assert_index_equal(res, exp)

        # index.append name mismatch
        i1 = Index(vals1, name="x")
        i2 = Index(vals2, name="y")
        res = i1.append(i2)
        exp = Index(exp_data)
        tm.assert_index_equal(res, exp)

        # index.append name match
        i1 = Index(vals1, name="x")
        i2 = Index(vals2, name="x")
        res = i1.append(i2)
        exp = Index(exp_data, name="x")
        tm.assert_index_equal(res, exp)

        # cannot append non-index
        with pytest.raises(TypeError, match="all inputs must be Index"):
            Index(vals1).append(vals2)

        with pytest.raises(TypeError, match="all inputs must be Index"):
            Index(vals1).append([Index(vals2), vals3])

        # ----- Series ----- #

        # series.append
        res = Series(vals1)._append(Series(vals2), ignore_index=True)
        exp = Series(exp_data)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # concat
        res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # 3 elements
        res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
        exp = Series(exp_data3)
        tm.assert_series_equal(res, exp)

        res = pd.concat(
            [Series(vals1), Series(vals2), Series(vals3)],
            ignore_index=True,
        )
        tm.assert_series_equal(res, exp)

        # name mismatch
        s1 = Series(vals1, name="x")
        s2 = Series(vals2, name="y")
        res = s1._append(s2, ignore_index=True)
        exp = Series(exp_data)
        tm.assert_series_equal(res, exp, check_index_type=True)

        res = pd.concat([s1, s2], ignore_index=True)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # name match
        s1 = Series(vals1, name="x")
        s2 = Series(vals2, name="x")
        res = s1._append(s2, ignore_index=True)
        exp = Series(exp_data, name="x")
        tm.assert_series_equal(res, exp, check_index_type=True)

        res = pd.concat([s1, s2], ignore_index=True)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # cannot append non-index
        msg = (
            r"cannot concatenate object of type '.+'; "
            "only Series and DataFrame objs are valid"
        )
        with pytest.raises(TypeError, match=msg):
            Series(vals1)._append(vals2)

        with pytest.raises(TypeError, match=msg):
            Series(vals1)._append([Series(vals2), vals3])

        with pytest.raises(TypeError, match=msg):
            pd.concat([Series(vals1), vals2])

        with pytest.raises(TypeError, match=msg):
            pd.concat([Series(vals1), Series(vals2), vals3])

    def test_concatlike_dtypes_coercion(self, item, item2, request):
        # GH 13660
        typ1, vals1 = item
        typ2, vals2 = item2

        vals3 = vals2

        # basically infer
        exp_index_dtype = None
        exp_series_dtype = None

        if typ1 == typ2:
            pytest.skip("same dtype is tested in test_concatlike_same_dtypes")
        elif typ1 == "category" or typ2 == "category":
            pytest.skip("categorical type tested elsewhere")

        # specify expected dtype
        if typ1 == "bool" and typ2 in ("int64", "float64"):
            # series coerces to numeric based on numpy rule
            # index doesn't because bool is object dtype
            exp_series_dtype = typ2
            mark = pytest.mark.xfail(reason="GH#39187 casting to object")
            request.applymarker(mark)
        elif typ2 == "bool" and typ1 in ("int64", "float64"):
            exp_series_dtype = typ1
            mark = pytest.mark.xfail(reason="GH#39187 casting to object")
            request.applymarker(mark)
        elif typ1 in {"datetime64[ns, US/Eastern]", "timedelta64[ns]"} or typ2 in {
            "datetime64[ns, US/Eastern]",
            "timedelta64[ns]",
        }:
            exp_index_dtype = object
            exp_series_dtype = object

        exp_data = vals1 + vals2
        exp_data3 = vals1 + vals2 + vals3

        # ----- Index ----- #

        # index.append
        # GH#39817
        res = Index(vals1).append(Index(vals2))
        exp = Index(exp_data, dtype=exp_index_dtype)
        tm.assert_index_equal(res, exp)

        # 3 elements
        res = Index(vals1).append([Index(vals2), Index(vals3)])
        exp = Index(exp_data3, dtype=exp_index_dtype)
        tm.assert_index_equal(res, exp)

        # ----- Series ----- #

        # series._append
        # GH#39817
        res = Series(vals1)._append(Series(vals2), ignore_index=True)
        exp = Series(exp_data, dtype=exp_series_dtype)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # concat
        # GH#39817
        res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
        tm.assert_series_equal(res, exp, check_index_type=True)

        # 3 elements
        # GH#39817
        res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
        exp = Series(exp_data3, dtype=exp_series_dtype)
        tm.assert_series_equal(res, exp)

        # GH#39817
        res = pd.concat(
            [Series(vals1), Series(vals2), Series(vals3)],
            ignore_index=True,
        )
        tm.assert_series_equal(res, exp)

    def test_concatlike_common_coerce_to_pandas_object(self):
        # GH 13626
        # result must be Timestamp/Timedelta, not datetime.datetime/timedelta
        dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"])
        tdi = pd.TimedeltaIndex(["1 days", "2 days"])

        exp = Index(
            [
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-01-02"),
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
            ]
        )

        res = dti.append(tdi)
        tm.assert_index_equal(res, exp)
        assert isinstance(res[0], pd.Timestamp)
        assert isinstance(res[-1], pd.Timedelta)

        dts = Series(dti)
        tds = Series(tdi)
        res = dts._append(tds)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
        assert isinstance(res.iloc[0], pd.Timestamp)
        assert isinstance(res.iloc[-1], pd.Timedelta)

        res = pd.concat([dts, tds])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
        assert isinstance(res.iloc[0], pd.Timestamp)
        assert isinstance(res.iloc[-1], pd.Timedelta)

    def test_concatlike_datetimetz(self, tz_aware_fixture):
        tz = tz_aware_fixture
        # GH 7795
        dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
        dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz)

        exp = pd.DatetimeIndex(
            ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz
        )

        res = dti1.append(dti2)
        tm.assert_index_equal(res, exp)

        dts1 = Series(dti1)
        dts2 = Series(dti2)
        res = dts1._append(dts2)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts2])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

    @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"])
    def test_concatlike_datetimetz_short(self, tz):
        # GH#7795
        ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz)
        ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz)
        df1 = DataFrame(0, index=ix1, columns=["A", "B"])
        df2 = DataFrame(0, index=ix2, columns=["A", "B"])

        exp_idx = pd.DatetimeIndex(
            ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"],
            tz=tz,
        ).as_unit("ns")
        exp = DataFrame(0, index=exp_idx, columns=["A", "B"])

        tm.assert_frame_equal(df1._append(df2), exp)
        tm.assert_frame_equal(pd.concat([df1, df2]), exp)

    def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
        tz = tz_aware_fixture
        # GH 13660

        # different tz coerces to object
        dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
        dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"])

        exp = Index(
            [
                pd.Timestamp("2011-01-01", tz=tz),
                pd.Timestamp("2011-01-02", tz=tz),
                pd.Timestamp("2012-01-01"),
                pd.Timestamp("2012-01-02"),
            ],
            dtype=object,
        )

        res = dti1.append(dti2)
        tm.assert_index_equal(res, exp)

        dts1 = Series(dti1)
        dts2 = Series(dti2)
        res = dts1._append(dts2)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts2])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        # different tz
        dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific")

        exp = Index(
            [
                pd.Timestamp("2011-01-01", tz=tz),
                pd.Timestamp("2011-01-02", tz=tz),
                pd.Timestamp("2012-01-01", tz="US/Pacific"),
                pd.Timestamp("2012-01-02", tz="US/Pacific"),
            ],
            dtype=object,
        )

        res = dti1.append(dti3)
        tm.assert_index_equal(res, exp)

        dts1 = Series(dti1)
        dts3 = Series(dti3)
        res = dts1._append(dts3)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts3])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

    def test_concatlike_common_period(self):
        # GH 13660
        pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
        pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M")

        exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M")

        res = pi1.append(pi2)
        tm.assert_index_equal(res, exp)

        ps1 = Series(pi1)
        ps2 = Series(pi2)
        res = ps1._append(ps2)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, ps2])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

    def test_concatlike_common_period_diff_freq_to_object(self):
        # GH 13221
        pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
        pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D")

        exp = Index(
            [
                pd.Period("2011-01", freq="M"),
                pd.Period("2011-02", freq="M"),
                pd.Period("2012-01-01", freq="D"),
                pd.Period("2012-02-01", freq="D"),
            ],
            dtype=object,
        )

        res = pi1.append(pi2)
        tm.assert_index_equal(res, exp)

        ps1 = Series(pi1)
        ps2 = Series(pi2)
        res = ps1._append(ps2)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, ps2])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

    def test_concatlike_common_period_mixed_dt_to_object(self):
        # GH 13221
        # different datetimelike
        pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
        tdi = pd.TimedeltaIndex(["1 days", "2 days"])
        exp = Index(
            [
                pd.Period("2011-01", freq="M"),
                pd.Period("2011-02", freq="M"),
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
            ],
            dtype=object,
        )

        res = pi1.append(tdi)
        tm.assert_index_equal(res, exp)

        ps1 = Series(pi1)
        tds = Series(tdi)
        res = ps1._append(tds)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, tds])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        # inverse
        exp = Index(
            [
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
                pd.Period("2011-01", freq="M"),
                pd.Period("2011-02", freq="M"),
            ],
            dtype=object,
        )

        res = tdi.append(pi1)
        tm.assert_index_equal(res, exp)

        ps1 = Series(pi1)
        tds = Series(tdi)
        res = tds._append(ps1)
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([tds, ps1])
        tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))

    def test_concat_categorical(self):
        # GH 13524

        # same categories -> category
        s1 = Series([1, 2, np.nan], dtype="category")
        s2 = Series([2, 1, 2], dtype="category")

        exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category")
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        # partially different categories => not-category
        s1 = Series([3, 2], dtype="category")
        s2 = Series([2, 1], dtype="category")

        exp = Series([3, 2, 2, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        # completely different categories (same dtype) => not-category
        s1 = Series([10, 11, np.nan], dtype="category")
        s2 = Series([np.nan, 1, 3, 2], dtype="category")

        exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64)
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

    def test_union_categorical_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/19096
        a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"]))
        b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"]))
        result = pd.concat([a, b], ignore_index=True)
        expected = Series(
            Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"])
        )
        tm.assert_series_equal(result, expected)

    def test_concat_categorical_coercion(self):
        # GH 13524

        # category + not-category => not-category
        s1 = Series([1, 2, np.nan], dtype="category")
        s2 = Series([2, 1, 2])

        exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64)
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        # result shouldn't be affected by 1st elem dtype
        exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

        # all values are not in category => not-category
        s1 = Series([3, 2], dtype="category")
        s2 = Series([2, 1])

        exp = Series([3, 2, 2, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        exp = Series([2, 1, 3, 2])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

        # completely different categories => not-category
        s1 = Series([10, 11, np.nan], dtype="category")
        s2 = Series([1, 3, 2])

        exp = Series([10, 11, np.nan, 1, 3, 2], dtype=np.float64)
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        exp = Series([1, 3, 2, 10, 11, np.nan], dtype=np.float64)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

        # different dtype => not-category
        s1 = Series([10, 11, np.nan], dtype="category")
        s2 = Series(["a", "b", "c"])

        exp = Series([10, 11, np.nan, "a", "b", "c"])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        exp = Series(["a", "b", "c", 10, 11, np.nan])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

        # if normal series only contains NaN-likes => not-category
        s1 = Series([10, 11], dtype="category")
        s2 = Series([np.nan, np.nan, np.nan])

        exp = Series([10, 11, np.nan, np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        exp = Series([np.nan, np.nan, np.nan, 10, 11])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

    def test_concat_categorical_3elem_coercion(self):
        # GH 13524

        # mixed dtypes => not-category
        s1 = Series([1, 2, np.nan], dtype="category")
        s2 = Series([2, 1, 2], dtype="category")
        s3 = Series([1, 2, 1, 2, np.nan])

        exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)

        exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)

        # values are all in either category => not-category
        s1 = Series([4, 5, 6], dtype="category")
        s2 = Series([1, 2, 3], dtype="category")
        s3 = Series([1, 3, 4])

        exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)

        exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)

        # values are all in either category => not-category
        s1 = Series([4, 5, 6], dtype="category")
        s2 = Series([1, 2, 3], dtype="category")
        s3 = Series([10, 11, 12])

        exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)

        exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)

    def test_concat_categorical_multi_coercion(self):
        # GH 13524

        s1 = Series([1, 3], dtype="category")
        s2 = Series([3, 4], dtype="category")
        s3 = Series([2, 3])
        s4 = Series([2, 2], dtype="category")
        s5 = Series([1, np.nan])
        s6 = Series([1, 3, 2], dtype="category")

        # mixed dtype, values are all in categories => not-category
        exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
        res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
        tm.assert_series_equal(res, exp)
        res = s1._append([s2, s3, s4, s5, s6], ignore_index=True)
        tm.assert_series_equal(res, exp)

        exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
        res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
        tm.assert_series_equal(res, exp)
        res = s6._append([s5, s4, s3, s2, s1], ignore_index=True)
        tm.assert_series_equal(res, exp)

    def test_concat_categorical_ordered(self):
        # GH 13524

        s1 = Series(Categorical([1, 2, np.nan], ordered=True))
        s2 = Series(Categorical([2, 1, 2], ordered=True))

        exp = Series(Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        exp = Series(Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True))
        tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s1._append([s2, s1], ignore_index=True), exp)

    def test_concat_categorical_coercion_nan(self):
        # GH 13524

        # some edge cases
        # category + not-category => not category
        s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category")
        s2 = Series([np.nan, 1])

        exp = Series([np.nan, np.nan, np.nan, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        s1 = Series([1, np.nan], dtype="category")
        s2 = Series([np.nan, np.nan])

        exp = Series([1, np.nan, np.nan, np.nan], dtype="float")
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        # mixed dtype, all nan-likes => not-category
        s1 = Series([np.nan, np.nan], dtype="category")
        s2 = Series([np.nan, np.nan])

        exp = Series([np.nan, np.nan, np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

        # all category nan-likes => category
        s1 = Series([np.nan, np.nan], dtype="category")
        s2 = Series([np.nan, np.nan], dtype="category")

        exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category")

        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

    def test_concat_categorical_empty(self):
        # GH 13524

        s1 = Series([], dtype="category")
        s2 = Series([1, 2], dtype="category")

        msg = "The behavior of array concatenation with empty entries is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
            tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
            tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)

        s1 = Series([], dtype="category")
        s2 = Series([], dtype="category")

        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)

        s1 = Series([], dtype="category")
        s2 = Series([], dtype="object")

        # different dtype => not-category
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
        tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
        tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)

        s1 = Series([], dtype="category")
        s2 = Series([np.nan, np.nan])

        # empty Series is ignored
        exp = Series([np.nan, np.nan])
        with tm.assert_produces_warning(FutureWarning, match=msg):
            tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
            tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
            tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

    def test_categorical_concat_append(self):
        cat = Categorical(["a", "b"], categories=["a", "b"])
        vals = [1, 2]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
        vals2 = [1, 2, 1, 2]
        exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1]))

        tm.assert_frame_equal(pd.concat([df, df]), exp)
        tm.assert_frame_equal(df._append(df), exp)

        # GH 13524 can concat different categories
        cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
        vals3 = [1, 2]
        df_different_categories = DataFrame({"cats": cat3, "vals": vals3})

        res = pd.concat([df, df_different_categories], ignore_index=True)
        exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]})
        tm.assert_frame_equal(res, exp)

        res = df._append(df_different_categories, ignore_index=True)
        tm.assert_frame_equal(res, exp)
@@ -0,0 +1,273 @@
from datetime import datetime

import numpy as np

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Series,
)
import pandas._testing as tm


class TestCategoricalConcat:
    def test_categorical_concat(self, sort):
        # See GH 10177
        df1 = DataFrame(
            np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
        )

        df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])

        cat_values = ["one", "one", "two", "one", "two", "two", "one"]
        df2["h"] = Series(Categorical(cat_values))

        res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
        exp = DataFrame(
            {
                "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
                "b": [
                    1,
                    4,
                    7,
                    10,
                    13,
                    16,
                    np.nan,
                    np.nan,
                    np.nan,
                    np.nan,
                    np.nan,
                    np.nan,
                    np.nan,
                ],
                "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
                "h": [None] * 6 + cat_values,
            }
        )
        exp["h"] = exp["h"].astype(df2["h"].dtype)
        tm.assert_frame_equal(res, exp)

    def test_categorical_concat_dtypes(self, using_infer_string):
        # GH8143
        index = ["cat", "obj", "num"]
        cat = Categorical(["a", "b", "c"])
        obj = Series(["a", "b", "c"])
        num = Series([1, 2, 3])
        df = pd.concat([Series(cat), obj, num], axis=1, keys=index)

        result = df.dtypes == (
            object if not using_infer_string else "string[pyarrow_numpy]"
        )
        expected = Series([False, True, False], index=index)
        tm.assert_series_equal(result, expected)

        result = df.dtypes == "int64"
        expected = Series([False, False, True], index=index)
        tm.assert_series_equal(result, expected)

        result = df.dtypes == "category"
        expected = Series([True, False, False], index=index)
        tm.assert_series_equal(result, expected)

    def test_concat_categoricalindex(self):
        # GH 16111, categories that aren't lexsorted
        categories = [9, 0, 1, 2, 3]

        a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
        b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
        c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))

        result = pd.concat([a, b, c], axis=1)

        exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
        exp = DataFrame(
            {
                0: [1, 1, np.nan, np.nan],
                1: [np.nan, 2, 2, np.nan],
                2: [np.nan, np.nan, 3, 3],
            },
            columns=[0, 1, 2],
            index=exp_idx,
        )
        tm.assert_frame_equal(result, exp)

    def test_categorical_concat_preserve(self):
        # GH 8641 series concat not preserving category dtype
        # GH 13524 can concat different categories
        s = Series(list("abc"), dtype="category")
        s2 = Series(list("abd"), dtype="category")

        exp = Series(list("abcabd"))
        res = pd.concat([s, s2], ignore_index=True)
        tm.assert_series_equal(res, exp)

        exp = Series(list("abcabc"), dtype="category")
        res = pd.concat([s, s], ignore_index=True)
        tm.assert_series_equal(res, exp)

        exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
        res = pd.concat([s, s])
        tm.assert_series_equal(res, exp)

        a = Series(np.arange(6, dtype="int64"))
        b = Series(list("aabbca"))

        df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
        res = pd.concat([df2, df2])
        exp = DataFrame(
            {
                "A": pd.concat([a, a]),
                "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
            }
        )
        tm.assert_frame_equal(res, exp)

    def test_categorical_index_preserver(self):
        a = Series(np.arange(6, dtype="int64"))
        b = Series(list("aabbca"))

        df2 = DataFrame(
            {"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
        ).set_index("B")
        result = pd.concat([df2, df2])
        expected = DataFrame(
            {
                "A": pd.concat([a, a]),
                "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
            }
        ).set_index("B")
        tm.assert_frame_equal(result, expected)

        # wrong categories -> uses concat_compat, which casts to object
        df3 = DataFrame(
            {"A": a, "B": Categorical(b, categories=list("abe"))}
        ).set_index("B")
        result = pd.concat([df2, df3])
        expected = pd.concat(
            [
                df2.set_axis(df2.index.astype(object), axis=0),
                df3.set_axis(df3.index.astype(object), axis=0),
            ]
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_categorical_tz(self):
        # GH-23816
        a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
        b = Series(["a", "b"], dtype="category")
        result = pd.concat([a, b], ignore_index=True)
        expected = Series(
            [
                pd.Timestamp("2017-01-01", tz="US/Pacific"),
                pd.Timestamp("2017-01-02", tz="US/Pacific"),
                "a",
                "b",
            ]
        )
        tm.assert_series_equal(result, expected)

    def test_concat_categorical_datetime(self):
        # GH-39443
        df1 = DataFrame(
            {"x": Series(datetime(2021, 1, 1), index=[0], dtype="category")}
        )
        df2 = DataFrame(
            {"x": Series(datetime(2021, 1, 2), index=[1], dtype="category")}
        )

        result = pd.concat([df1, df2])
        expected = DataFrame(
            {"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}
        )

        tm.assert_equal(result, expected)

    def test_concat_categorical_unchanged(self):
        # GH-12007
        # test fix for when concat on categorical and float
        # coerces dtype categorical -> float
        df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
        ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
        result = pd.concat([df, ser], axis=1)
        expected = DataFrame(
            {
                "A": Series(["a", "b", "c", np.nan], dtype="category"),
                "B": Series([0, 1, np.nan, 2], dtype="float"),
            }
        )
        tm.assert_equal(result, expected)

    def test_categorical_concat_gh7864(self):
        # GH 7864
        # make sure ordering is preserved
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
        df["grade"] = Categorical(df["raw_grade"])
        df["grade"].cat.set_categories(["e", "a", "b"])

        df1 = df[0:3]
        df2 = df[3:]

        tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
        tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)

        dfx = pd.concat([df1, df2])
        tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)

        dfa = df1._append(df2)
        tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)

    def test_categorical_index_upcast(self):
        # GH 17629
        # test upcasting to object when concatenating on categorical indexes
        # with non-identical categories

        a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
        b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))

        res = pd.concat([a, b])
        exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])

        tm.assert_equal(res, exp)

        a = Series([1, 2], index=Categorical(["foo", "bar"]))
        b = Series([4, 3], index=Categorical(["baz", "bar"]))

        res = pd.concat([a, b])
        exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])

        tm.assert_equal(res, exp)

    def test_categorical_missing_from_one_frame(self):
        # GH 25412
        df1 = DataFrame({"f1": [1, 2, 3]})
        df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
        result = pd.concat([df1, df2], sort=True)
        dtype = CategoricalDtype([4])
        expected = DataFrame(
            {
                "f1": [1, 2, 3, 2, 3, 1],
                "f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
            },
            index=[0, 1, 2, 0, 1, 2],
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_categorical_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/24845

        c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
        c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
        c3 = pd.CategoricalIndex(
            ["a", "a", "b", "b"], categories=["a", "b"], ordered=False
        )

        df1 = DataFrame({"A": [1, 2]}, index=c1)
        df2 = DataFrame({"A": [3, 4]}, index=c2)

        result = pd.concat((df1, df2))
        expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
        tm.assert_frame_equal(result, expected)
@ -0,0 +1,912 @@
|
||||
from collections import (
|
||||
abc,
|
||||
deque,
|
||||
)
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import InvalidIndexError
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import SparseArray
|
||||
from pandas.tests.extension.decimal import to_decimal
|
||||
|
||||
|
||||
class TestConcatenate:
|
||||
def test_append_concat(self):
|
||||
# GH#1815
|
||||
d1 = date_range("12/31/1990", "12/31/1999", freq="YE-DEC")
|
||||
d2 = date_range("12/31/2000", "12/31/2009", freq="YE-DEC")
|
||||
|
||||
s1 = Series(np.random.default_rng(2).standard_normal(10), d1)
|
||||
s2 = Series(np.random.default_rng(2).standard_normal(10), d2)
|
||||
|
||||
s1 = s1.to_period()
|
||||
s2 = s2.to_period()
|
||||
|
||||
# drops index
|
||||
result = concat([s1, s2])
|
||||
assert isinstance(result.index, PeriodIndex)
|
||||
assert result.index[0] == s1.index[0]
|
||||
|
||||
def test_concat_copy(self, using_array_manager, using_copy_on_write):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
|
||||
df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
|
||||
df3 = DataFrame({5: "foo"}, index=range(4))
|
||||
|
||||
# These are actual copies.
|
||||
result = concat([df, df2, df3], axis=1, copy=True)
|
||||
|
||||
if not using_copy_on_write:
|
||||
for arr in result._mgr.arrays:
|
||||
assert not any(
|
||||
np.shares_memory(arr, y)
|
||||
for x in [df, df2, df3]
|
||||
for y in x._mgr.arrays
|
||||
)
|
||||
else:
|
||||
for arr in result._mgr.arrays:
|
||||
assert arr.base is not None
|
||||
|
||||
# These are the same.
|
||||
result = concat([df, df2, df3], axis=1, copy=False)
|
||||
|
||||
for arr in result._mgr.arrays:
|
||||
if arr.dtype.kind == "f":
|
||||
assert arr.base is df._mgr.arrays[0].base
|
||||
elif arr.dtype.kind in ["i", "u"]:
|
||||
assert arr.base is df2._mgr.arrays[0].base
|
||||
elif arr.dtype == object:
|
||||
if using_array_manager:
|
||||
# we get the same array object, which has no base
|
||||
assert arr is df3._mgr.arrays[0]
|
||||
else:
|
||||
assert arr.base is not None
|
||||
|
||||
# Float block was consolidated.
|
||||
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
|
||||
result = concat([df, df2, df3, df4], axis=1, copy=False)
|
||||
for arr in result._mgr.arrays:
|
||||
if arr.dtype.kind == "f":
|
||||
if using_array_manager or using_copy_on_write:
|
||||
# this is a view on some array in either df or df4
|
||||
assert any(
|
||||
np.shares_memory(arr, other)
|
||||
for other in df._mgr.arrays + df4._mgr.arrays
|
||||
)
|
||||
else:
|
||||
# the block was consolidated, so we got a copy anyway
|
||||
assert arr.base is None
|
||||
elif arr.dtype.kind in ["i", "u"]:
|
||||
assert arr.base is df2._mgr.arrays[0].base
|
||||
elif arr.dtype == object:
|
||||
# this is a view on df3
|
||||
assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
|
||||
|
||||
def test_concat_with_group_keys(self):
|
||||
# axis=0
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((3, 4)))
|
||||
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
|
||||
|
||||
result = concat([df, df2], keys=[0, 1])
|
||||
exp_index = MultiIndex.from_arrays(
|
||||
[[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]]
|
||||
)
|
||||
expected = DataFrame(np.r_[df.values, df2.values], index=exp_index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([df, df], keys=[0, 1])
|
||||
exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
|
||||
expected = DataFrame(np.r_[df.values, df.values], index=exp_index2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# axis=1
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
|
||||
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
|
||||
|
||||
result = concat([df, df2], keys=[0, 1], axis=1)
|
||||
expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([df, df], keys=[0, 1], axis=1)
|
||||
expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_keys_specific_levels(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
|
||||
pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
|
||||
level = ["three", "two", "one", "zero"]
|
||||
result = concat(
|
||||
pieces,
|
||||
axis=1,
|
||||
keys=["one", "two", "three"],
|
||||
levels=[level],
|
||||
names=["group_key"],
|
||||
)
|
||||
|
||||
tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key"))
|
||||
tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3]))
|
||||
|
||||
assert result.columns.names == ["group_key", None]
|
||||
|
||||
    @pytest.mark.parametrize("mapping", ["mapping", "dict"])
    def test_concat_mapping(self, mapping, non_dict_mapping_subclass):
        constructor = dict if mapping == "dict" else non_dict_mapping_subclass
        frames = constructor(
            {
                "foo": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
                "bar": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
                "baz": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
                "qux": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
            }
        )

        sorted_keys = list(frames.keys())

        result = concat(frames)
        expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
        tm.assert_frame_equal(result, expected)

        result = concat(frames, axis=1)
        expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1)
        tm.assert_frame_equal(result, expected)

        keys = ["baz", "foo", "bar"]
        result = concat(frames, keys=keys)
        expected = concat([frames[k] for k in keys], keys=keys)
        tm.assert_frame_equal(result, expected)

    def test_concat_keys_and_levels(self):
        df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)))
        df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)))

        levels = [["foo", "baz"], ["one", "two"]]
        names = ["first", "second"]
        result = concat(
            [df, df2, df, df2],
            keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
            levels=levels,
            names=names,
        )
        expected = concat([df, df2, df, df2])
        exp_index = MultiIndex(
            levels=levels + [[0]],
            codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]],
            names=names + [None],
        )
        expected.index = exp_index

        tm.assert_frame_equal(result, expected)

        # no names
        result = concat(
            [df, df2, df, df2],
            keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
            levels=levels,
        )
        assert result.index.names == (None,) * 3

        # no levels
        result = concat(
            [df, df2, df, df2],
            keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
            names=["first", "second"],
        )
        assert result.index.names == ("first", "second", None)
        tm.assert_index_equal(
            result.index.levels[0], Index(["baz", "foo"], name="first")
        )

    def test_concat_keys_levels_no_overlap(self):
        # GH #1406
        df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
        df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])

        msg = "Values not found in passed level"
        with pytest.raises(ValueError, match=msg):
            concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]])

        msg = "Key one not in level"
        with pytest.raises(ValueError, match=msg):
            concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]])

    def test_crossed_dtypes_weird_corner(self):
        columns = ["A", "B", "C", "D"]
        df1 = DataFrame(
            {
                "A": np.array([1, 2, 3, 4], dtype="f8"),
                "B": np.array([1, 2, 3, 4], dtype="i8"),
                "C": np.array([1, 2, 3, 4], dtype="f8"),
                "D": np.array([1, 2, 3, 4], dtype="i8"),
            },
            columns=columns,
        )

        df2 = DataFrame(
            {
                "A": np.array([1, 2, 3, 4], dtype="i8"),
                "B": np.array([1, 2, 3, 4], dtype="f8"),
                "C": np.array([1, 2, 3, 4], dtype="i8"),
                "D": np.array([1, 2, 3, 4], dtype="f8"),
            },
            columns=columns,
        )

        appended = concat([df1, df2], ignore_index=True)
        expected = DataFrame(
            np.concatenate([df1.values, df2.values], axis=0), columns=columns
        )
        tm.assert_frame_equal(appended, expected)

        df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
        df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
        result = concat([df, df2], keys=["one", "two"], names=["first", "second"])
        assert result.index.names == ("first", "second")
    def test_with_mixed_tuples(self, sort):
        # GH 10697
        # columns have mixed tuples, so handle properly
        df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2))
        df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2))

        # it works
        concat([df1, df2], sort=sort)

    def test_concat_mixed_objs_columns(self):
        # Test column-wise concat for mixed series/frames (axis=1)
        # GH2385

        index = date_range("01-Jan-2013", periods=10, freq="h")
        arr = np.arange(10, dtype="int64")
        s1 = Series(arr, index=index)
        s2 = Series(arr, index=index)
        df = DataFrame(arr.reshape(-1, 1), index=index)

        expected = DataFrame(
            np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0]
        )
        result = concat([df, df], axis=1)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1]
        )
        result = concat([s1, s2], axis=1)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
        )
        result = concat([s1, s2, s1], axis=1)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3]
        )
        result = concat([s1, df, s2, s2, s1], axis=1)
        tm.assert_frame_equal(result, expected)

        # with names
        s1.name = "foo"
        expected = DataFrame(
            np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0]
        )
        result = concat([s1, df, s2], axis=1)
        tm.assert_frame_equal(result, expected)

        s2.name = "bar"
        expected = DataFrame(
            np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"]
        )
        result = concat([s1, df, s2], axis=1)
        tm.assert_frame_equal(result, expected)

        # ignore index
        expected = DataFrame(
            np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
        )
        result = concat([s1, df, s2], axis=1, ignore_index=True)
        tm.assert_frame_equal(result, expected)
    def test_concat_mixed_objs_index(self):
        # Test row-wise concat for mixed series/frames with a common name
        # GH2385, GH15047

        index = date_range("01-Jan-2013", periods=10, freq="h")
        arr = np.arange(10, dtype="int64")
        s1 = Series(arr, index=index)
        s2 = Series(arr, index=index)
        df = DataFrame(arr.reshape(-1, 1), index=index)

        expected = DataFrame(
            np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0]
        )
        result = concat([s1, df, s2])
        tm.assert_frame_equal(result, expected)

    def test_concat_mixed_objs_index_names(self):
        # Test row-wise concat for mixed series/frames with distinct names
        # GH2385, GH15047

        index = date_range("01-Jan-2013", periods=10, freq="h")
        arr = np.arange(10, dtype="int64")
        s1 = Series(arr, index=index, name="foo")
        s2 = Series(arr, index=index, name="bar")
        df = DataFrame(arr.reshape(-1, 1), index=index)

        expected = DataFrame(
            np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T,
            index=index.tolist() * 3,
            columns=["foo", 0, "bar"],
        )
        result = concat([s1, df, s2])
        tm.assert_frame_equal(result, expected)

        # Rename all series to 0 when ignore_index=True
        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
        result = concat([s1, df, s2], ignore_index=True)
        tm.assert_frame_equal(result, expected)
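
    # Illustrative sketch (ours): the ``np.kron`` expression above is a
    # compact way to build a block-diagonal layout, where each input fills
    # only its own column and is NaN elsewhere. A spelled-out equivalent for
    # length-2 inputs:
    def _sketch_block_diagonal_expected(self):
        arr = np.arange(2, dtype="float64")
        via_kron = np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T
        # column j holds ``arr`` in row-block j and NaN in the other blocks
        manual = np.full((6, 3), np.nan)
        for j in range(3):
            manual[2 * j : 2 * j + 2, j] = arr
        assert np.array_equal(via_kron, manual, equal_nan=True)
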
    def test_dtype_coercion(self):
        # GH 12411
        df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]})

        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)

        # GH 12045
        df = DataFrame({"date": [datetime(2012, 1, 1), datetime(1012, 1, 2)]})
        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)

        # GH 11594
        df = DataFrame({"text": ["some words"] + [None] * 9})
        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)

    def test_concat_single_with_key(self):
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))

        result = concat([df], keys=["foo"])
        expected = concat([df, df], keys=["foo", "bar"])
        tm.assert_frame_equal(result, expected[:10])

    def test_concat_no_items_raises(self):
        with pytest.raises(ValueError, match="No objects to concatenate"):
            concat([])

    def test_concat_exclude_none(self):
        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))

        pieces = [df[:5], None, None, df[5:]]
        result = concat(pieces)
        tm.assert_frame_equal(result, df)
        with pytest.raises(ValueError, match="All objects passed were None"):
            concat([None, None])

    def test_concat_keys_with_none(self):
        # GH 1649
        df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])

        result = concat({"a": None, "b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
        expected = concat({"b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
        tm.assert_frame_equal(result, expected)

        result = concat(
            [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"]
        )
        expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"])
        tm.assert_frame_equal(result, expected)
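
    # Illustrative sketch (ours): ``concat`` silently drops ``None`` entries,
    # and in the mapping form above a key whose value is ``None`` simply
    # never appears in the resulting outer index level.
    def _sketch_none_entries_are_dropped(self):
        df = DataFrame({"a": [1]})
        result = concat({"keep": df, "skip": None})
        assert list(result.index.get_level_values(0)) == ["keep"]
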
    def test_concat_bug_1719(self):
        ts1 = Series(
            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
        )
        ts2 = ts1.copy()[::2]

        # to join with union
        # these two are of different length!
        left = concat([ts1, ts2], join="outer", axis=1)
        right = concat([ts2, ts1], join="outer", axis=1)

        assert len(left) == len(right)

    def test_concat_bug_2972(self):
        ts0 = Series(np.zeros(5))
        ts1 = Series(np.ones(5))
        ts0.name = ts1.name = "same name"
        result = concat([ts0, ts1], axis=1)

        expected = DataFrame({0: ts0, 1: ts1})
        expected.columns = ["same name", "same name"]
        tm.assert_frame_equal(result, expected)

    def test_concat_bug_3602(self):
        # GH 3602, duplicate columns
        df1 = DataFrame(
            {
                "firmNo": [0, 0, 0, 0],
                "prc": [6, 6, 6, 6],
                "stringvar": ["rrr", "rrr", "rrr", "rrr"],
            }
        )
        df2 = DataFrame(
            {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]}
        )
        expected = DataFrame(
            [
                [0, 6, "rrr", 9, 1, 6],
                [0, 6, "rrr", 10, 2, 6],
                [0, 6, "rrr", 11, 3, 6],
                [0, 6, "rrr", 12, 4, 6],
            ]
        )
        expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"]

        result = concat([df1, df2], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_concat_iterables(self):
        # GH8645 check concat works with tuples, list, generators, and weird
        # stuff like deque and custom iterables
        df1 = DataFrame([1, 2, 3])
        df2 = DataFrame([4, 5, 6])
        expected = DataFrame([1, 2, 3, 4, 5, 6])
        tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
        tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
        tm.assert_frame_equal(
            concat((df for df in (df1, df2)), ignore_index=True), expected
        )
        tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)

        class CustomIterator1:
            def __len__(self) -> int:
                return 2

            def __getitem__(self, index):
                try:
                    return {0: df1, 1: df2}[index]
                except KeyError as err:
                    raise IndexError from err

        tm.assert_frame_equal(concat(CustomIterator1(), ignore_index=True), expected)

        class CustomIterator2(abc.Iterable):
            def __iter__(self) -> Iterator:
                yield df1
                yield df2

        tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected)

    def test_concat_order(self):
        # GH 17344, GH#47331
        dfs = [DataFrame(index=range(3), columns=["a", 1, None])]
        dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)]

        result = concat(dfs, sort=True).columns
        expected = Index([1, "a", None])
        tm.assert_index_equal(result, expected)
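
    # Illustrative sketch (ours): with ``sort=True`` the non-concatenation
    # axis is sorted when the inputs are not aligned; the mixed-type columns
    # above sort with ints first, then strings, with ``None`` kept last.
    def _sketch_sort_true_column_order(self):
        a = DataFrame(columns=["b", "a"])
        b = DataFrame(columns=["a", "c"])
        result = concat([a, b], sort=True)
        assert list(result.columns) == ["a", "b", "c"]
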
    def test_concat_different_extension_dtypes_upcasts(self):
        a = Series(pd.array([1, 2], dtype="Int64"))
        b = Series(to_decimal([1, 2]))

        result = concat([a, b], ignore_index=True)
        expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object)
        tm.assert_series_equal(result, expected)

    def test_concat_ordered_dict(self):
        # GH 21510
        expected = concat(
            [Series(range(3)), Series(range(4))], keys=["First", "Another"]
        )
        result = concat({"First": Series(range(3)), "Another": Series(range(4))})
        tm.assert_series_equal(result, expected)

    def test_concat_duplicate_indices_raise(self):
        # GH 45888: test raise for concat DataFrames with duplicate indices
        # https://github.com/pandas-dev/pandas/issues/36263
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal(5),
            index=[0, 1, 2, 3, 3],
            columns=["a"],
        )
        df2 = DataFrame(
            np.random.default_rng(2).standard_normal(5),
            index=[0, 1, 2, 2, 4],
            columns=["b"],
        )
        msg = "Reindexing only valid with uniquely valued Index objects"
        with pytest.raises(InvalidIndexError, match=msg):
            concat([df1, df2], axis=1)


def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series):
    # GH 13247
    dims = frame_or_series(dtype=object).ndim
    dt = float_numpy_dtype

    dfs = [
        frame_or_series(np.array([1], dtype=dt, ndmin=dims)),
        frame_or_series(np.array([np.nan], dtype=dt, ndmin=dims)),
        frame_or_series(np.array([5], dtype=dt, ndmin=dims)),
    ]
    x = concat(dfs)
    assert x.values.dtype == dt


@pytest.mark.parametrize("pdt", [Series, DataFrame])
def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype):
    dt = any_signed_int_numpy_dtype
    dims = pdt().ndim
    dfs = [
        pdt(np.array([1], dtype=dt, ndmin=dims)),
        pdt(np.array([np.nan], ndmin=dims)),
        pdt(np.array([5], dtype=dt, ndmin=dims)),
    ]
    x = concat(dfs)
    assert x.values.dtype == "float64"


def test_concat_empty_and_non_empty_frame_regression():
    # GH 18178 regression test
    df1 = DataFrame({"foo": [1]})
    df2 = DataFrame({"foo": []})
    expected = DataFrame({"foo": [1.0]})
    result = concat([df1, df2])
    tm.assert_frame_equal(result, expected)


def test_concat_sparse():
    # GH 23557
    a = Series(SparseArray([0, 1, 2]))
    expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
        pd.SparseDtype(np.int64, 0)
    )
    result = concat([a, a], axis=1)
    tm.assert_frame_equal(result, expected)


def test_concat_dense_sparse():
    # GH 30668
    dtype = pd.SparseDtype(np.float64, None)
    a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
    b = Series([1], dtype=float)
    expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
    result = concat([a, b], axis=0)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]])
def test_duplicate_keys(keys):
    # GH 33654
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    s1 = Series([7, 8, 9], name="c")
    s2 = Series([10, 11, 12], name="d")
    result = concat([df, s1, s2], axis=1, keys=keys)
    expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
    expected_columns = MultiIndex.from_tuples(
        [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")]
    )
    expected = DataFrame(expected_values, columns=expected_columns)
    tm.assert_frame_equal(result, expected)


def test_duplicate_keys_same_frame():
    # GH 43595
    keys = ["e", "e"]
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = concat([df, df], axis=1, keys=keys)
    expected_values = [[1, 4, 1, 4], [2, 5, 2, 5], [3, 6, 3, 6]]
    expected_columns = MultiIndex.from_tuples(
        [(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")]
    )
    expected = DataFrame(expected_values, columns=expected_columns)
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
def test_concat_preserves_subclass(obj):
    # GH28330 -- preserve subclass

    result = concat([obj, obj])
    assert isinstance(result, type(obj))


def test_concat_frame_axis0_extension_dtypes():
    # preserve extension dtype (through common_dtype mechanism)
    df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
    df2 = DataFrame({"a": np.array([4, 5, 6])})

    result = concat([df1, df2], ignore_index=True)
    expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)

    result = concat([df2, df1], ignore_index=True)
    expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)


def test_concat_preserves_extension_int64_dtype():
    # GH 24768
    df_a = DataFrame({"a": [-1]}, dtype="Int64")
    df_b = DataFrame({"b": [1]}, dtype="Int64")
    result = concat([df_a, df_b], ignore_index=True)
    expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype1,dtype2,expected_dtype",
    [
        ("bool", "bool", "bool"),
        ("boolean", "bool", "boolean"),
        ("bool", "boolean", "boolean"),
        ("boolean", "boolean", "boolean"),
    ],
)
def test_concat_bool_types(dtype1, dtype2, expected_dtype):
    # GH 42800
    ser1 = Series([True, False], dtype=dtype1)
    ser2 = Series([False, True], dtype=dtype2)
    result = concat([ser1, ser2], ignore_index=True)
    expected = Series([True, False, False, True], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
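

# Illustrative sketch (ours): mixing the NumPy ``bool`` dtype with the
# nullable ``boolean`` extension dtype promotes to the nullable dtype, as
# the parametrized cases above assert.
def _sketch_bool_boolean_promotion():
    mixed = concat(
        [Series([True], dtype="bool"), Series([False], dtype="boolean")],
        ignore_index=True,
    )
    assert mixed.dtype == "boolean"

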
@pytest.mark.parametrize(
    ("keys", "integrity"),
    [
        (["red"] * 3, True),
        (["red"] * 3, False),
        (["red", "blue", "red"], False),
        (["red", "blue", "red"], True),
    ],
)
def test_concat_repeated_keys(keys, integrity):
    # GH 20816
    series_list = [Series({"a": 1}), Series({"b": 2}), Series({"c": 3})]
    result = concat(series_list, keys=keys, verify_integrity=integrity)
    tuples = list(zip(keys, ["a", "b", "c"]))
    expected = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples))
    tm.assert_series_equal(result, expected)


def test_concat_null_object_with_dti():
    # GH#40841
    dti = pd.DatetimeIndex(
        ["2021-04-08 21:21:14+00:00"], dtype="datetime64[ns, UTC]", name="Time (UTC)"
    )
    right = DataFrame(data={"C": [0.5274]}, index=dti)

    idx = Index([None], dtype="object", name="Maybe Time (UTC)")
    left = DataFrame(data={"A": [None], "B": [np.nan]}, index=idx)

    result = concat([left, right], axis="columns")

    exp_index = Index([None, dti[0]], dtype=object)
    expected = DataFrame(
        {
            "A": np.array([None, np.nan], dtype=object),
            "B": [np.nan, np.nan],
            "C": [np.nan, 0.5274],
        },
        index=exp_index,
    )
    tm.assert_frame_equal(result, expected)


def test_concat_multiindex_with_empty_rangeindex():
    # GH#41234
    mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
    df1 = DataFrame([[1, 2]], columns=mi)
    df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))

    result = concat([df1, df2])
    expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        Series(data=[1, 2]),
        DataFrame(data={"col1": [1, 2]}),
        DataFrame(dtype=float),
        Series(dtype=float),
    ],
)
def test_concat_drop_attrs(data):
    # GH#41828
    df1 = data.copy()
    df1.attrs = {1: 1}
    df2 = data.copy()
    df2.attrs = {1: 2}
    df = concat([df1, df2])
    assert len(df.attrs) == 0


@pytest.mark.parametrize(
    "data",
    [
        Series(data=[1, 2]),
        DataFrame(data={"col1": [1, 2]}),
        DataFrame(dtype=float),
        Series(dtype=float),
    ],
)
def test_concat_retain_attrs(data):
    # GH#41828
    df1 = data.copy()
    df1.attrs = {1: 1}
    df2 = data.copy()
    df2.attrs = {1: 1}
    df = concat([df1, df2])
    assert df.attrs[1] == 1


@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
    # https://github.com/pandas-dev/pandas/issues/45637
    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
    empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    warn = None
    if df_dtype == "datetime64[ns]" or (
        df_dtype == "float64" and empty_dtype != "float64"
    ):
        warn = FutureWarning
    with tm.assert_produces_warning(warn, match=msg):
        result = concat([empty, df])
    expected = df
    if df_dtype == "int64":
        # TODO what exact behaviour do we want for integer eventually?
        if empty_dtype == "float64":
            expected = df.astype("float64")
        else:
            expected = df.astype("object")
    tm.assert_frame_equal(result, expected)
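

# Illustrative sketch (ours): the FutureWarning exercised above concerns
# empty or all-NA inputs influencing the result dtype; dropping empties up
# front is the forward-compatible spelling.
def _sketch_skip_empty_inputs():
    frames = [DataFrame({"a": [1.0]}), DataFrame(columns=["a"], dtype="object")]
    non_empty = [f for f in frames if not f.empty]
    result = concat(non_empty) if non_empty else DataFrame(columns=["a"])
    assert result["a"].dtype == "float64"

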
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
    empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)

    if df_dtype == "int64":
        # TODO what exact behaviour do we want for integer eventually?
        if empty_dtype == "object":
            df_dtype = "object"
        else:
            df_dtype = "float64"

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    warn = None
    if empty_dtype != df_dtype and empty_dtype is not None:
        warn = FutureWarning
    elif df_dtype == "datetime64[ns]":
        warn = FutureWarning

    with tm.assert_produces_warning(warn, match=msg):
        result = concat([empty, df], ignore_index=True)

    expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
    tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test
def test_concat_ignore_empty_from_reindex():
    # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
    df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
    df2 = DataFrame({"a": [2]})

    aligned = df2.reindex(columns=df1.columns)

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = concat([df1, aligned], ignore_index=True)
    expected = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
    tm.assert_frame_equal(result, expected)
def test_concat_mismatched_keys_length():
    # GH#43485
    ser = Series(range(5))
    sers = [ser + n for n in range(4)]
    keys = ["A", "B", "C"]

    msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        concat(sers, keys=keys, axis=1)
    with tm.assert_produces_warning(FutureWarning, match=msg):
        concat(sers, keys=keys, axis=0)
    with tm.assert_produces_warning(FutureWarning, match=msg):
        concat((x for x in sers), keys=(y for y in keys), axis=1)
    with tm.assert_produces_warning(FutureWarning, match=msg):
        concat((x for x in sers), keys=(y for y in keys), axis=0)


def test_concat_multiindex_with_category():
    df1 = DataFrame(
        {
            "c1": Series(list("abc"), dtype="category"),
            "c2": Series(list("eee"), dtype="category"),
            "i2": Series([1, 2, 3]),
        }
    )
    df1 = df1.set_index(["c1", "c2"])
    df2 = DataFrame(
        {
            "c1": Series(list("abc"), dtype="category"),
            "c2": Series(list("eee"), dtype="category"),
            "i2": Series([4, 5, 6]),
        }
    )
    df2 = df2.set_index(["c1", "c2"])
    result = concat([df1, df2])
    expected = DataFrame(
        {
            "c1": Series(list("abcabc"), dtype="category"),
            "c2": Series(list("eeeeee"), dtype="category"),
            "i2": Series([1, 2, 3, 4, 5, 6]),
        }
    )
    expected = expected.set_index(["c1", "c2"])
    tm.assert_frame_equal(result, expected)


def test_concat_ea_upcast():
    # GH#54848
    df1 = DataFrame(["a"], dtype="string")
    df2 = DataFrame([1], dtype="Int64")
    result = concat([df1, df2])
    expected = DataFrame(["a", 1], index=[0, 0])
    tm.assert_frame_equal(result, expected)


def test_concat_none_with_timezone_timestamp():
    # GH#52093
    df1 = DataFrame([{"A": None}])
    df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = concat([df1, df2], ignore_index=True)
    expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,230 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    concat,
)
import pandas._testing as tm


class TestDataFrameConcat:
    def test_concat_multiple_frames_dtypes(self):
        # GH#2759
        df1 = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
        df2 = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
        results = concat((df1, df2), axis=1).dtypes
        expected = Series(
            [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
            index=["foo", "bar", 0, 1],
        )
        tm.assert_series_equal(results, expected)

    def test_concat_tuple_keys(self):
        # GH#14438
        df1 = DataFrame(np.ones((2, 2)), columns=list("AB"))
        df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
        results = concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
        expected = DataFrame(
            {
                "A": {
                    ("bee", "bah", 0): 1.0,
                    ("bee", "bah", 1): 1.0,
                    ("bee", "boo", 0): 2.0,
                    ("bee", "boo", 1): 2.0,
                    ("bee", "boo", 2): 2.0,
                },
                "B": {
                    ("bee", "bah", 0): 1.0,
                    ("bee", "bah", 1): 1.0,
                    ("bee", "boo", 0): 2.0,
                    ("bee", "boo", 1): 2.0,
                    ("bee", "boo", 2): 2.0,
                },
            }
        )
        tm.assert_frame_equal(results, expected)

    def test_concat_named_keys(self):
        # GH#14252
        df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
        index = Index(["a", "b"], name="baz")
        concatted_named_from_keys = concat([df, df], keys=index)
        expected_named = DataFrame(
            {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
            index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
        )
        tm.assert_frame_equal(concatted_named_from_keys, expected_named)

        index_no_name = Index(["a", "b"], name=None)
        concatted_named_from_names = concat([df, df], keys=index_no_name, names=["baz"])
        tm.assert_frame_equal(concatted_named_from_names, expected_named)

        concatted_unnamed = concat([df, df], keys=index_no_name)
        expected_unnamed = DataFrame(
            {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
            index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
        )
        tm.assert_frame_equal(concatted_unnamed, expected_unnamed)

    def test_concat_axis_parameter(self):
        # GH#14369
        df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2))
        df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2))

        # Index/row/0 DataFrame
        expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])

        concatted_index = concat([df1, df2], axis="index")
        tm.assert_frame_equal(concatted_index, expected_index)

        concatted_row = concat([df1, df2], axis="rows")
        tm.assert_frame_equal(concatted_row, expected_index)

        concatted_0 = concat([df1, df2], axis=0)
        tm.assert_frame_equal(concatted_0, expected_index)

        # Columns/1 DataFrame
        expected_columns = DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
        )

        concatted_columns = concat([df1, df2], axis="columns")
        tm.assert_frame_equal(concatted_columns, expected_columns)

        concatted_1 = concat([df1, df2], axis=1)
        tm.assert_frame_equal(concatted_1, expected_columns)

        series1 = Series([0.1, 0.2])
        series2 = Series([0.3, 0.4])

        # Index/row/0 Series
        expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])

        concatted_index_series = concat([series1, series2], axis="index")
        tm.assert_series_equal(concatted_index_series, expected_index_series)

        concatted_row_series = concat([series1, series2], axis="rows")
        tm.assert_series_equal(concatted_row_series, expected_index_series)

        concatted_0_series = concat([series1, series2], axis=0)
        tm.assert_series_equal(concatted_0_series, expected_index_series)

        # Columns/1 Series
        expected_columns_series = DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
        )

        concatted_columns_series = concat([series1, series2], axis="columns")
        tm.assert_frame_equal(concatted_columns_series, expected_columns_series)

        concatted_1_series = concat([series1, series2], axis=1)
        tm.assert_frame_equal(concatted_1_series, expected_columns_series)

        # Testing ValueError
        with pytest.raises(ValueError, match="No axis named"):
            concat([series1, series2], axis="something")
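
    # Illustrative sketch (ours): "index"/"rows"/0 and "columns"/1 are
    # interchangeable axis spellings, so the three row-wise results above are
    # identical frames.
    def _sketch_axis_aliases_agree(self):
        df = DataFrame({"A": [1, 2]})
        by_name = concat([df, df], axis="index")
        by_number = concat([df, df], axis=0)
        tm.assert_frame_equal(by_name, by_number)
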
    def test_concat_numerical_names(self):
        # GH#15262, GH#12223
        df = DataFrame(
            {"col": range(9)},
            dtype="int32",
            index=(
                pd.MultiIndex.from_product(
                    [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
                )
            ),
        )
        result = concat((df.iloc[:2, :], df.iloc[-2:, :]))
        expected = DataFrame(
            {"col": [0, 1, 7, 8]},
            dtype="int32",
            index=pd.MultiIndex.from_tuples(
                [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_astype_dup_col(self):
        # GH#23049
        df = DataFrame([{"a": "b"}])
        df = concat([df, df], axis=1)

        result = df.astype("category")
        expected = DataFrame(
            np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
        ).astype("category")
        tm.assert_frame_equal(result, expected)

    def test_concat_dataframe_keys_bug(self, sort):
        t1 = DataFrame(
            {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))}
        )
        t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))})

        # it works
        result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
        assert list(result.columns) == [("t1", "value"), ("t2", "value")]

    def test_concat_bool_with_int(self):
        # GH#42092 we may want to change this to return object, but that
        # would need a deprecation
        df1 = DataFrame(Series([True, False, True, True], dtype="bool"))
        df2 = DataFrame(Series([1, 0, 1], dtype="int64"))

        result = concat([df1, df2])
        expected = concat([df1.astype("int64"), df2])
        tm.assert_frame_equal(result, expected)

    def test_concat_duplicates_in_index_with_keys(self):
        # GH#42651
        index = [1, 1, 3]
        data = [1, 2, 3]

        df = DataFrame(data=data, index=index)
        result = concat([df], keys=["A"], names=["ID", "date"])
        mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
        expected = DataFrame(data=data, index=mi)
        tm.assert_frame_equal(result, expected)
        tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))

    @pytest.mark.parametrize("ignore_index", [True, False])
    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
        # based on asv ConcatDataFrames
        df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order))

        res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)

        if not using_copy_on_write:
            for arr in res._iter_column_arrays():
                for arr2 in df._iter_column_arrays():
                    assert not np.shares_memory(arr, arr2)

    def test_outer_sort_columns(self):
        # GH#47127
        df1 = DataFrame({"A": [0], "B": [1], 0: 1})
        df2 = DataFrame({"A": [100]})
        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
        expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
        tm.assert_frame_equal(result, expected)

    def test_inner_sort_columns(self):
        # GH#47127
        df1 = DataFrame({"A": [0], "B": [1], 0: 1})
        df2 = DataFrame({"A": [100], 0: 2})
        result = concat([df1, df2], ignore_index=True, join="inner", sort=True)
        expected = DataFrame({0: [1, 2], "A": [0, 100]})
        tm.assert_frame_equal(result, expected)

    def test_sort_columns_one_df(self):
        # GH#47127
        df1 = DataFrame({"A": [100], 0: 2})
        result = concat([df1], ignore_index=True, join="inner", sort=True)
        expected = DataFrame({0: [2], "A": [100]})
        tm.assert_frame_equal(result, expected)
@ -0,0 +1,606 @@
import datetime as dt
from datetime import datetime

import dateutil
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    concat,
    date_range,
    to_timedelta,
)
import pandas._testing as tm


class TestDatetimeConcat:
    def test_concat_datetime64_block(self):
        rng = date_range("1/1/2000", periods=10)

        df = DataFrame({"time": rng})

        result = concat([df, df])
        assert (result.iloc[:10]["time"] == rng).all()
        assert (result.iloc[10:]["time"] == rng).all()

    def test_concat_datetime_datetime64_frame(self):
        # GH#2624
        rows = []
        rows.append([datetime(2010, 1, 1), 1])
        rows.append([datetime(2010, 1, 2), "hi"])

        df2_obj = DataFrame.from_records(rows, columns=["date", "test"])

        ind = date_range(start="2000/1/1", freq="D", periods=10)
        df1 = DataFrame({"date": ind, "test": range(10)})

        # it works!
        concat([df1, df2_obj])

    def test_concat_datetime_timezone(self):
        # GH 18523
        idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris")
        idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h")
        df1 = DataFrame({"a": [1, 2, 3]}, index=idx1)
        df2 = DataFrame({"b": [1, 2, 3]}, index=idx2)
        result = concat([df1, df2], axis=1)

        exp_idx = DatetimeIndex(
            [
                "2011-01-01 00:00:00+01:00",
                "2011-01-01 01:00:00+01:00",
                "2011-01-01 02:00:00+01:00",
            ],
            dtype="M8[ns, Europe/Paris]",
            freq="h",
        )
        expected = DataFrame(
            [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"]
        )

        tm.assert_frame_equal(result, expected)

        idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo")
        df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
        result = concat([df1, df3], axis=1)

        exp_idx = DatetimeIndex(
            [
                "2010-12-31 15:00:00+00:00",
                "2010-12-31 16:00:00+00:00",
                "2010-12-31 17:00:00+00:00",
                "2010-12-31 23:00:00+00:00",
                "2011-01-01 00:00:00+00:00",
                "2011-01-01 01:00:00+00:00",
            ]
        ).as_unit("ns")

        expected = DataFrame(
            [
                [np.nan, 1],
                [np.nan, 2],
                [np.nan, 3],
                [1, np.nan],
                [2, np.nan],
                [3, np.nan],
            ],
            index=exp_idx,
            columns=["a", "b"],
        )

        tm.assert_frame_equal(result, expected)

        # GH 13783: Concat after resample
        result = concat([df1.resample("h").mean(), df2.resample("h").mean()], sort=True)
        expected = DataFrame(
            {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
            index=idx1.append(idx1),
        )
        tm.assert_frame_equal(result, expected)
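
    # Illustrative sketch (ours): tz-aware indexes align by instant, so equal
    # instants share rows while shifted instants (the Tokyo case above) union
    # into a longer index.
    def _sketch_tz_alignment(self):
        idx = date_range("2011-01-01", periods=2, freq="h", tz="UTC")
        left = DataFrame({"a": [1, 2]}, index=idx)
        right = DataFrame({"b": [3, 4]}, index=idx.shift(1))
        result = concat([left, right], axis=1)
        # one hour of overlap -> three distinct instants in the union
        assert result.shape == (3, 2)
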
    def test_concat_datetimeindex_freq(self):
        # GH 3232
        # Monotonic index result
        dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC")
        data = list(range(100))
        expected = DataFrame(data, index=dr)
        result = concat([expected[:50], expected[50:]])
        tm.assert_frame_equal(result, expected)

        # Non-monotonic index result
        result = concat([expected[50:], expected[:50]])
        expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
        expected.index._data.freq = None
        tm.assert_frame_equal(result, expected)

    def test_concat_multiindex_datetime_object_index(self):
        # https://github.com/pandas-dev/pandas/issues/11058
        idx = Index(
            [dt.date(2013, 1, 1), dt.date(2014, 1, 1), dt.date(2015, 1, 1)],
            dtype="object",
        )

        s = Series(
            ["a", "b"],
            index=MultiIndex.from_arrays(
                [
                    [1, 2],
                    idx[:-1],
                ],
                names=["first", "second"],
            ),
        )
        s2 = Series(
            ["a", "b"],
            index=MultiIndex.from_arrays(
                [[1, 2], idx[::2]],
                names=["first", "second"],
            ),
        )
        mi = MultiIndex.from_arrays(
            [[1, 2, 2], idx],
            names=["first", "second"],
        )
        assert mi.levels[1].dtype == object

        expected = DataFrame(
            [["a", "a"], ["b", np.nan], [np.nan, "b"]],
            index=mi,
        )
        result = concat([s, s2], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_concat_NaT_series(self):
        # GH 11693
        # test for merging NaT series with datetime series.
        x = Series(
            date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern")
        )
        y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
        expected = Series([x[0], x[1], pd.NaT, pd.NaT])

        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT with tz
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
        result = concat([y, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_NaT_series2(self):
        # without tz
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h"))
        y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h"))
        y[:] = pd.NaT
        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT without tz
        x[:] = pd.NaT
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_concat_NaT_dataframes(self, tz):
        # GH 12396

        dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz)
        first = DataFrame({0: dti})
        second = DataFrame(
            [[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]],
            index=[2, 3],
        )
        expected = DataFrame(
            [
                pd.NaT,
                pd.NaT,
                Timestamp("2015/01/01", tz=tz),
                Timestamp("2016/01/01", tz=tz),
            ]
        )

        result = concat([first, second], axis=0)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")])
    def test_concat_NaT_dataframes_all_NaT_axis_0(
        self, tz1, tz2, item, using_array_manager
    ):
        # GH 12396

        # tz-naive
        first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1))
        second = DataFrame([item]).apply(lambda x: x.dt.tz_localize(tz2))

        result = concat([first, second], axis=0)
        expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0]))
        expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
        if tz1 != tz2:
            expected = expected.astype(object)
            if item is pd.NaT and not using_array_manager:
                # GH#18463
                # TODO: setting nan here is to keep the test passing as we
                # make assert_frame_equal stricter, but is nan really the
                # ideal behavior here?
                if tz1 is not None:
                    expected.iloc[-1, 0] = np.nan
                else:
                    expected.iloc[:-1, 0] = np.nan

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
        # GH 12396

        first = DataFrame(Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
        second = DataFrame(Series([pd.NaT]).dt.tz_localize(tz2), columns=[1])
        expected = DataFrame(
            {
                0: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
                1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2),
            }
        )
        result = concat([first, second], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
        # GH 12396

        # tz-naive
        first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
        second = DataFrame(
            [
                [Timestamp("2015/01/01", tz=tz2)],
                [Timestamp("2016/01/01", tz=tz2)],
            ],
            index=[2, 3],
        )

        expected = DataFrame(
            [
                pd.NaT,
                pd.NaT,
                Timestamp("2015/01/01", tz=tz2),
                Timestamp("2016/01/01", tz=tz2),
            ]
        )
        if tz1 != tz2:
            expected = expected.astype(object)

        result = concat([first, second])
        tm.assert_frame_equal(result, expected)


class TestTimezoneConcat:
    def test_concat_tz_series(self):
        # gh-11755: tz and no tz
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
        y = Series(date_range("2012-01-01", "2012-01-02"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series2(self):
        # gh-11887: concat tz and object
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
        y = Series(["a", "b"])
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series3(self, unit, unit2):
        # see gh-12217 and gh-12306
        # Concatenating two UTC times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("UTC")

        second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("UTC")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, UTC]"

    def test_concat_tz_series4(self, unit, unit2):
        # Concatenating two London times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series5(self, unit, unit2):
        # Concatenating 2+1 London times
        first = DataFrame(
            [[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]"
        )
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]")
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series6(self, unit, unit2):
        # Concatenating 1+2 London times
        first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
        first[0] = first[0].dt.tz_localize("Europe/London")

        second = DataFrame(
            [[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]"
        )
        second[0] = second[0].dt.tz_localize("Europe/London")

        result = concat([first, second])
        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series_tzlocal(self):
        # see gh-13583
        x = [
            Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()),
            Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()),
        ]
        y = [
            Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()),
            Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()),
        ]

        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y))
        assert result.dtype == "datetime64[ns, tzlocal()]"

    def test_concat_tz_series_with_datetimelike(self):
        # see gh-12620: tz and timedelta
        x = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-02-01", tz="US/Eastern"),
        ]
        y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")]
        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y, dtype="object"))

        # tz and period
        y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")]
        result = concat([Series(x), Series(y)], ignore_index=True)
        tm.assert_series_equal(result, Series(x + y, dtype="object"))

    def test_concat_tz_frame(self):
        df2 = DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        )

        # concat
        df3 = concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        tm.assert_frame_equal(df2, df3)

    def test_concat_multiple_tzs(self):
        # GH#12467
        # combining datetime tz-aware and naive DataFrames
        ts1 = Timestamp("2015-01-01", tz=None)
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="EST")

        df1 = DataFrame({"time": [ts1]})
        df2 = DataFrame({"time": [ts2]})
        df3 = DataFrame({"time": [ts3]})

        results = concat([df1, df2]).reset_index(drop=True)
        expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
        tm.assert_frame_equal(results, expected)

        results = concat([df1, df3]).reset_index(drop=True)
        expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
        tm.assert_frame_equal(results, expected)

        results = concat([df2, df3]).reset_index(drop=True)
        expected = DataFrame({"time": [ts2, ts3]})
        tm.assert_frame_equal(results, expected)
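
    # Illustrative sketch (ours): row-wise concat of naive and tz-aware
    # timestamp columns cannot keep a single datetime64 dtype, so the result
    # falls back to object, as asserted above.
    def _sketch_mixed_tz_falls_back_to_object(self):
        naive = DataFrame({"time": [Timestamp("2015-01-01")]})
        aware = DataFrame({"time": [Timestamp("2015-01-01", tz="UTC")]})
        result = concat([naive, aware], ignore_index=True)
        assert result["time"].dtype == object
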
    def test_concat_multiindex_with_tz(self):
        # GH 6606
        df = DataFrame(
            {
                "dt": DatetimeIndex(
                    [
                        datetime(2014, 1, 1),
                        datetime(2014, 1, 2),
                        datetime(2014, 1, 3),
                    ],
                    dtype="M8[ns, US/Pacific]",
                ),
                "b": ["A", "B", "C"],
                "c": [1, 2, 3],
                "d": [4, 5, 6],
            }
        )
        df = df.set_index(["dt", "b"])

        exp_idx1 = DatetimeIndex(
            ["2014-01-01", "2014-01-02", "2014-01-03"] * 2,
            dtype="M8[ns, US/Pacific]",
            name="dt",
        )
        exp_idx2 = Index(["A", "B", "C"] * 2, name="b")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"]
        )

        result = concat([df, df])
        tm.assert_frame_equal(result, expected)

    def test_concat_tz_not_aligned(self):
        # GH#22796
        ts = pd.to_datetime([1, 2]).tz_localize("UTC")
        a = DataFrame({"A": ts})
        b = DataFrame({"A": ts, "B": ts})
        result = concat([a, b], sort=True, ignore_index=True)
        expected = DataFrame(
            {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "t1",
        [
            "2015-01-01",
            pytest.param(
                pd.NaT,
                marks=pytest.mark.xfail(
                    reason="GH23037 incorrect dtype when concatenating"
                ),
            ),
        ],
    )
    def test_concat_tz_NaT(self, t1):
        # GH#22796
        # Concatenating tz-aware multicolumn DataFrames
        ts1 = Timestamp(t1, tz="UTC")
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="UTC")

        df1 = DataFrame([[ts1, ts2]])
        df2 = DataFrame([[ts3]])

        result = concat([df1, df2])
        expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])

        tm.assert_frame_equal(result, expected)

    def test_concat_tz_with_empty(self):
        # GH 9188
        result = concat(
            [DataFrame(date_range("2000", periods=1, tz="UTC")), DataFrame()]
        )
        expected = DataFrame(date_range("2000", periods=1, tz="UTC"))
        tm.assert_frame_equal(result, expected)


class TestPeriodConcat:
    def test_concat_period_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_period_multiple_freq_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M"))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series2(self):
        # non-period
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(DatetimeIndex(["2015-11-01", "2015-12-01"]))
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"

    def test_concat_period_other_series3(self):
        x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
        y = Series(["A", "B"])
        expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "object"


def test_concat_timedelta64_block():
    rng = to_timedelta(np.arange(10), unit="s")

    df = DataFrame({"time": rng})

    result = concat([df, df])
    tm.assert_frame_equal(result.iloc[:10], df)
    tm.assert_frame_equal(result.iloc[10:], df)


def test_concat_multiindex_datetime_nat():
    # GH#44900
    left = DataFrame({"a": 1}, index=MultiIndex.from_tuples([(1, pd.NaT)]))
    right = DataFrame(
        {"b": 2}, index=MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)])
    )
    result = concat([left, right], axis="columns")
    expected = DataFrame(
        {"a": [1.0, np.nan], "b": 2}, MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)])
    )
    tm.assert_frame_equal(result, expected)


def test_concat_float_datetime64(using_array_manager):
    # GH#32934
    df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
    df_float = DataFrame({"A": pd.array([1.0], dtype="float64")})

    expected = DataFrame(
        {
            "A": [
                pd.array(["2000"], dtype="datetime64[ns]")[0],
                pd.array([1.0], dtype="float64")[0],
            ]
        },
        index=[0, 0],
    )
    result = concat([df_time, df_float])
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"A": pd.array([], dtype="object")})
    result = concat([df_time.iloc[:0], df_float.iloc[:0]])
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"A": pd.array([1.0], dtype="object")})
    result = concat([df_time.iloc[:0], df_float])
    tm.assert_frame_equal(result, expected)

    if not using_array_manager:
        expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
        msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = concat([df_time, df_float.iloc[:0]])
        tm.assert_frame_equal(result, expected)
    else:
        expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
            {"A": "object"}
        )
        result = concat([df_time, df_float.iloc[:0]])
        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,295 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
RangeIndex,
|
||||
Series,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestEmptyConcat:
|
||||
def test_handle_empty_objects(self, sort, using_infer_string):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
|
||||
)
|
||||
|
||||
dfcopy = df[:5].copy()
|
||||
dfcopy["foo"] = "bar"
|
||||
empty = df[5:5]
|
||||
|
||||
frames = [dfcopy, empty, empty, df[5:]]
|
||||
concatted = concat(frames, axis=0, sort=sort)
|
||||
|
||||
expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
|
||||
expected["foo"] = expected["foo"].astype(
|
||||
object if not using_infer_string else "string[pyarrow_numpy]"
|
||||
)
|
||||
expected.loc[0:4, "foo"] = "bar"
|
||||
|
||||
tm.assert_frame_equal(concatted, expected)
|
||||
|
||||
# empty as first element with time series
|
||||
# GH3259
|
||||
df = DataFrame(
|
||||
{"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
|
||||
)
|
||||
empty = DataFrame()
|
||||
result = concat([df, empty], axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
result = concat([empty, df], axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = concat([df, empty])
|
||||
tm.assert_frame_equal(result, df)
|
||||
result = concat([empty, df])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
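# Minimal sketch of the round-trip asserted above, with a made-up frame: a
# DataFrame with no rows and no columns contributes nothing, so concat with
# it returns an equal frame in either argument order.
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
assert pd.concat([df, pd.DataFrame()]).equals(df)
assert pd.concat([pd.DataFrame(), df]).equals(df)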
def test_concat_empty_series(self):
|
||||
# GH 11082
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name="y", dtype="float64")
|
||||
res = concat([s1, s2], axis=1)
|
||||
exp = DataFrame(
|
||||
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
|
||||
index=RangeIndex(3),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name="y", dtype="float64")
|
||||
msg = "The behavior of array concatenation with empty entries is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res = concat([s1, s2], axis=0)
|
||||
# name will be reset
|
||||
exp = Series([1, 2, 3])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# empty Series with no name
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name=None, dtype="float64")
|
||||
res = concat([s1, s2], axis=1)
|
||||
exp = DataFrame(
|
||||
{"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
|
||||
columns=["x", 0],
|
||||
index=RangeIndex(3),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
@pytest.mark.parametrize("values", [[], [1, 2, 3]])
|
||||
def test_concat_empty_series_timelike(self, tz, values):
|
||||
# GH 18447
|
||||
|
||||
first = Series([], dtype="M8[ns]").dt.tz_localize(tz)
|
||||
dtype = None if values else np.float64
|
||||
second = Series(values, dtype=dtype)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
|
||||
1: values,
|
||||
}
|
||||
)
|
||||
result = concat([first, second], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left,right,expected",
|
||||
[
|
||||
# booleans
|
||||
(np.bool_, np.int32, np.object_), # changed from int32 in 2.0 GH#39817
|
||||
(np.bool_, np.float32, np.object_),
|
||||
# datetime-like
|
||||
("m8[ns]", np.bool_, np.object_),
|
||||
("m8[ns]", np.int64, np.object_),
|
||||
("M8[ns]", np.bool_, np.object_),
|
||||
("M8[ns]", np.int64, np.object_),
|
||||
# categorical
|
||||
("category", "category", "category"),
|
||||
("category", "object", "object"),
|
||||
],
|
||||
)
|
||||
def test_concat_empty_series_dtypes(self, left, right, expected):
|
||||
# GH#39817, GH#45101
|
||||
result = concat([Series(dtype=left), Series(dtype=right)])
|
||||
assert result.dtype == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
|
||||
)
|
||||
def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
|
||||
dtype = np.dtype(dtype)
|
||||
|
||||
result = concat([Series(dtype=dtype)])
|
||||
assert result.dtype == dtype
|
||||
|
||||
result = concat([Series(dtype=dtype), Series(dtype=dtype)])
|
||||
assert result.dtype == dtype
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype2",
|
||||
["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
|
||||
)
|
||||
def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
|
||||
# round-tripping with self & like self
|
||||
if dtype == dtype2:
|
||||
pytest.skip("same dtype is not applicable for test")
|
||||
|
||||
def int_result_type(dtype, dtype2):
|
||||
typs = {dtype.kind, dtype2.kind}
|
||||
if not len(typs - {"i", "u", "b"}) and (
|
||||
dtype.kind == "i" or dtype2.kind == "i"
|
||||
):
|
||||
return "i"
|
||||
elif not len(typs - {"u", "b"}) and (
|
||||
dtype.kind == "u" or dtype2.kind == "u"
|
||||
):
|
||||
return "u"
|
||||
return None
|
||||
|
||||
def float_result_type(dtype, dtype2):
|
||||
typs = {dtype.kind, dtype2.kind}
|
||||
if not len(typs - {"f", "i", "u"}) and (
|
||||
dtype.kind == "f" or dtype2.kind == "f"
|
||||
):
|
||||
return "f"
|
||||
return None
|
||||
|
||||
def get_result_type(dtype, dtype2):
|
||||
result = float_result_type(dtype, dtype2)
|
||||
if result is not None:
|
||||
return result
|
||||
result = int_result_type(dtype, dtype2)
|
||||
if result is not None:
|
||||
return result
|
||||
return "O"
|
||||
|
||||
dtype = np.dtype(dtype)
|
||||
dtype2 = np.dtype(dtype2)
|
||||
expected = get_result_type(dtype, dtype2)
|
||||
result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
|
||||
assert result.kind == expected
|
||||
|
||||
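# Worked instances of the promotion rules encoded by get_result_type above
# (assuming plain numpy promotion applies to the numeric empties): float
# beats int/uint, signed int beats unsigned/bool, and anything involving a
# datetime-like dtype degrades to object.
import pandas as pd

assert pd.concat([pd.Series(dtype="int8"), pd.Series(dtype="float64")]).dtype.kind == "f"
assert pd.concat([pd.Series(dtype="int8"), pd.Series(dtype="uint8")]).dtype.kind == "i"
assert pd.concat([pd.Series(dtype="M8[ns]"), pd.Series(dtype="int64")]).dtype.kind == "O"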
def test_concat_empty_series_dtypes_triple(self):
|
||||
assert (
|
||||
concat(
|
||||
[Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
|
||||
).dtype
|
||||
== np.object_
|
||||
)
|
||||
|
||||
def test_concat_empty_series_dtype_category_with_array(self):
|
||||
# GH#18515
|
||||
assert (
|
||||
concat(
|
||||
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
|
||||
).dtype
|
||||
== "float64"
|
||||
)
|
||||
|
||||
def test_concat_empty_series_dtypes_sparse(self):
|
||||
result = concat(
|
||||
[
|
||||
Series(dtype="float64").astype("Sparse"),
|
||||
Series(dtype="float64").astype("Sparse"),
|
||||
]
|
||||
)
|
||||
assert result.dtype == "Sparse[float64]"
|
||||
|
||||
result = concat(
|
||||
[Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
|
||||
)
|
||||
expected = pd.SparseDtype(np.float64)
|
||||
assert result.dtype == expected
|
||||
|
||||
result = concat(
|
||||
[Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
|
||||
)
|
||||
expected = pd.SparseDtype("object")
|
||||
assert result.dtype == expected
|
||||
|
||||
def test_concat_empty_df_object_dtype(self):
|
||||
# GH 9149
|
||||
df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
|
||||
df_2 = DataFrame(columns=df_1.columns)
|
||||
result = concat([df_1, df_2], axis=0)
|
||||
expected = df_1.astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_empty_dataframe_dtypes(self):
|
||||
df = DataFrame(columns=list("abc"))
|
||||
df["a"] = df["a"].astype(np.bool_)
|
||||
df["b"] = df["b"].astype(np.int32)
|
||||
df["c"] = df["c"].astype(np.float64)
|
||||
|
||||
result = concat([df, df])
|
||||
assert result["a"].dtype == np.bool_
|
||||
assert result["b"].dtype == np.int32
|
||||
assert result["c"].dtype == np.float64
|
||||
|
||||
result = concat([df, df.astype(np.float64)])
|
||||
assert result["a"].dtype == np.object_
|
||||
assert result["b"].dtype == np.float64
|
||||
assert result["c"].dtype == np.float64
|
||||
|
||||
def test_concat_inner_join_empty(self):
|
||||
# GH 15328
|
||||
df_empty = DataFrame()
|
||||
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
|
||||
df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
|
||||
|
||||
result = concat([df_a, df_empty], axis=1, join="inner")
|
||||
tm.assert_frame_equal(result, df_expected)
|
||||
|
||||
result = concat([df_a, df_empty], axis=1, join="outer")
|
||||
tm.assert_frame_equal(result, df_a)
|
||||
|
||||
def test_empty_dtype_coerce(self):
|
||||
# xref to #12411
|
||||
# xref to #12045
|
||||
# xref to #11594
|
||||
# see below
|
||||
|
||||
# 10571
|
||||
df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
|
||||
df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
|
||||
result = concat([df1, df2])
|
||||
expected = df1.dtypes
|
||||
tm.assert_series_equal(result.dtypes, expected)
|
||||
|
||||
def test_concat_empty_dataframe(self):
|
||||
# 39037
|
||||
df1 = DataFrame(columns=["a", "b"])
|
||||
df2 = DataFrame(columns=["b", "c"])
|
||||
result = concat([df1, df2, df1])
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df3 = DataFrame(columns=["a", "b"])
|
||||
df4 = DataFrame(columns=["b"])
|
||||
result = concat([df3, df4])
|
||||
expected = DataFrame(columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
|
||||
# 39037
|
||||
df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
|
||||
df2 = DataFrame({"a": [1, 2, 3]})
|
||||
|
||||
result = concat([df1[:0], df2[:0]])
|
||||
assert result["a"].dtype == np.int64
|
||||
assert result["b"].dtype == np.object_ if not using_infer_string else "string"
|
||||
|
||||
def test_concat_to_empty_ea(self):
|
||||
"""48510 `concat` to an empty EA should maintain type EA dtype."""
|
||||
df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
|
||||
df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
|
||||
expected = df_new.copy()
|
||||
result = concat([df_empty, df_new])
|
||||
tm.assert_frame_equal(result, expected)
|
@@ -0,0 +1,472 @@
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import PerformanceWarning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestIndexConcat:
|
||||
def test_concat_ignore_index(self, sort):
|
||||
frame1 = DataFrame(
|
||||
{"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
|
||||
)
|
||||
frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
|
||||
frame1.index = Index(["x", "y", "z"])
|
||||
frame2.index = Index(["x", "y", "q"])
|
||||
|
||||
v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
|
||||
|
||||
nan = np.nan
|
||||
expected = DataFrame(
|
||||
[
|
||||
[nan, nan, nan, 4.3],
|
||||
["a", 1, 4.5, 5.2],
|
||||
["b", 2, 3.2, 2.2],
|
||||
["c", 3, 1.2, nan],
|
||||
],
|
||||
index=Index(["q", "x", "y", "z"]),
|
||||
)
|
||||
if not sort:
|
||||
expected = expected.loc[["x", "y", "z", "q"]]
|
||||
|
||||
tm.assert_frame_equal(v1, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name_in1,name_in2,name_in3,name_out",
|
||||
[
|
||||
("idx", "idx", "idx", "idx"),
|
||||
("idx", "idx", None, None),
|
||||
("idx", None, None, None),
|
||||
("idx1", "idx2", None, None),
|
||||
("idx1", "idx1", "idx2", None),
|
||||
("idx1", "idx2", "idx3", None),
|
||||
(None, None, None, None),
|
||||
],
|
||||
)
|
||||
def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
|
||||
# GH13475
|
||||
indices = [
|
||||
Index(["a", "b", "c"], name=name_in1),
|
||||
Index(["b", "c", "d"], name=name_in2),
|
||||
Index(["c", "d", "e"], name=name_in3),
|
||||
]
|
||||
frames = [
|
||||
DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
|
||||
]
|
||||
result = concat(frames, axis=1)
|
||||
|
||||
exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"x": [0, 1, 2, np.nan, np.nan],
|
||||
"y": [np.nan, 0, 1, 2, np.nan],
|
||||
"z": [np.nan, np.nan, 0, 1, 2],
|
||||
},
|
||||
index=exp_ind,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_rename_index(self):
|
||||
a = DataFrame(
|
||||
np.random.default_rng(2).random((3, 3)),
|
||||
columns=list("ABC"),
|
||||
index=Index(list("abc"), name="index_a"),
|
||||
)
|
||||
b = DataFrame(
|
||||
np.random.default_rng(2).random((3, 3)),
|
||||
columns=list("ABC"),
|
||||
index=Index(list("abc"), name="index_b"),
|
||||
)
|
||||
|
||||
result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
|
||||
|
||||
exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
|
||||
names = list(exp.index.names)
|
||||
names[1] = "lvl1"
|
||||
exp.index.set_names(names, inplace=True)
|
||||
|
||||
tm.assert_frame_equal(result, exp)
|
||||
assert result.index.names == exp.index.names
|
||||
|
||||
def test_concat_copy_index_series(self, axis, using_copy_on_write):
|
||||
# GH 29879
|
||||
ser = Series([1, 2])
|
||||
comb = concat([ser, ser], axis=axis, copy=True)
|
||||
if not using_copy_on_write or axis in [0, "index"]:
|
||||
assert comb.index is not ser.index
|
||||
else:
|
||||
assert comb.index is ser.index
|
||||
|
||||
def test_concat_copy_index_frame(self, axis, using_copy_on_write):
|
||||
# GH 29879
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
|
||||
comb = concat([df, df], axis=axis, copy=True)
|
||||
if not using_copy_on_write:
|
||||
assert not comb.index.is_(df.index)
|
||||
assert not comb.columns.is_(df.columns)
|
||||
elif axis in [0, "index"]:
|
||||
assert not comb.index.is_(df.index)
|
||||
assert comb.columns.is_(df.columns)
|
||||
elif axis in [1, "columns"]:
|
||||
assert comb.index.is_(df.index)
|
||||
assert not comb.columns.is_(df.columns)
|
||||
|
||||
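# Illustrative companion to the copy-semantics assertions above. The outcome
# depends on whether Copy-on-Write is enabled, so this sketch only inspects
# rather than asserting a particular result.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
comb = pd.concat([df, df], axis=1, copy=True)
print(comb.index.is_(df.index))  # True under CoW for axis=1 concat, else False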
def test_default_index(self):
|
||||
# is_series and ignore_index
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series([4, 5, 6], name="y")
|
||||
res = concat([s1, s2], axis=1, ignore_index=True)
|
||||
assert isinstance(res.columns, pd.RangeIndex)
|
||||
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
|
||||
# use check_index_type=True to check that the result has a
|
||||
# RangeIndex (default index)
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
# is_series and all inputs have no names
|
||||
s1 = Series([1, 2, 3])
|
||||
s2 = Series([4, 5, 6])
|
||||
res = concat([s1, s2], axis=1, ignore_index=False)
|
||||
assert isinstance(res.columns, pd.RangeIndex)
|
||||
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
|
||||
exp.columns = pd.RangeIndex(2)
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
# is_dataframe and ignore_index
|
||||
df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
|
||||
df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
|
||||
|
||||
res = concat([df1, df2], axis=0, ignore_index=True)
|
||||
exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
res = concat([df1, df2], axis=1, ignore_index=True)
|
||||
exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
def test_dups_index(self):
|
||||
# GH 4771
|
||||
|
||||
# single dtypes
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).integers(0, 10, size=40).reshape(10, 4),
|
||||
columns=["A", "A", "C", "C"],
|
||||
)
|
||||
|
||||
result = concat([df, df], axis=1)
|
||||
tm.assert_frame_equal(result.iloc[:, :4], df)
|
||||
tm.assert_frame_equal(result.iloc[:, 4:], df)
|
||||
|
||||
result = concat([df, df], axis=0)
|
||||
tm.assert_frame_equal(result.iloc[:10], df)
|
||||
tm.assert_frame_equal(result.iloc[10:], df)
|
||||
|
||||
# multi dtypes
|
||||
df = concat(
|
||||
[
|
||||
DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=["A", "A", "B", "B"],
|
||||
),
|
||||
DataFrame(
|
||||
np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
|
||||
columns=["A", "C"],
|
||||
),
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
result = concat([df, df], axis=1)
|
||||
tm.assert_frame_equal(result.iloc[:, :6], df)
|
||||
tm.assert_frame_equal(result.iloc[:, 6:], df)
|
||||
|
||||
result = concat([df, df], axis=0)
|
||||
tm.assert_frame_equal(result.iloc[:10], df)
|
||||
tm.assert_frame_equal(result.iloc[10:], df)
|
||||
|
||||
# append
|
||||
result = df.iloc[0:8, :]._append(df.iloc[8:])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
expected = concat([df, df], axis=0)
|
||||
result = df._append(df)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
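# Sketch of the duplicate-label behavior above, with a hypothetical frame:
# repeated column labels pass through concat untouched, so positional iloc
# slicing is the reliable way to recover each original block.
import pandas as pd

dup = pd.DataFrame([[1, 2]], columns=["A", "A"])
wide = pd.concat([dup, dup], axis=1)
assert list(wide.columns) == ["A", "A", "A", "A"]
assert wide.iloc[:, :2].equals(dup)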
class TestMultiIndexConcat:
|
||||
def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
|
||||
frame = multiindex_dataframe_random_data
|
||||
index = frame.index
|
||||
result = concat([frame, frame], keys=[0, 1], names=["iteration"])
|
||||
|
||||
assert result.index.names == ("iteration",) + index.names
|
||||
tm.assert_frame_equal(result.loc[0], frame)
|
||||
tm.assert_frame_equal(result.loc[1], frame)
|
||||
assert result.index.nlevels == 3
|
||||
|
||||
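# Quick sketch of the keys/names mechanics tested above, using a made-up
# frame: `keys` prepends an index level, `names` labels the levels, and
# .loc on a key recovers the corresponding input.
import pandas as pd

a = pd.DataFrame({"v": [1, 2]})
both = pd.concat([a, a], keys=["first", "second"], names=["src", None])
assert both.index.nlevels == 2
assert both.loc["first"].equals(a)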
def test_concat_multiindex_with_none_in_index_names(self):
|
||||
# GH 15787
|
||||
index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
|
||||
df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
|
||||
|
||||
result = concat([df, df], keys=[1, 2], names=["level2"])
|
||||
index = MultiIndex.from_product(
|
||||
[[1, 2], [1], range(5)], names=["level2", "level1", None]
|
||||
)
|
||||
expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
|
||||
level2 = [1] * 5 + [2] * 2
|
||||
level1 = [1] * 7
|
||||
no_name = list(range(5)) + list(range(2))
|
||||
tuples = list(zip(level2, level1, no_name))
|
||||
index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
|
||||
expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_multiindex_rangeindex(self):
|
||||
# GH13542
|
||||
# when multi-index levels are RangeIndex objects
|
||||
# there is a bug in concat with objects of len 1
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((9, 2)))
|
||||
df.index = MultiIndex(
|
||||
levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
|
||||
codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
|
||||
)
|
||||
|
||||
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
|
||||
exp = df.iloc[[2, 3, 4, 5], :]
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_concat_multiindex_dfs_with_deepcopy(self):
|
||||
# GH 9967
|
||||
example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
|
||||
example_dataframe1 = DataFrame([0], index=example_multiindex1)
|
||||
|
||||
example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
|
||||
example_dataframe2 = DataFrame([1], index=example_multiindex2)
|
||||
|
||||
example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
|
||||
expected_index = MultiIndex(
|
||||
levels=[["s1", "s2"], ["a"], ["b", "c"]],
|
||||
codes=[[0, 1], [0, 0], [0, 1]],
|
||||
names=["testname", None, None],
|
||||
)
|
||||
expected = DataFrame([[0], [1]], index=expected_index)
|
||||
result_copy = concat(deepcopy(example_dict), names=["testname"])
|
||||
tm.assert_frame_equal(result_copy, expected)
|
||||
result_no_copy = concat(example_dict, names=["testname"])
|
||||
tm.assert_frame_equal(result_no_copy, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"mi1_list",
|
||||
[
|
||||
[["a"], range(2)],
|
||||
[["b"], np.arange(2.0, 4.0)],
|
||||
[["c"], ["A", "B"]],
|
||||
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"mi2_list",
|
||||
[
|
||||
[["a"], range(2)],
|
||||
[["b"], np.arange(2.0, 4.0)],
|
||||
[["c"], ["A", "B"]],
|
||||
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
|
||||
],
|
||||
)
|
||||
def test_concat_with_various_multiindex_dtypes(
|
||||
self, mi1_list: list, mi2_list: list
|
||||
):
|
||||
# GitHub #23478
|
||||
mi1 = MultiIndex.from_product(mi1_list)
|
||||
mi2 = MultiIndex.from_product(mi2_list)
|
||||
|
||||
df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
|
||||
df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
|
||||
|
||||
if mi1_list[0] == mi2_list[0]:
|
||||
expected_mi = MultiIndex(
|
||||
levels=[mi1_list[0], list(mi1_list[1])],
|
||||
codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
|
||||
)
|
||||
else:
|
||||
expected_mi = MultiIndex(
|
||||
levels=[
|
||||
mi1_list[0] + mi2_list[0],
|
||||
list(mi1_list[1]) + list(mi2_list[1]),
|
||||
],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
|
||||
)
|
||||
|
||||
expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
result_df = concat((df1, df2), axis=1)
|
||||
|
||||
tm.assert_frame_equal(expected_df, result_df)
|
||||
|
||||
def test_concat_multiindex_(self):
|
||||
# GitHub #44786
|
||||
df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
|
||||
df = concat([df], keys=["X"])
|
||||
|
||||
iterables = [["X"], ["1", "2", "2"]]
|
||||
result_index = df.index
|
||||
expected_index = MultiIndex.from_product(iterables)
|
||||
|
||||
tm.assert_index_equal(result_index, expected_index)
|
||||
|
||||
result_df = df
|
||||
expected_df = DataFrame(
|
||||
{"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
|
||||
)
|
||||
tm.assert_frame_equal(result_df, expected_df)
|
||||
|
||||
def test_concat_with_key_not_unique(self):
|
||||
# GitHub #46519
|
||||
df1 = DataFrame({"name": [1]})
|
||||
df2 = DataFrame({"name": [2]})
|
||||
df3 = DataFrame({"name": [3]})
|
||||
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
|
||||
# the warning is caused by indexing unsorted multi-index
|
||||
with tm.assert_produces_warning(
|
||||
PerformanceWarning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_a = df_a.loc[("x", 0), :]
|
||||
|
||||
df_b = DataFrame(
|
||||
{"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
|
||||
)
|
||||
with tm.assert_produces_warning(
|
||||
PerformanceWarning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_b = df_b.loc[("x", 0)]
|
||||
|
||||
tm.assert_frame_equal(out_a, out_b)
|
||||
|
||||
df1 = DataFrame({"name": ["a", "a", "b"]})
|
||||
df2 = DataFrame({"name": ["a", "b"]})
|
||||
df3 = DataFrame({"name": ["c", "d"]})
|
||||
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
|
||||
with tm.assert_produces_warning(
|
||||
PerformanceWarning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_a = df_a.loc[("x", 0), :]
|
||||
|
||||
df_b = DataFrame(
|
||||
{
|
||||
"a": ["x", "x", "x", "y", "y", "x", "x"],
|
||||
"b": [0, 1, 2, 0, 1, 0, 1],
|
||||
"name": list("aababcd"),
|
||||
}
|
||||
).set_index(["a", "b"])
|
||||
df_b.index.names = [None, None]
|
||||
with tm.assert_produces_warning(
|
||||
PerformanceWarning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_b = df_b.loc[("x", 0), :]
|
||||
|
||||
tm.assert_frame_equal(out_a, out_b)
|
||||
|
||||
def test_concat_with_duplicated_levels(self):
|
||||
# keyword levels should be unique
|
||||
df1 = DataFrame({"A": [1]}, index=["x"])
|
||||
df2 = DataFrame({"A": [1]}, index=["y"])
|
||||
msg = r"Level values not unique: \['x', 'y', 'y'\]"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
|
||||
|
||||
@pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
|
||||
def test_concat_with_levels_with_none_keys(self, levels):
|
||||
df1 = DataFrame({"A": [1]}, index=["x"])
|
||||
df2 = DataFrame({"A": [1]}, index=["y"])
|
||||
msg = "levels supported only when keys is not None"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
concat([df1, df2], levels=levels)
|
||||
|
||||
def test_concat_range_index_result(self):
|
||||
# GH#47501
|
||||
df1 = DataFrame({"a": [1, 2]})
|
||||
df2 = DataFrame({"b": [1, 2]})
|
||||
|
||||
result = concat([df1, df2], sort=True, axis=1)
|
||||
expected = DataFrame({"a": [1, 2], "b": [1, 2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
expected_index = pd.RangeIndex(0, 2)
|
||||
tm.assert_index_equal(result.index, expected_index, exact=True)
|
||||
|
||||
def test_concat_index_keep_dtype(self):
|
||||
# GH#47329
|
||||
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype="object"))
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="object"))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
|
||||
# GH#47329
|
||||
df1 = DataFrame(
|
||||
[[0, 1, 1]], columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype)
|
||||
)
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=any_numeric_ea_dtype))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]],
|
||||
columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
|
||||
def test_concat_index_find_common(self, dtype):
|
||||
# GH#47329
|
||||
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string):
|
||||
# GH 46675
|
||||
s1 = Series(["a", "b", "c"])
|
||||
s2 = Series(["a", "b"])
|
||||
s3 = Series(["a", "b", "c", "d"])
|
||||
s4 = Series(
|
||||
[], dtype=object if not using_infer_string else "string[pyarrow_numpy]"
|
||||
)
|
||||
result = concat(
|
||||
[s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
["a"] * 3 + [np.nan],
|
||||
["b"] * 3 + [np.nan],
|
||||
["c", np.nan] * 2,
|
||||
[np.nan] * 2 + ["d"] + [np.nan],
|
||||
],
|
||||
dtype=object if not using_infer_string else "string[pyarrow_numpy]",
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
result, expected, check_index_type=True, check_column_type=True
|
||||
)
|
@@ -0,0 +1,54 @@
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
read_csv,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestInvalidConcat:
|
||||
@pytest.mark.parametrize("obj", [1, {}, [1, 2], (1, 2)])
|
||||
def test_concat_invalid(self, obj):
|
||||
# trying to concat a ndframe with a non-ndframe
|
||||
df1 = DataFrame(range(2))
|
||||
msg = (
|
||||
f"cannot concatenate object of type '{type(obj)}'; "
|
||||
"only Series and DataFrame objs are valid"
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
concat([df1, obj])
|
||||
|
||||
def test_concat_invalid_first_argument(self):
|
||||
df1 = DataFrame(range(2))
|
||||
msg = (
|
||||
"first argument must be an iterable of pandas "
|
||||
'objects, you passed an object of type "DataFrame"'
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
concat(df1)
|
||||
|
||||
def test_concat_generator_obj(self):
|
||||
# generator ok though
|
||||
concat(DataFrame(np.random.default_rng(2).random((5, 5))) for _ in range(3))
|
||||
|
||||
def test_concat_textreader_obj(self):
|
||||
# text reader ok
|
||||
# GH6583
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
with read_csv(StringIO(data), chunksize=1) as reader:
|
||||
result = concat(reader, ignore_index=True)
|
||||
expected = read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
@@ -0,0 +1,175 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestSeriesConcat:
|
||||
def test_concat_series(self):
|
||||
ts = Series(
|
||||
np.arange(20, dtype=np.float64),
|
||||
index=date_range("2020-01-01", periods=20),
|
||||
name="foo",
|
||||
)
|
||||
|
||||
pieces = [ts[:5], ts[5:15], ts[15:]]
|
||||
|
||||
result = concat(pieces)
|
||||
tm.assert_series_equal(result, ts)
|
||||
assert result.name == ts.name
|
||||
|
||||
result = concat(pieces, keys=[0, 1, 2])
|
||||
expected = ts.copy()
|
||||
|
||||
ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]"))
|
||||
|
||||
exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))]
|
||||
exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes)
|
||||
expected.index = exp_index
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_empty_and_non_empty_series_regression(self):
|
||||
# GH 18187 regression test
|
||||
s1 = Series([1])
|
||||
s2 = Series([], dtype=object)
|
||||
|
||||
expected = s1
|
||||
msg = "The behavior of array concatenation with empty entries is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = concat([s1, s2])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1(self):
|
||||
ts = Series(
|
||||
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
||||
)
|
||||
|
||||
pieces = [ts[:-2], ts[2:], ts[2:-2]]
|
||||
|
||||
result = concat(pieces, axis=1)
|
||||
expected = DataFrame(pieces).T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(pieces, keys=["A", "B", "C"], axis=1)
|
||||
expected = DataFrame(pieces, index=["A", "B", "C"]).T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_preserves_series_names(self):
|
||||
# preserve series names, #2489
|
||||
s = Series(np.random.default_rng(2).standard_normal(5), name="A")
|
||||
s2 = Series(np.random.default_rng(2).standard_normal(5), name="B")
|
||||
|
||||
result = concat([s, s2], axis=1)
|
||||
expected = DataFrame({"A": s, "B": s2})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
s2.name = None
|
||||
result = concat([s, s2], axis=1)
|
||||
tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object"))
|
||||
|
||||
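# Sketch of the naming rule exercised above, on standalone data: along
# axis=1 each Series name becomes a column label, and an unnamed input gets
# a positional integer instead.
import pandas as pd

named = pd.Series([1, 2], name="A")
unnamed = pd.Series([3, 4])
wide = pd.concat([named, unnamed], axis=1)
assert list(wide.columns) == ["A", 0]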
def test_concat_series_axis1_with_reindex(self, sort):
|
||||
# must reindex, #2603
|
||||
s = Series(
|
||||
np.random.default_rng(2).standard_normal(3), index=["c", "a", "b"], name="A"
|
||||
)
|
||||
s2 = Series(
|
||||
np.random.default_rng(2).standard_normal(4),
|
||||
index=["d", "a", "b", "c"],
|
||||
name="B",
|
||||
)
|
||||
result = concat([s, s2], axis=1, sort=sort)
|
||||
expected = DataFrame({"A": s, "B": s2}, index=["c", "a", "b", "d"])
|
||||
if sort:
|
||||
expected = expected.sort_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_names_applied(self):
|
||||
# ensure names argument is not ignored on axis=1, #23490
|
||||
s = Series([1, 2, 3])
|
||||
s2 = Series([4, 5, 6])
|
||||
result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"])
|
||||
expected = DataFrame(
|
||||
[[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"])
|
||||
expected = DataFrame(
|
||||
[[1, 4], [2, 5], [3, 6]],
|
||||
columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_same_names_ignore_index(self):
|
||||
dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1]
|
||||
s1 = Series(
|
||||
np.random.default_rng(2).standard_normal(len(dates)),
|
||||
index=dates,
|
||||
name="value",
|
||||
)
|
||||
s2 = Series(
|
||||
np.random.default_rng(2).standard_normal(len(dates)),
|
||||
index=dates,
|
||||
name="value",
|
||||
)
|
||||
|
||||
result = concat([s1, s2], axis=1, ignore_index=True)
|
||||
expected = Index(range(2))
|
||||
|
||||
tm.assert_index_equal(result.columns, expected, exact=True)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]
|
||||
)
|
||||
def test_concat_series_name_npscalar_tuple(self, s1name, s2name):
|
||||
# GH21015
|
||||
s1 = Series({"a": 1, "b": 2}, name=s1name)
|
||||
s2 = Series({"c": 5, "d": 6}, name=s2name)
|
||||
result = concat([s1, s2])
|
||||
expected = Series({"a": 1, "b": 2, "c": 5, "d": 6})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_series_partial_columns_names(self):
|
||||
# GH10698
|
||||
named_series = Series([1, 2], name="foo")
|
||||
unnamed_series1 = Series([1, 2])
|
||||
unnamed_series2 = Series([4, 5])
|
||||
|
||||
result = concat([named_series, unnamed_series1, unnamed_series2], axis=1)
|
||||
expected = DataFrame(
|
||||
{"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(
|
||||
[named_series, unnamed_series1, unnamed_series2],
|
||||
axis=1,
|
||||
keys=["red", "blue", "yellow"],
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]},
|
||||
columns=["red", "blue", "yellow"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(
|
||||
[named_series, unnamed_series1, unnamed_series2], axis=1, ignore_index=True
|
||||
)
|
||||
expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_length_one_reversed(self, frame_or_series):
|
||||
# GH39401
|
||||
obj = frame_or_series([100])
|
||||
result = concat([obj.iloc[::-1]])
|
||||
tm.assert_equal(result, obj)
|
@@ -0,0 +1,118 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestConcatSort:
|
||||
def test_concat_sorts_columns(self, sort):
|
||||
# GH-4588
|
||||
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
|
||||
df2 = DataFrame({"a": [3, 4], "c": [5, 6]})
|
||||
|
||||
# for sort=True/None
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]},
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
|
||||
if sort is False:
|
||||
expected = expected[["b", "a", "c"]]
|
||||
|
||||
# default
|
||||
with tm.assert_produces_warning(None):
|
||||
result = pd.concat([df1, df2], ignore_index=True, sort=sort)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
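# Minimal demo of the `sort` keyword on the non-concatenation axis, with
# hypothetical frames rather than the fixtures above: sort=False keeps
# first-seen column order, sort=True sorts the labels.
import pandas as pd

d1 = pd.DataFrame({"b": [1], "a": [2]})
d2 = pd.DataFrame({"c": [3]})
assert list(pd.concat([d1, d2], sort=False).columns) == ["b", "a", "c"]
assert list(pd.concat([d1, d2], sort=True).columns) == ["a", "b", "c"]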
def test_concat_sorts_index(self, sort):
|
||||
df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"])
|
||||
df2 = DataFrame({"b": [1, 2]}, index=["a", "b"])
|
||||
|
||||
# For True/None
|
||||
expected = DataFrame(
|
||||
{"a": [2, 3, 1], "b": [1, 2, None]},
|
||||
index=["a", "b", "c"],
|
||||
columns=["a", "b"],
|
||||
)
|
||||
if sort is False:
|
||||
expected = expected.loc[["c", "a", "b"]]
|
||||
|
||||
# sorts by default; should not warn
|
||||
with tm.assert_produces_warning(None):
|
||||
result = pd.concat([df1, df2], axis=1, sort=sort)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_inner_sort(self, sort):
|
||||
# https://github.com/pandas-dev/pandas/pull/20613
|
||||
df1 = DataFrame(
|
||||
{"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]
|
||||
)
|
||||
df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4])
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
# unset sort should *not* warn for inner join
|
||||
# since that never sorted
|
||||
result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True)
|
||||
|
||||
expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"])
|
||||
if sort is True:
|
||||
expected = expected[["a", "b"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_aligned_sort(self):
|
||||
# GH-4588
|
||||
df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"])
|
||||
result = pd.concat([df, df], sort=True, ignore_index=True)
|
||||
expected = DataFrame(
|
||||
{"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]},
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = pd.concat(
|
||||
[df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True
|
||||
)
|
||||
expected = expected[["b", "c"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_aligned_sort_does_not_raise(self):
|
||||
# GH-4588
|
||||
# We catch TypeErrors from sorting internally and do not re-raise.
|
||||
df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"])
|
||||
expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"])
|
||||
result = pd.concat([df, df], ignore_index=True, sort=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_frame_with_sort_false(self):
|
||||
# GH 43375
|
||||
result = pd.concat(
|
||||
[DataFrame({i: i}, index=[i]) for i in range(2, 0, -1)], sort=False
|
||||
)
|
||||
expected = DataFrame([[2, np.nan], [np.nan, 1]], index=[2, 1], columns=[2, 1])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH 37937
|
||||
df1 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
|
||||
df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=[3, 1, 6])
|
||||
result = pd.concat([df2, df1], axis=1, sort=False)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[7.0, 10.0, 3.0, 6.0],
|
||||
[8.0, 11.0, 1.0, 4.0],
|
||||
[9.0, 12.0, np.nan, np.nan],
|
||||
[np.nan, np.nan, 2.0, 5.0],
|
||||
],
|
||||
index=[3, 1, 6, 2],
|
||||
columns=["c", "d", "a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_sort_none_raises(self):
|
||||
# GH#41518
|
||||
df = DataFrame({1: [1, 2], "a": [3, 4]})
|
||||
msg = "The 'sort' keyword only accepts boolean values; None was passed."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
pd.concat([df, df], sort=None)
|
1101
lib/python3.13/site-packages/pandas/tests/reshape/merge/test_join.py
File diff suppressed because it is too large
@@ -0,0 +1,111 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.merge import (
|
||||
MergeError,
|
||||
merge,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
|
||||
)
|
||||
def test_merge_cross(input_col, output_cols):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({input_col: [3, 4]})
|
||||
left_copy = left.copy()
|
||||
right_copy = right.copy()
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(left, left_copy)
|
||||
tm.assert_frame_equal(right, right_copy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"left_index": True},
|
||||
{"right_index": True},
|
||||
{"on": "a"},
|
||||
{"left_on": "a"},
|
||||
{"right_on": "b"},
|
||||
],
|
||||
)
|
||||
def test_merge_cross_error_reporting(kwargs):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({"b": [3, 4]})
|
||||
msg = (
|
||||
"Can not pass on, right_on, left_on or set right_index=True or "
|
||||
"left_index=True"
|
||||
)
|
||||
with pytest.raises(MergeError, match=msg):
|
||||
merge(left, right, how="cross", **kwargs)
|
||||
|
||||
|
||||
def test_merge_cross_mixed_dtypes():
|
||||
# GH#5401
|
||||
left = DataFrame(["a", "b", "c"], columns=["A"])
|
||||
right = DataFrame(range(2), columns=["B"])
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_more_than_one_column():
|
||||
# GH#5401
|
||||
left = DataFrame({"A": list("ab"), "B": [2, 1]})
|
||||
right = DataFrame({"C": range(2), "D": range(4, 6)})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": ["a", "a", "b", "b"],
|
||||
"B": [2, 2, 1, 1],
|
||||
"C": [0, 1, 0, 1],
|
||||
"D": [4, 5, 4, 5],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
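# Cross-merge sanity sketch with made-up frames: how="cross" builds the
# cartesian product, so the result has len(left) * len(right) rows and
# takes no key arguments at all.
import pandas as pd

lf = pd.DataFrame({"x": [1, 2, 3]})
rf = pd.DataFrame({"y": ["a", "b"]})
assert len(pd.merge(lf, rf, how="cross")) == 6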
def test_merge_cross_null_values(nulls_fixture):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, nulls_fixture]})
|
||||
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, nulls_fixture, nulls_fixture],
|
||||
"b": ["a", "b", "a", "b"],
|
||||
"c": [1.0, 2.0, 1.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_cross_error_reporting():
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({"a": [3, 4]})
|
||||
msg = (
|
||||
"Can not pass on, right_on, left_on or set right_index=True or "
|
||||
"left_index=True"
|
||||
)
|
||||
with pytest.raises(MergeError, match=msg):
|
||||
left.join(right, how="cross", on="a")
|
||||
|
||||
|
||||
def test_merge_cross_series():
|
||||
# GH#54055
|
||||
ls = Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")
|
||||
rs = Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")
|
||||
res = merge(ls, rs, how="cross")
|
||||
|
||||
expected = merge(ls.to_frame(), rs.to_frame(), how="cross")
|
||||
tm.assert_frame_equal(res, expected)
|
@@ -0,0 +1,186 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df1():
|
||||
return DataFrame(
|
||||
{
|
||||
"outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
|
||||
"inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
|
||||
"v1": np.linspace(0, 1, 11),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df2():
|
||||
return DataFrame(
|
||||
{
|
||||
"outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
|
||||
"inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
|
||||
"v2": np.linspace(10, 11, 12),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
|
||||
def left_df(request, df1):
|
||||
"""Construct left test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v1')
|
||||
"""
|
||||
levels = request.param
|
||||
if levels:
|
||||
df1 = df1.set_index(levels)
|
||||
|
||||
return df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
|
||||
def right_df(request, df2):
|
||||
"""Construct right test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v2')
|
||||
"""
|
||||
levels = request.param
|
||||
|
||||
if levels:
|
||||
df2 = df2.set_index(levels)
|
||||
|
||||
return df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
|
||||
"""
|
||||
Compute the expected merge result for the test case.
|
||||
|
||||
This function computes the expected result of merging two DataFrames on
|
||||
a combination of their columns and index levels. It does so by
|
||||
explicitly dropping/resetting their named index levels, performing a
|
||||
merge on their columns, and then finally restoring the appropriate
|
||||
index in the result.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_left : DataFrame
|
||||
The left DataFrame (may have zero or more named index levels)
|
||||
df_right : DataFrame
|
||||
The right DataFrame (may have zero or more named index levels)
|
||||
on : list of str
|
||||
The on parameter to the merge operation
|
||||
left_on : list of str
|
||||
The left_on parameter to the merge operation
|
||||
right_on : list of str
|
||||
The right_on parameter to the merge operation
|
||||
how : str
|
||||
The how parameter to the merge operation
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The expected merge result
|
||||
"""
|
||||
# Handle on param if specified
|
||||
if on is not None:
|
||||
left_on, right_on = on, on
|
||||
|
||||
# Compute input named index levels
|
||||
left_levels = [n for n in df_left.index.names if n is not None]
|
||||
right_levels = [n for n in df_right.index.names if n is not None]
|
||||
|
||||
# Compute output named index levels
|
||||
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
|
||||
|
||||
# Drop index levels that aren't involved in the merge
|
||||
drop_left = [n for n in left_levels if n not in left_on]
|
||||
if drop_left:
|
||||
df_left = df_left.reset_index(drop_left, drop=True)
|
||||
|
||||
drop_right = [n for n in right_levels if n not in right_on]
|
||||
if drop_right:
|
||||
df_right = df_right.reset_index(drop_right, drop=True)
|
||||
|
||||
# Convert remaining index levels to columns
|
||||
reset_left = [n for n in left_levels if n in left_on]
|
||||
if reset_left:
|
||||
df_left = df_left.reset_index(level=reset_left)
|
||||
|
||||
reset_right = [n for n in right_levels if n in right_on]
|
||||
if reset_right:
|
||||
df_right = df_right.reset_index(level=reset_right)
|
||||
|
||||
# Perform merge
|
||||
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
|
||||
|
||||
# Restore index levels
|
||||
if output_levels:
|
||||
expected = expected.set_index(output_levels)
|
||||
|
||||
return expected
|
||||
|
||||
|
||||
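# Usage note for compute_expected (illustrative, mirroring the tests below):
# with both frames indexed on "outer", calling it with on=["outer"] resets
# that level to a column on each side, merges on the plain column, and then
# restores "outer" as the index of the expected frame.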
@pytest.mark.parametrize(
|
||||
"on,how",
|
||||
[
|
||||
(["outer"], "inner"),
|
||||
(["inner"], "left"),
|
||||
(["outer", "inner"], "right"),
|
||||
(["inner", "outer"], "outer"),
|
||||
],
|
||||
)
|
||||
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df, on=on, how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, on=on, how=how)
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left_on,right_on,how",
|
||||
[
|
||||
(["outer"], ["outer"], "inner"),
|
||||
(["inner"], ["inner"], "right"),
|
||||
(["outer", "inner"], ["outer", "inner"], "left"),
|
||||
(["inner", "outer"], ["inner", "outer"], "outer"),
|
||||
],
|
||||
)
|
||||
def test_merge_indexes_and_columns_lefton_righton(
|
||||
left_df, right_df, left_on, right_on, how
|
||||
):
|
||||
# Construct expected result
|
||||
expected = compute_expected(
|
||||
left_df, right_df, left_on=left_on, right_on=right_on, how=how
|
||||
)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
|
||||
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
|
||||
# Construct left_df
|
||||
left_df = df1.set_index(left_index)
|
||||
|
||||
# Construct right_df
|
||||
right_df = df2.set_index(["outer", "inner"])
|
||||
|
||||
# Result
|
||||
expected = (
|
||||
left_df.reset_index()
|
||||
.join(
|
||||
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
|
||||
)
|
||||
.set_index(left_index)
|
||||
)
|
||||
|
||||
# Perform join
|
||||
result = left_df.join(
|
||||
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
@@ -0,0 +1,244 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
merge_ordered,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left():
|
||||
return DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right():
|
||||
return DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
|
||||
|
||||
|
||||
class TestMergeOrdered:
|
||||
def test_basic(self, left, right):
|
||||
result = merge_ordered(left, right, on="key")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"],
|
||||
"lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
|
||||
"rvalue": [np.nan, 1, 2, 3, np.nan, 4],
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self, left, right):
|
||||
result = merge_ordered(left, right, on="key", fill_method="ffill")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"],
|
||||
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
|
||||
"rvalue": [np.nan, 1, 2, 3, 3, 4],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
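# Tiny fill_method demo, separate from the fixtures above: merge_ordered
# unions the keys in sorted order, and "ffill" carries the last observed
# value forward into the gaps the union introduces.
import pandas as pd

a = pd.DataFrame({"key": ["a", "c"], "v": [1, 2]})
b = pd.DataFrame({"key": ["b", "c"], "w": [9, 8]})
out = pd.merge_ordered(a, b, on="key", fill_method="ffill")
assert out["key"].tolist() == ["a", "b", "c"]
assert out["v"].tolist() == [1, 1, 2]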
def test_multigroup(self, left, right):
|
||||
left = pd.concat([left, left], ignore_index=True)
|
||||
|
||||
left["group"] = ["a"] * 3 + ["b"] * 3
|
||||
|
||||
result = merge_ordered(
|
||||
left, right, on="key", left_by="group", fill_method="ffill"
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["a", "b", "c", "d", "e", "f"] * 2,
|
||||
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
|
||||
"rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
|
||||
}
|
||||
)
|
||||
expected["group"] = ["a"] * 6 + ["b"] * 6
|
||||
|
||||
tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
result2 = merge_ordered(
|
||||
right, left, on="key", right_by="group", fill_method="ffill"
|
||||
)
|
||||
tm.assert_frame_equal(result, result2.loc[:, result.columns])
|
||||
|
||||
result = merge_ordered(left, right, on="key", left_by="group")
|
||||
assert result["group"].notna().all()
|
||||
|
||||
@pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
|
||||
)
|
||||
def test_merge_type(self, left, right):
|
||||
class NotADataFrame(DataFrame):
|
||||
@property
|
||||
def _constructor(self):
|
||||
return NotADataFrame
|
||||
|
||||
nad = NotADataFrame(left)
|
||||
result = nad.merge(right, on="key")
|
||||
|
||||
assert isinstance(result, NotADataFrame)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"df_seq, pattern",
|
||||
[
|
||||
((), "[Nn]o objects"),
|
||||
([], "[Nn]o objects"),
|
||||
({}, "[Nn]o objects"),
|
||||
([None], "objects.*None"),
|
||||
([None, None], "objects.*None"),
|
||||
],
|
||||
)
|
||||
def test_empty_sequence_concat(self, df_seq, pattern):
|
||||
# GH 9157
|
||||
with pytest.raises(ValueError, match=pattern):
|
||||
pd.concat(df_seq)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arg", [[DataFrame()], [None, DataFrame()], [DataFrame(), None]]
|
||||
)
|
||||
def test_empty_sequence_concat_ok(self, arg):
|
||||
pd.concat(arg)
|
||||
|
||||
def test_doc_example(self):
|
||||
left = DataFrame(
|
||||
{
|
||||
"group": list("aaabbb"),
|
||||
"key": ["a", "c", "e", "a", "c", "e"],
|
||||
"lvalue": [1, 2, 3] * 2,
|
||||
}
|
||||
)
|
||||
|
||||
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
|
||||
|
||||
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"group": list("aaaaabbbbb"),
|
||||
"key": ["a", "b", "c", "d", "e"] * 2,
|
||||
"lvalue": [1, 1, 2, 2, 3] * 2,
|
||||
"rvalue": [np.nan, 1, 2, 3, 3] * 2,
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left, right, on, left_by, right_by, expected",
|
||||
[
|
||||
(
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
["T"],
|
||||
["G", "H"],
|
||||
None,
|
||||
DataFrame(
|
||||
{
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
"T",
|
||||
["G", "H"],
|
||||
None,
|
||||
DataFrame(
|
||||
{
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
DataFrame({"T": [2], "E": [1]}),
|
||||
DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
|
||||
["T"],
|
||||
None,
|
||||
["G", "H"],
|
||||
DataFrame(
|
||||
{
|
||||
"T": [1, 2, 3],
|
||||
"E": [np.nan, 1.0, np.nan],
|
||||
"G": ["g"] * 3,
|
||||
"H": ["h"] * 3,
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_list_type_by(self, left, right, on, left_by, right_by, expected):
|
||||
# GH 35269
|
||||
result = merge_ordered(
|
||||
left=left,
|
||||
right=right,
|
||||
on=on,
|
||||
left_by=left_by,
|
||||
right_by=right_by,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_by_length_equals_to_right_shape0(self):
|
||||
# GH 38166
|
||||
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
|
||||
right = DataFrame([[2, 1]], columns=list("ET"))
|
||||
result = merge_ordered(left, right, on="E", left_by=["G", "H"])
|
||||
expected = DataFrame(
|
||||
{"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_elements_not_in_by_but_in_df(self):
|
||||
# GH 38167
|
||||
left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
|
||||
right = DataFrame([[2, 1]], columns=list("ET"))
|
||||
msg = r"\{'h'\} not found in left columns"
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
merge_ordered(left, right, on="E", left_by=["G", "h"])
|
||||
|
||||
@pytest.mark.parametrize("invalid_method", ["linear", "carrot"])
|
||||
def test_ffill_validate_fill_method(self, left, right, invalid_method):
|
||||
# GH 55884
|
||||
with pytest.raises(
|
||||
ValueError, match=re.escape("fill_method must be 'ffill' or None")
|
||||
):
|
||||
merge_ordered(left, right, on="key", fill_method=invalid_method)

    def test_ffill_left_merge(self):
        # GH 57010
        df1 = DataFrame(
            {
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3, 1, 2, 3],
                "group": ["a", "a", "a", "b", "b", "b"],
            }
        )
        df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
        result = merge_ordered(
            df1, df2, fill_method="ffill", left_by="group", how="left"
        )
        expected = DataFrame(
            {
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3, 1, 2, 3],
                "group": ["a", "a", "a", "b", "b", "b"],
                "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0],
            }
        )
        tm.assert_frame_equal(result, expected)
@ -0,0 +1,934 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    RangeIndex,
    Series,
    Timestamp,
    option_context,
)
import pandas._testing as tm
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge


@pytest.fixture
def left():
    """left dataframe (not multi-indexed) for multi-index join tests"""
    # a little relevant example with NAs
    key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
    key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]

    data = np.random.default_rng(2).standard_normal(len(key1))
    return DataFrame({"key1": key1, "key2": key2, "data": data})


@pytest.fixture
def right(multiindex_dataframe_random_data):
    """right dataframe (multi-indexed) for multi-index join tests"""
    df = multiindex_dataframe_random_data
    df.index.names = ["key1", "key2"]

    df.columns = ["j_one", "j_two", "j_three"]
    return df


@pytest.fixture
def left_multi():
    return DataFrame(
        {
            "Origin": ["A", "A", "B", "B", "C"],
            "Destination": ["A", "B", "A", "C", "A"],
            "Period": ["AM", "AM", "IP", "AM", "OP"],
            "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
            "Trips": [1987, 3647, 2470, 4296, 4444],
        },
        columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
    ).set_index(["Origin", "Destination", "Period", "TripPurp"])


@pytest.fixture
def right_multi():
    return DataFrame(
        {
            "Origin": ["A", "A", "B", "B", "C", "C", "E"],
            "Destination": ["A", "B", "A", "B", "A", "B", "F"],
            "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
            "LinkType": ["a", "b", "c", "b", "a", "b", "a"],
            "Distance": [100, 80, 90, 80, 75, 35, 55],
        },
        columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
    ).set_index(["Origin", "Destination", "Period", "LinkType"])


@pytest.fixture
def on_cols_multi():
    return ["Origin", "Destination", "Period"]


class TestMergeMulti:
    def test_merge_on_multikey(self, left, right, join_type):
        on_cols = ["key1", "key2"]
        result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)

        expected = merge(left, right.reset_index(), on=on_cols, how=join_type)

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
            drop=True
        )

        expected = merge(
            left, right.reset_index(), on=on_cols, how=join_type, sort=True
        )

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
    )
    @pytest.mark.parametrize("sort", [True, False])
    def test_left_join_multi_index(self, sort, infer_string):
        with option_context("future.infer_string", infer_string):
            icols = ["1st", "2nd", "3rd"]
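
            # bind_cols folds each row's three key columns into one number
            # (letters via ord(), NaN as 0) so run_asserts can checksum that
            # joined rows stayed aligned with their keys.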
            def bind_cols(df):
                iord = lambda a: 0 if a != a else ord(a)
                f = lambda ts: ts.map(iord) - ord("a")
                return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10

            def run_asserts(left, right, sort):
                res = left.join(right, on=icols, how="left", sort=sort)

                assert len(left) < len(res) + 1
                assert not res["4th"].isna().any()
                assert not res["5th"].isna().any()

                tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
                result = bind_cols(res.iloc[:, :-2])
                tm.assert_series_equal(res["4th"], result, check_names=False)
                assert result.name is None

                if sort:
                    tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))

                out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")

                res.index = RangeIndex(len(res))
                tm.assert_frame_equal(out, res)

            lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
            left = DataFrame(
                np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
            )
            # Explicit cast to float to avoid implicit cast when setting nan
            left.insert(
                1,
                "2nd",
                np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
            )

            i = np.random.default_rng(2).permutation(len(left))
            right = left.iloc[i].copy()

            left["4th"] = bind_cols(left)
            right["5th"] = -bind_cols(right)
            right.set_index(icols, inplace=True)

            run_asserts(left, right, sort)

            # inject some nulls
            left.loc[1::4, "1st"] = np.nan
            left.loc[2::5, "2nd"] = np.nan
            left.loc[3::6, "3rd"] = np.nan
            left["4th"] = bind_cols(left)

            i = np.random.default_rng(2).permutation(len(left))
            right = left.iloc[i, :-1]
            right["5th"] = -bind_cols(right)
            right.set_index(icols, inplace=True)

            run_asserts(left, right, sort)

    @pytest.mark.parametrize("sort", [False, True])
    def test_merge_right_vs_left(self, left, right, sort):
        # compare left vs right merge with multikey
        on_cols = ["key1", "key2"]
        merged_left_right = left.merge(
            right, left_on=on_cols, right_index=True, how="left", sort=sort
        )

        merge_right_left = right.merge(
            left, right_on=on_cols, left_index=True, how="right", sort=sort
        )

        # Reorder columns
        merge_right_left = merge_right_left[merged_left_right.columns]

        tm.assert_frame_equal(merged_left_right, merge_right_left)

    def test_merge_multiple_cols_with_mixed_cols_index(self):
        # GH29522
        s = Series(
            range(6),
            MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
            name="Amount",
        )
        df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
        result = merge(df, s.reset_index(), on=["lev1", "lev2"])
        expected = DataFrame(
            {
                "lev1": list("AAABBB"),
                "lev2": [1, 2, 3, 1, 2, 3],
                "col": [0] * 6,
                "Amount": range(6),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_compress_group_combinations(self):
        # ~ 40000000 possible unique groups
        key1 = [str(i) for i in range(10000)]
        key1 = np.tile(key1, 2)
        key2 = key1[::-1]

        df = DataFrame(
            {
                "key1": key1,
                "key2": key2,
                "value1": np.random.default_rng(2).standard_normal(20000),
            }
        )

        df2 = DataFrame(
            {
                "key1": key1[::2],
                "key2": key2[::2],
                "value2": np.random.default_rng(2).standard_normal(10000),
            }
        )

        # just to hit the label compression code path
        merge(df, df2, how="outer")

    def test_left_join_index_preserve_order(self):
        on_cols = ["k1", "k2"]
        left = DataFrame(
            {
                "k1": [0, 1, 2] * 8,
                "k2": ["foo", "bar"] * 12,
                "v": np.array(np.arange(24), dtype=np.int64),
            }
        )

        index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
        right = DataFrame({"v2": [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected["v2"] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7

        tm.assert_frame_equal(result, expected)

        result.sort_values(on_cols, kind="mergesort", inplace=True)
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame(
            {
                "k1": [0, 1, 2] * 8,
                "k2": ["foo", "bar"] * 12,
                "k3": np.array([0, 1, 2] * 8, dtype=np.float32),
                "v": np.array(np.arange(24), dtype=np.int32),
            }
        )

        index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
        right = DataFrame({"v2": [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected["v2"] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7

        tm.assert_frame_equal(result, expected)

        result = result.sort_values(on_cols, kind="mergesort")
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame(
            [
                ["X", "Y", "C", "a"],
                ["W", "Y", "C", "e"],
                ["V", "Q", "A", "h"],
                ["V", "R", "D", "i"],
                ["X", "Y", "D", "b"],
                ["X", "Y", "A", "c"],
                ["W", "Q", "B", "f"],
                ["W", "R", "C", "g"],
                ["V", "Y", "C", "j"],
                ["X", "Y", "B", "d"],
            ],
            columns=["cola", "colb", "colc", "tag"],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
        )

        right = DataFrame(
            [
                ["W", "R", "C", 0],
                ["W", "Q", "B", 3],
                ["W", "Q", "B", 8],
                ["X", "Y", "A", 1],
                ["X", "Y", "A", 4],
                ["X", "Y", "B", 5],
                ["X", "Y", "C", 6],
                ["X", "Y", "C", 9],
                ["X", "Q", "C", -6],
                ["X", "R", "C", -9],
                ["V", "Y", "C", 7],
                ["V", "R", "D", 2],
                ["V", "R", "D", -1],
                ["V", "Q", "A", -3],
            ],
            columns=["col1", "col2", "col3", "val"],
        ).set_index(["col1", "col2", "col3"])

        result = left.join(right, on=["cola", "colb", "colc"], how="left")

        expected = DataFrame(
            [
                ["X", "Y", "C", "a", 6],
                ["X", "Y", "C", "a", 9],
                ["W", "Y", "C", "e", np.nan],
                ["V", "Q", "A", "h", -3],
                ["V", "R", "D", "i", 2],
                ["V", "R", "D", "i", -1],
                ["X", "Y", "D", "b", np.nan],
                ["X", "Y", "A", "c", 1],
                ["X", "Y", "A", "c", 4],
                ["W", "Q", "B", "f", 3],
                ["W", "Q", "B", "f", 8],
                ["W", "R", "C", "g", 0],
                ["V", "Y", "C", "j", 7],
                ["X", "Y", "B", "d", 5],
            ],
            columns=["cola", "colb", "colc", "tag", "val"],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)

        expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match(self):
        left = DataFrame(
            [["c", 0], ["b", 1], ["a", 2], ["b", 3]],
            columns=["tag", "val"],
            index=[2, 0, 1, 3],
        )

        right = DataFrame(
            [
                ["a", "v"],
                ["c", "w"],
                ["c", "x"],
                ["d", "y"],
                ["a", "z"],
                ["c", "r"],
                ["e", "q"],
                ["c", "s"],
            ],
            columns=["tag", "char"],
        ).set_index("tag")

        result = left.join(right, on="tag", how="left")
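        # "tag" matches multiple right rows, so each left row is repeated once
        # per match while the left frame's original row order is preserved.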

        expected = DataFrame(
            [
                ["c", 0, "w"],
                ["c", 0, "x"],
                ["c", 0, "r"],
                ["c", 0, "s"],
                ["b", 1, np.nan],
                ["a", 2, "v"],
                ["a", 2, "z"],
                ["b", 3, np.nan],
            ],
            columns=["tag", "val", "char"],
            index=[2, 2, 2, 2, 0, 1, 1, 3],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on="tag", how="left", sort=True)
        expected2 = expected.sort_values("tag", kind="mergesort")

        tm.assert_frame_equal(result, expected2)

        # GH7331 - maintain left frame order in left merge
        result = merge(left, right.reset_index(), how="left", on="tag")
        expected.index = RangeIndex(len(expected))
        tm.assert_frame_equal(result, expected)

    def test_left_merge_na_buglet(self):
        left = DataFrame(
            {
                "id": list("abcde"),
                "v1": np.random.default_rng(2).standard_normal(5),
                "v2": np.random.default_rng(2).standard_normal(5),
                "dummy": list("abcde"),
                "v3": np.random.default_rng(2).standard_normal(5),
            },
            columns=["id", "v1", "v2", "dummy", "v3"],
        )
        right = DataFrame(
            {
                "id": ["a", "b", np.nan, np.nan, np.nan],
                "sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
            }
        )

        result = merge(left, right, on="id", how="left")

        rdf = right.drop(["id"], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(result, expected)

    def test_merge_na_keys(self):
        data = [
            [1950, "A", 1.5],
            [1950, "B", 1.5],
            [1955, "B", 1.5],
            [1960, "B", np.nan],
            [1970, "B", 4.0],
            [1950, "C", 4.0],
            [1960, "C", np.nan],
            [1965, "C", 3.0],
            [1970, "C", 4.0],
        ]

        frame = DataFrame(data, columns=["year", "panel", "data"])

        other_data = [
            [1960, "A", np.nan],
            [1970, "A", np.nan],
            [1955, "A", np.nan],
            [1965, "A", np.nan],
            [1965, "B", np.nan],
            [1955, "C", np.nan],
        ]
        other = DataFrame(other_data, columns=["year", "panel", "data"])

        result = frame.merge(other, how="outer")
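
        # NaN keys never match each other in a merge; the expected frame is
        # built by swapping NaN for a sentinel, merging, and swapping back.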
        expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
        expected = expected.replace(-999, np.nan)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, klass):
        # see gh-19038
        df = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year
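        # A vector passed inside "on" that is not a column name is used as a
        # join key and shows up in the result under an auto-generated key_N name.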

        if klass is not None:
            on_vector = klass(on_vector)

        exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
        expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("merge_type", ["left", "right"])
    def test_merge_datetime_multi_index_empty_df(self, merge_type):
        # see gh-36895

        left = DataFrame(
            data={
                "data": [1.5, 1.5],
            },
            index=MultiIndex.from_tuples(
                [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
                names=["date", "panel"],
            ),
        )

        right = DataFrame(
            index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
        )

        expected_index = MultiIndex.from_tuples(
            [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
            names=["date", "panel"],
        )

        if merge_type == "left":
            expected = DataFrame(
                data={
                    "data": [1.5, 1.5],
                    "state": np.array([np.nan, np.nan], dtype=object),
                },
                index=expected_index,
            )
            results_merge = left.merge(right, how="left", on=["date", "panel"])
            results_join = left.join(right, how="left")
        else:
            expected = DataFrame(
                data={
                    "state": np.array([np.nan, np.nan], dtype=object),
                    "data": [1.5, 1.5],
                },
                index=expected_index,
            )
            results_merge = right.merge(left, how="right", on=["date", "panel"])
            results_join = right.join(left, how="right")

        tm.assert_frame_equal(results_merge, expected)
        tm.assert_frame_equal(results_join, expected)

    @pytest.fixture
    def household(self):
        household = DataFrame(
            {
                "household_id": [1, 2, 3],
                "male": [0, 1, 0],
                "wealth": [196087.3, 316478.7, 294750],
            },
            columns=["household_id", "male", "wealth"],
        ).set_index("household_id")
        return household

    @pytest.fixture
    def portfolio(self):
        portfolio = DataFrame(
            {
                "household_id": [1, 2, 2, 3, 3, 3, 4],
                "asset_id": [
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                "name": [
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                    np.nan,
                ],
                "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            },
            columns=["household_id", "asset_id", "name", "share"],
        ).set_index(["household_id", "asset_id"])
        return portfolio

    @pytest.fixture
    def expected(self):
        expected = (
            DataFrame(
                {
                    "male": [0, 1, 1, 0, 0, 0],
                    "wealth": [
                        196087.3,
                        316478.7,
                        316478.7,
                        294750.0,
                        294750.0,
                        294750.0,
                    ],
                    "name": [
                        "ABN Amro",
                        "Robeco",
                        "Royal Dutch Shell",
                        "Royal Dutch Shell",
                        "AAB Eastern Europe Equity Fund",
                        "Postbank BioTech Fonds",
                    ],
                    "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                    "household_id": [1, 2, 2, 3, 3, 3],
                    "asset_id": [
                        "nl0000301109",
                        "nl0000289783",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "nl0000289965",
                    ],
                }
            )
            .set_index(["household_id", "asset_id"])
            .reindex(columns=["male", "wealth", "name", "share"])
        )
        return expected

    def test_join_multi_levels(self, portfolio, household, expected):
        portfolio = portfolio.copy()
        household = household.copy()

        # GH 3662
        # merge multi-levels
        result = household.join(portfolio, how="inner")
        tm.assert_frame_equal(result, expected)

    def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
        portfolio = portfolio.copy()
        household = household.copy()

        # equivalency
        result = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        ).set_index(["household_id", "asset_id"])
        tm.assert_frame_equal(result, expected)

    def test_join_multi_levels_outer(self, portfolio, household, expected):
        portfolio = portfolio.copy()
        household = household.copy()

        result = household.join(portfolio, how="outer")
        expected = concat(
            [
                expected,
                (
                    DataFrame(
                        {"share": [1.00]},
                        index=MultiIndex.from_tuples(
                            [(4, np.nan)], names=["household_id", "asset_id"]
                        ),
                    )
                ),
            ],
            axis=0,
            sort=True,
        ).reindex(columns=expected.columns)
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_join_multi_levels_invalid(self, portfolio, household):
        portfolio = portfolio.copy()
        household = household.copy()

        # invalid cases
        household.index.name = "foo"

        with pytest.raises(
            ValueError, match="cannot join with no overlapping index names"
        ):
            household.join(portfolio, how="inner")

        portfolio2 = portfolio.copy()
        # set_names returns a new Index by default, so assign the result back
        portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])

        with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how="inner")

    def test_join_multi_levels2(self):
        # some more advanced merges
        # GH6360
        household = DataFrame(
            {
                "household_id": [1, 2, 2, 3, 3, 3, 4],
                "asset_id": [
                    "nl0000301109",
                    "nl0000301109",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            },
            columns=["household_id", "asset_id", "share"],
        ).set_index(["household_id", "asset_id"])

        log_return = DataFrame(
            {
                "asset_id": [
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "lu0197800237",
                ],
                "t": [233, 234, 235, 180, 181],
                "log_return": [
                    0.09604978,
                    -0.06524096,
                    0.03532373,
                    0.03025441,
                    0.036997,
                ],
            }
        ).set_index(["asset_id", "t"])

        expected = (
            DataFrame(
                {
                    "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
                    "asset_id": [
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "lu0197800237",
                    ],
                    "t": [233, 234, 235, 233, 234, 235, 180, 181],
                    "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
                    "log_return": [
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.03025441,
                        0.036997,
                    ],
                }
            )
            .set_index(["household_id", "asset_id", "t"])
            .reindex(columns=["share", "log_return"])
        )

        # this is the equivalency
        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="inner",
        ).set_index(["household_id", "asset_id", "t"])
        tm.assert_frame_equal(result, expected)

        expected = (
            DataFrame(
                {
                    "household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4],
                    "asset_id": [
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "lu0197800237",
                        "nl0000289965",
                        "nl0000301109",
                        "nl0000301109",
                        None,
                    ],
                    "t": [
                        233,
                        234,
                        235,
                        233,
                        234,
                        235,
                        180,
                        181,
                        None,
                        None,
                        None,
                        None,
                    ],
                    "share": [
                        0.6,
                        0.6,
                        0.6,
                        0.15,
                        0.15,
                        0.15,
                        0.6,
                        0.6,
                        0.25,
                        1.0,
                        0.4,
                        1.0,
                    ],
                    "log_return": [
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.03025441,
                        0.036997,
                        None,
                        None,
                        None,
                        None,
                    ],
                }
            )
            .set_index(["household_id", "asset_id", "t"])
            .reindex(columns=["share", "log_return"])
        )

        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="outer",
        ).set_index(["household_id", "asset_id", "t"])

        tm.assert_frame_equal(result, expected)


class TestJoinMultiMulti:
    def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
        left_names = left_multi.index.names
        right_names = right_multi.index.names
        if join_type == "right":
            level_order = right_names + left_names.difference(right_names)
        else:
            level_order = left_names + right_names.difference(left_names)
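        # the calling frame's index levels come first in the result, followed
        # by any levels that exist only on the other side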
        # Multi-index join tests
        expected = (
            merge(
                left_multi.reset_index(),
                right_multi.reset_index(),
                how=join_type,
                on=on_cols_multi,
            )
            .set_index(level_order)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    def test_join_multi_empty_frames(
        self, left_multi, right_multi, join_type, on_cols_multi
    ):
        left_multi = left_multi.drop(columns=left_multi.columns)
        right_multi = right_multi.drop(columns=right_multi.columns)

        left_names = left_multi.index.names
        right_names = right_multi.index.names
        if join_type == "right":
            level_order = right_names + left_names.difference(right_names)
        else:
            level_order = left_names + right_names.difference(left_names)

        expected = (
            merge(
                left_multi.reset_index(),
                right_multi.reset_index(),
                how=join_type,
                on=on_cols_multi,
            )
            .set_index(level_order)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, box):
        # see gh-19038
        df = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if box is not None:
            on_vector = box(on_vector)

        exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
        expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)

    def test_single_common_level(self):
        index_left = MultiIndex.from_tuples(
            [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
        )

        left = DataFrame(
            {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
        )

        index_right = MultiIndex.from_tuples(
            [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
        )

        right = DataFrame(
            {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
            index=index_right,
        )

        result = left.join(right)
        expected = merge(
            left.reset_index(), right.reset_index(), on=["key"], how="inner"
        ).set_index(["key", "X", "Y"])

        tm.assert_frame_equal(result, expected)

    def test_join_multi_wrong_order(self):
        # GH 25760
        # GH 28956

        midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
        midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])

        left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
        right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})

        result = left.join(right)

        expected = DataFrame(
            index=midx1,
            data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
        )

        tm.assert_frame_equal(result, expected)
@ -0,0 +1,886 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    CategoricalDtype,
    CategoricalIndex,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    crosstab,
)
import pandas._testing as tm


@pytest.fixture
def df():
    df = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    return pd.concat([df, df], ignore_index=True)


class TestCrosstab:
    def test_crosstab_single(self, df):
        result = crosstab(df["A"], df["C"])
        expected = df.groupby(["A", "C"]).size().unstack()
        tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))

    def test_crosstab_multiple(self, df):
        result = crosstab(df["A"], [df["B"], df["C"]])
        expected = df.groupby(["A", "B", "C"]).size()
        expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

        result = crosstab([df["B"], df["C"]], df["A"])
        expected = df.groupby(["B", "C", "A"]).size()
        expected = expected.unstack("A").fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [np.array, list, tuple])
    def test_crosstab_ndarray(self, box):
        # GH 44076
        a = box(np.random.default_rng(2).integers(0, 5, size=100))
        b = box(np.random.default_rng(2).integers(0, 3, size=100))
        c = box(np.random.default_rng(2).integers(0, 10, size=100))

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
        expected = crosstab(df["a"], [df["b"], df["c"]])
        tm.assert_frame_equal(result, expected)

        result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
        expected = crosstab([df["b"], df["c"]], df["a"])
        tm.assert_frame_equal(result, expected)

        # assign arbitrary names
        result = crosstab(a, c)
        expected = crosstab(df["a"], df["c"])
        expected.index.names = ["row_0"]
        expected.columns.names = ["col_0"]
        tm.assert_frame_equal(result, expected)

    def test_crosstab_non_aligned(self):
        # GH 17005
        a = Series([0, 1, 1], index=["a", "b", "c"])
        b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
        c = np.array([3, 4, 3], dtype=np.int64)

        expected = DataFrame(
            [[1, 0], [1, 1]],
            index=Index([0, 1], name="row_0"),
            columns=Index([3, 4], name="col_0"),
        )

        result = crosstab(a, b)
        tm.assert_frame_equal(result, expected)

        result = crosstab(a, c)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_margins(self):
        a = np.random.default_rng(2).integers(0, 7, size=100)
        b = np.random.default_rng(2).integers(0, 3, size=100)
        c = np.random.default_rng(2).integers(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
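
        # margins=True appends an "All" row and column holding subtotal counts.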
        assert result.index.names == ("a",)
        assert result.columns.names == ["b", "c"]

        all_cols = result["All", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        # to keep index.name
        exp_margin = Series([len(df)], index=Index(["All"], name="a"))
        exp_cols = pd.concat([exp_cols, exp_margin])
        exp_cols.name = ("All", "")

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc["All"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
        exp_rows.name = "All"

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)

    def test_crosstab_margins_set_margin_name(self):
        # GH 15972
        a = np.random.default_rng(2).integers(0, 7, size=100)
        b = np.random.default_rng(2).integers(0, 3, size=100)
        c = np.random.default_rng(2).integers(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(
            a,
            [b, c],
            rownames=["a"],
            colnames=("b", "c"),
            margins=True,
            margins_name="TOTAL",
        )

        assert result.index.names == ("a",)
        assert result.columns.names == ["b", "c"]

        all_cols = result["TOTAL", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        # to keep index.name
        exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
        exp_cols = pd.concat([exp_cols, exp_margin])
        exp_cols.name = ("TOTAL", "")

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc["TOTAL"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
        exp_rows.name = "TOTAL"

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)

        msg = "margins_name argument must be a string"
        for margins_name in [666, None, ["a", "b"]]:
            with pytest.raises(ValueError, match=msg):
                crosstab(
                    a,
                    [b, c],
                    rownames=["a"],
                    colnames=("b", "c"),
                    margins=True,
                    margins_name=margins_name,
                )

    def test_crosstab_pass_values(self):
        a = np.random.default_rng(2).integers(0, 7, size=100)
        b = np.random.default_rng(2).integers(0, 3, size=100)
        c = np.random.default_rng(2).integers(0, 5, size=100)
        values = np.random.default_rng(2).standard_normal(100)

        table = crosstab(
            [a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"]
        )

        df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})

        expected = df.pivot_table(
            "values", index=["foo", "bar"], columns="baz", aggfunc="sum"
        )
        tm.assert_frame_equal(table, expected)

    def test_crosstab_dropna(self):
        # GH 3820
        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
        b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
        c = np.array(
            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
        )
        res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
        m = MultiIndex.from_tuples(
            [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
            names=["b", "c"],
        )
        tm.assert_index_equal(res.columns, m)

    def test_crosstab_no_overlap(self):
        # GH 10291

        s1 = Series([1, 2, 3], index=[1, 2, 3])
        s2 = Series([4, 5, 6], index=[4, 5, 6])

        actual = crosstab(s1, s2)
        expected = DataFrame(
            index=Index([], dtype="int64", name="row_0"),
            columns=Index([], dtype="int64", name="col_0"),
        )

        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna(self):
        # GH 12577
        # pivot_table counts null into margin ('All')
        # when margins=true and dropna=true

        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna2(self):
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3.0, 4.0, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna3(self):
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna4(self):
        # GH 12642
        # _add_margins raises KeyError: Level None not found
        # when margins=True and dropna=False
        # GH: 10772: Keep np.nan in result with dropna=False
        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]])
        expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna5(self):
        # GH: 10772: Keep np.nan in result with dropna=False
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame(
            [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]]
        )
        expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
        expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna6(self):
        # GH: 10772: Keep np.nan in result with dropna=False
        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
        b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
        c = np.array(
            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
        )

        actual = crosstab(
            a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
        )
        m = MultiIndex.from_arrays(
            [
                ["one", "one", "two", "two", np.nan, np.nan, "All"],
                ["dull", "shiny", "dull", "shiny", "dull", "shiny", ""],
            ],
            names=["b", "c"],
        )
        expected = DataFrame(
            [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]],
            columns=m,
        )
        expected.index = Index(["bar", "foo", "All"], name="a")
        tm.assert_frame_equal(actual, expected)

        actual = crosstab(
            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
        )
        m = MultiIndex.from_arrays(
            [
                ["bar", "bar", "bar", "foo", "foo", "foo", "All"],
                ["one", "two", np.nan, "one", "two", np.nan, ""],
            ],
            names=["a", "b"],
        )
        expected = DataFrame(
            [
                [1, 0, 1.0],
                [1, 0, 1.0],
                [0, 0, np.nan],
                [2, 0, 2.0],
                [1, 1, 2.0],
                [0, 1, np.nan],
                [5, 2, 7.0],
            ],
            index=m,
        )
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)

        actual = crosstab(
            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
        )
        m = MultiIndex.from_arrays(
            [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
            names=["a", "b"],
        )
        expected = DataFrame(
            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
        )
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)

    def test_crosstab_normalize(self):
        # Issue 12578
        df = DataFrame(
            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
        )

        rindex = Index([1, 2], name="a")
        cindex = Index([3, 4], name="b")
        full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
        row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
        col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
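
        # normalize="all" divides each cell by the grand total, "index" by its
        # row sum, and "columns" by its column sum; 0 and 1 are axis aliases.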
|
||||
|
||||
# Check all normalize args
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=1),
|
||||
crosstab(df.a, df.b, normalize="columns"),
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
|
||||
)
|
||||
|
||||
row_normal_margins = DataFrame(
|
||||
[[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4], name="b", dtype="object"),
|
||||
)
|
||||
col_normal_margins = DataFrame(
|
||||
[[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
|
||||
index=Index([1, 2], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
|
||||
all_normal_margins = DataFrame(
|
||||
[[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
|
||||
)
|
||||
|
||||
def test_crosstab_normalize_arrays(self):
|
||||
# GH#12578
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
|
||||
)
|
||||
|
||||
# Test arrays
|
||||
crosstab(
|
||||
[np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
|
||||
)
|
||||
|
||||
# Test with aggfunc
|
||||
norm_counts = DataFrame(
|
||||
[[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b"),
|
||||
)
|
||||
test_case = crosstab(
|
||||
df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
|
||||
)
|
||||
tm.assert_frame_equal(test_case, norm_counts)
|
||||
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
|
||||
)
|
||||
|
||||
norm_sum = DataFrame(
|
||||
[[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
msg = "using DataFrameGroupBy.sum"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
test_case = crosstab(
|
||||
df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
|
||||
)
|
||||
tm.assert_frame_equal(test_case, norm_sum)
|
||||
|
||||
def test_crosstab_with_empties(self, using_array_manager):
|
||||
# Check handling of empties
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 2, 2, 2],
|
||||
"b": [3, 3, 4, 4, 4],
|
||||
"c": [np.nan, np.nan, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
empty = DataFrame(
|
||||
[[0.0, 0.0], [0.0, 0.0]],
|
||||
index=Index([1, 2], name="a", dtype="int64"),
|
||||
columns=Index([3, 4], name="b"),
|
||||
)
|
||||
|
||||
for i in [True, "index", "columns"]:
|
||||
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
|
||||
tm.assert_frame_equal(empty, calculated)
|
||||
|
||||
nans = DataFrame(
|
||||
[[0.0, np.nan], [0.0, 0.0]],
|
||||
index=Index([1, 2], name="a", dtype="int64"),
|
||||
columns=Index([3, 4], name="b"),
|
||||
)
|
||||
if using_array_manager:
|
||||
# INFO(ArrayManager) column without NaNs can preserve int dtype
|
||||
nans[3] = nans[3].astype("int64")
|
||||
|
||||
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
|
||||
tm.assert_frame_equal(nans, calculated)
|
||||
|
||||
def test_crosstab_errors(self):
|
||||
# Issue 12578
|
||||
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
|
||||
)
|
||||
|
||||
error = "values cannot be used without an aggfunc."
|
||||
with pytest.raises(ValueError, match=error):
|
||||
crosstab(df.a, df.b, values=df.c)
|
||||
|
||||
error = "aggfunc cannot be used without values"
|
||||
with pytest.raises(ValueError, match=error):
|
||||
crosstab(df.a, df.b, aggfunc=np.mean)
|
||||
|
||||
error = "Not a valid normalize argument"
|
||||
with pytest.raises(ValueError, match=error):
|
||||
crosstab(df.a, df.b, normalize="42")
|
||||
|
||||
with pytest.raises(ValueError, match=error):
|
||||
crosstab(df.a, df.b, normalize=42)
|
||||
|
||||
error = "Not a valid margins argument"
|
||||
with pytest.raises(ValueError, match=error):
|
||||
crosstab(df.a, df.b, normalize="all", margins=42)
|
||||
|
||||
def test_crosstab_with_categorial_columns(self):
|
||||
# GH 8860
|
||||
df = DataFrame(
|
||||
{
|
||||
"MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
|
||||
"MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
|
||||
}
|
||||
)
|
||||
categories = ["Sedan", "Electric", "Pickup"]
|
||||
df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
|
||||
result = crosstab(df["MAKE"], df["MODEL"])
|
||||
|
||||
expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
|
||||
expected_columns = CategoricalIndex(
|
||||
categories, categories=categories, ordered=False, name="MODEL"
|
||||
)
|
||||
expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
|
||||
expected = DataFrame(
|
||||
expected_data, index=expected_index, columns=expected_columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_with_numpy_size(self):
|
||||
# GH 4003
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["one", "one", "two", "three"] * 6,
|
||||
"B": ["A", "B", "C"] * 8,
|
||||
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
|
||||
"D": np.random.default_rng(2).standard_normal(24),
|
||||
"E": np.random.default_rng(2).standard_normal(24),
|
||||
}
|
||||
)
|
||||
result = crosstab(
|
||||
index=[df["A"], df["B"]],
|
||||
columns=[df["C"]],
|
||||
margins=True,
|
||||
aggfunc=np.size,
|
||||
values=df["D"],
|
||||
)
|
||||
expected_index = MultiIndex(
|
||||
levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
|
||||
codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected_column = Index(["bar", "foo", "All"], name="C")
|
||||
expected_data = np.array(
|
||||
[
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[12.0, 12.0, 24.0],
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
expected_data, index=expected_index, columns=expected_column
|
||||
)
|
||||
# aggfunc is np.size, resulting in integers
|
||||
expected["All"] = expected["All"].astype("int64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_duplicate_names(self):
|
||||
# GH 13279 / 22529
|
||||
|
||||
s1 = Series(range(3), name="foo")
|
||||
s2_foo = Series(range(1, 4), name="foo")
|
||||
s2_bar = Series(range(1, 4), name="bar")
|
||||
s3 = Series(range(3), name="waldo")
|
||||
|
||||
# check result computed with duplicate labels against
|
||||
# result computed with unique labels, then relabelled
|
||||
mapper = {"bar": "foo"}
|
||||
|
||||
# duplicate row, column labels
|
||||
result = crosstab(s1, s2_foo)
|
||||
expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# duplicate row, unique column labels
|
||||
result = crosstab([s1, s2_foo], s3)
|
||||
expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# unique row, duplicate column labels
|
||||
result = crosstab(s3, [s1, s2_foo])
|
||||
expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
|
||||
def test_crosstab_tuple_name(self, names):
|
||||
s1 = Series(range(3), name=names[0])
|
||||
s2 = Series(range(1, 4), name=names[1])
|
||||
|
||||
mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
|
||||
expected = Series(1, index=mi).unstack(1, fill_value=0)
|
||||
|
||||
result = crosstab(s1, s2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_both_tuple_names(self):
|
||||
# GH 18321
|
||||
s1 = Series(range(3), name=("a", "b"))
|
||||
s2 = Series(range(3), name=("c", "d"))
|
||||
|
||||
expected = DataFrame(
|
||||
np.eye(3, dtype="int64"),
|
||||
index=Index(range(3), name=("a", "b")),
|
||||
columns=Index(range(3), name=("c", "d")),
|
||||
)
|
||||
result = crosstab(s1, s2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_unsorted_order(self):
|
||||
df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
|
||||
result = crosstab(df.index, [df.b, df.a])
|
||||
e_idx = Index(["A", "B", "C"], name="row_0")
|
||||
e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
|
||||
expected = DataFrame(
|
||||
[[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_normalize_multiple_columns(self):
|
||||
# GH 15150
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["one", "one", "two", "three"] * 6,
|
||||
"B": ["A", "B", "C"] * 8,
|
||||
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
|
||||
"D": [0] * 24,
|
||||
"E": [0] * 24,
|
||||
}
|
||||
)
|
||||
|
||||
msg = "using DataFrameGroupBy.sum"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = crosstab(
|
||||
[df.A, df.B],
|
||||
df.C,
|
||||
values=df.D,
|
||||
aggfunc=np.sum,
|
||||
normalize=True,
|
||||
margins=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
|
||||
columns=Index(["bar", "foo", "All"], name="C"),
|
||||
index=MultiIndex.from_tuples(
|
||||
[
|
||||
("one", "A"),
|
||||
("one", "B"),
|
||||
("one", "C"),
|
||||
("three", "A"),
|
||||
("three", "B"),
|
||||
("three", "C"),
|
||||
("two", "A"),
|
||||
("two", "B"),
|
||||
("two", "C"),
|
||||
("All", ""),
|
||||
],
|
||||
names=["A", "B"],
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_normalize(self):
|
||||
# GH 27500
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
|
||||
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
|
||||
"C": [
|
||||
"small",
|
||||
"large",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
],
|
||||
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
|
||||
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
|
||||
}
|
||||
)
|
||||
# normalize on index
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
|
||||
)
|
||||
expected.index = MultiIndex(
|
||||
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
|
||||
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected.columns = Index(["large", "small"], name="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# normalize on columns
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.25, 0.2, 0.222222],
|
||||
[0.25, 0.2, 0.222222],
|
||||
[0.5, 0.2, 0.333333],
|
||||
[0, 0.4, 0.222222],
|
||||
]
|
||||
)
|
||||
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
|
||||
expected.index = MultiIndex(
|
||||
levels=[["bar", "foo"], ["one", "two"]],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# normalize on both index and column
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.111111, 0.111111, 0.222222],
|
||||
[0.111111, 0.111111, 0.222222],
|
||||
[0.222222, 0.111111, 0.333333],
|
||||
[0.000000, 0.222222, 0.222222],
|
||||
[0.444444, 0.555555, 1],
|
||||
]
|
||||
)
|
||||
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
|
||||
expected.index = MultiIndex(
|
||||
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
|
||||
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_normalize_multiple_columns(self):
|
||||
# GH 35144
|
||||
# use multiple columns with margins and normalization
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
|
||||
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
|
||||
"C": [
|
||||
"small",
|
||||
"large",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
],
|
||||
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
|
||||
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
|
||||
}
|
||||
)
|
||||
result = crosstab(
|
||||
index=df.C,
|
||||
columns=[df.A, df.B],
|
||||
margins=True,
|
||||
margins_name="margin",
|
||||
normalize=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
|
||||
[0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
|
||||
[0.222222, 0.222222, 0.333333, 0.222222, 1.0],
|
||||
],
|
||||
index=["large", "small", "margin"],
|
||||
)
|
||||
expected.columns = MultiIndex(
|
||||
levels=[["bar", "foo", "margin"], ["", "one", "two"]],
|
||||
codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected.index.name = "C"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_support_Float(self):
|
||||
# GH 50313
|
||||
# use Float64 formats and function aggfunc with margins
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
|
||||
dtype="Float64",
|
||||
)
|
||||
result = crosstab(
|
||||
df["A"],
|
||||
df["B"],
|
||||
values=df["C"],
|
||||
aggfunc="sum",
|
||||
margins=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[-1.0, pd.NA, 10.0, 9.0],
|
||||
[10.0, 1.0, pd.NA, 11.0],
|
||||
[9.0, 1.0, 10.0, 20.0],
|
||||
],
|
||||
index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
|
||||
columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
|
||||
dtype="Float64",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
    def test_margin_with_ordered_categorical_column(self):
        # GH 25278
        df = DataFrame(
            {
                "First": ["B", "B", "C", "A", "B", "C"],
                "Second": ["C", "B", "B", "B", "C", "A"],
            }
        )
        df["First"] = df["First"].astype(CategoricalDtype(ordered=True))
        customized_categories_order = ["C", "A", "B"]
        df["First"] = df["First"].cat.reorder_categories(customized_categories_order)
        result = crosstab(df["First"], df["Second"], margins=True)

        expected_index = Index(["C", "A", "B", "All"], name="First")
        expected_columns = Index(["A", "B", "C", "All"], name="Second")
        expected_data = [[1, 1, 0, 2], [0, 1, 0, 1], [0, 1, 2, 3], [1, 3, 2, 6]]
        expected = DataFrame(
            expected_data, index=expected_index, columns=expected_columns
        )
        tm.assert_frame_equal(result, expected)


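# With categorical inputs and dropna=False, categories that are never observed
# must still appear in the result so the margins add up.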
@pytest.mark.parametrize("a_dtype", ["category", "int64"])
|
||||
@pytest.mark.parametrize("b_dtype", ["category", "int64"])
|
||||
def test_categoricals(a_dtype, b_dtype):
|
||||
# https://github.com/pandas-dev/pandas/issues/37465
|
||||
g = np.random.default_rng(2)
|
||||
a = Series(g.integers(0, 3, size=100)).astype(a_dtype)
|
||||
b = Series(g.integers(0, 2, size=100)).astype(b_dtype)
|
||||
result = crosstab(a, b, margins=True, dropna=False)
|
||||
columns = Index([0, 1, "All"], dtype="object", name="col_0")
|
||||
index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
|
||||
values = [[10, 18, 28], [23, 16, 39], [17, 16, 33], [50, 50, 100]]
|
||||
expected = DataFrame(values, index, columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Verify when categorical does not have all values present
|
||||
a.loc[a == 1] = 2
|
||||
a_is_cat = isinstance(a.dtype, CategoricalDtype)
|
||||
assert not a_is_cat or a.value_counts().loc[1] == 0
|
||||
result = crosstab(a, b, margins=True, dropna=False)
|
||||
values = [[10, 18, 28], [0, 0, 0], [40, 32, 72], [50, 50, 100]]
|
||||
expected = DataFrame(values, index, columns)
|
||||
if not a_is_cat:
|
||||
expected = expected.loc[[0, 2, "All"]]
|
||||
expected["All"] = expected["All"].astype("int64")
|
||||
tm.assert_frame_equal(result, expected)
|
lib/python3.13/site-packages/pandas/tests/reshape/test_cut.py
@ -0,0 +1,791 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    DatetimeIndex,
    Index,
    Interval,
    IntervalIndex,
    Series,
    TimedeltaIndex,
    Timestamp,
    cut,
    date_range,
    interval_range,
    isna,
    qcut,
    timedelta_range,
    to_datetime,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype
import pandas.core.reshape.tile as tmod


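# labels=False returns integer bin codes rather than Interval categories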
def test_simple():
    data = np.ones(5, dtype="int64")
    result = cut(data, 4, labels=False)

    expected = np.array([1, 1, 1, 1, 1])
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize("func", [list, np.array])
def test_bins(func):
    data = func([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
    result, bins = cut(data, 3, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    intervals = intervals.take([0, 0, 0, 1, 2, 0])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))


def test_right():
    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=True, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    expected = Categorical(intervals, ordered=True)
    expected = expected.take([0, 0, 0, 2, 3, 0, 0])

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))


def test_no_right():
    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=False, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
    intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))


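# bins passed as an IntervalIndex are reused as-is for the result's categories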
def test_bins_from_interval_index():
    c = cut(range(5), 3)
    expected = c
    result = cut(range(5), bins=expected.categories)
    tm.assert_categorical_equal(result, expected)

    expected = Categorical.from_codes(
        np.append(c.codes, -1), categories=c.categories, ordered=True
    )
    result = cut(range(6), bins=expected.categories)
    tm.assert_categorical_equal(result, expected)


def test_bins_from_interval_index_doc_example():
    # Make sure we preserve the bins.
    ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
    c = cut(ages, bins=[0, 18, 35, 70])
    expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
    tm.assert_index_equal(c.categories, expected)

    result = cut([25, 20, 50], bins=c.categories)
    tm.assert_index_equal(result.categories, expected)
    tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))


def test_bins_not_overlapping_from_interval_index():
    # see gh-23980
    msg = "Overlapping IntervalIndex is not accepted"
    ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])

    with pytest.raises(ValueError, match=msg):
        cut([5, 6], bins=ii)


def test_bins_not_monotonic():
    msg = "bins must increase monotonically"
    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0.1, 1.5, 1, 10])


@pytest.mark.parametrize(
    "x, bins, expected",
    [
        (
            date_range("2017-12-31", periods=3),
            [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
            IntervalIndex.from_tuples(
                [
                    (Timestamp.min, Timestamp("2018-01-01")),
                    (Timestamp("2018-01-01"), Timestamp.max),
                ]
            ),
        ),
        (
            [-1, 0, 1],
            np.array(
                [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
            ),
            IntervalIndex.from_tuples(
                [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
            ),
        ),
        (
            [
                np.timedelta64(-1, "ns"),
                np.timedelta64(0, "ns"),
                np.timedelta64(1, "ns"),
            ],
            np.array(
                [
                    np.timedelta64(-np.iinfo(np.int64).max, "ns"),
                    np.timedelta64(0, "ns"),
                    np.timedelta64(np.iinfo(np.int64).max, "ns"),
                ]
            ),
            IntervalIndex.from_tuples(
                [
                    (
                        np.timedelta64(-np.iinfo(np.int64).max, "ns"),
                        np.timedelta64(0, "ns"),
                    ),
                    (
                        np.timedelta64(0, "ns"),
                        np.timedelta64(np.iinfo(np.int64).max, "ns"),
                    ),
                ]
            ),
        ),
    ],
)
def test_bins_monotonic_not_overflowing(x, bins, expected):
    # GH 26045
    result = cut(x, bins)
    tm.assert_index_equal(result.categories, expected)


def test_wrong_num_labels():
    msg = "Bin labels must be one fewer than the number of bin edges"
    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])


@pytest.mark.parametrize(
    "x,bins,msg",
    [
        ([], 2, "Cannot cut empty array"),
        ([1, 2, 3], 0.5, "`bins` should be a positive integer"),
    ],
)
def test_cut_corner(x, bins, msg):
    with pytest.raises(ValueError, match=msg):
        cut(x, bins)


@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
    msg = "Input array must be 1 dimensional"
    with pytest.raises(ValueError, match=msg):
        cut_func(arg, 2)


@pytest.mark.parametrize(
    "data",
    [
        [0, 1, 2, 3, 4, np.inf],
        [-np.inf, 0, 1, 2, 3, 4],
        [-np.inf, 0, 1, 2, 3, 4, np.inf],
    ],
)
def test_int_bins_with_inf(data):
    # GH 24314
    msg = "cannot specify integer `bins` when input data contains infinity"
    with pytest.raises(ValueError, match=msg):
        cut(data, bins=3)


def test_cut_out_of_range_more():
    # see gh-1511
    name = "x"

    ser = Series([0, -1, 0, 1, -3], name=name)
    ind = cut(ser, [0, 1], labels=False)

    exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
    tm.assert_series_equal(ind, exp)


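# right=True yields right-closed intervals; right=False yields left-closed ones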
@pytest.mark.parametrize(
    "right,breaks,closed",
    [
        (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
        (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
    ],
)
def test_labels(right, breaks, closed):
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    result, bins = cut(arr, 4, retbins=True, right=right)
    ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
    tm.assert_index_equal(result.categories, ex_levels)


def test_cut_pass_series_name_to_factor():
    name = "foo"
    ser = Series(np.random.default_rng(2).standard_normal(100), name=name)

    factor = cut(ser, 4)
    assert factor.name == name


def test_label_precision():
    arr = np.arange(0, 0.73, 0.01)
    result = cut(arr, 4, precision=2)

    ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
    tm.assert_index_equal(result.categories, ex_levels)


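# NaNs in the input stay NaN in the output, whatever the label scheme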
@pytest.mark.parametrize("labels", [None, False])
|
||||
def test_na_handling(labels):
|
||||
arr = np.arange(0, 0.75, 0.01)
|
||||
arr[::3] = np.nan
|
||||
|
||||
result = cut(arr, 4, labels=labels)
|
||||
result = np.asarray(result)
|
||||
|
||||
expected = np.where(isna(arr), np.nan, result)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_handling():
|
||||
data = np.arange(6)
|
||||
data_ser = Series(data, dtype="int64")
|
||||
|
||||
bins = [-np.inf, 2, 4, np.inf]
|
||||
result = cut(data, bins)
|
||||
result_ser = cut(data_ser, bins)
|
||||
|
||||
ex_uniques = IntervalIndex.from_breaks(bins)
|
||||
tm.assert_index_equal(result.categories, ex_uniques)
|
||||
|
||||
assert result[5] == Interval(4, np.inf)
|
||||
assert result[0] == Interval(-np.inf, 2)
|
||||
assert result_ser[5] == Interval(4, np.inf)
|
||||
assert result_ser[0] == Interval(-np.inf, 2)
|
||||
|
||||
|
||||
def test_cut_out_of_bounds():
|
||||
arr = np.random.default_rng(2).standard_normal(100)
|
||||
result = cut(arr, [-1, 0, 1])
|
||||
|
||||
mask = isna(result)
|
||||
ex_mask = (arr < -1) | (arr > 1)
|
||||
tm.assert_numpy_array_equal(mask, ex_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"get_labels,get_expected",
|
||||
[
|
||||
(
|
||||
lambda labels: labels,
|
||||
lambda labels: Categorical(
|
||||
["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
|
||||
categories=labels,
|
||||
ordered=True,
|
||||
),
|
||||
),
|
||||
(
|
||||
lambda labels: Categorical.from_codes([0, 1, 2], labels),
|
||||
lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_cut_pass_labels(get_labels, get_expected):
|
||||
bins = [0, 25, 50, 100]
|
||||
arr = [50, 5, 10, 15, 20, 30, 70]
|
||||
labels = ["Small", "Medium", "Large"]
|
||||
|
||||
result = cut(arr, bins, labels=get_labels(labels))
|
||||
tm.assert_categorical_equal(result, get_expected(labels))
|
||||
|
||||
|
||||
def test_cut_pass_labels_compat():
|
||||
# see gh-16459
|
||||
arr = [50, 5, 10, 15, 20, 30, 70]
|
||||
labels = ["Good", "Medium", "Bad"]
|
||||
|
||||
result = cut(arr, 3, labels=labels)
|
||||
exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
|
||||
tm.assert_categorical_equal(result, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
|
||||
def test_round_frac_just_works(x):
|
||||
# It works.
|
||||
cut(x, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"val,precision,expected",
|
||||
[
|
||||
(-117.9998, 3, -118),
|
||||
(117.9998, 3, 118),
|
||||
(117.9998, 2, 118),
|
||||
(0.000123456, 2, 0.00012),
|
||||
],
|
||||
)
|
||||
def test_round_frac(val, precision, expected):
|
||||
# see gh-1979
|
||||
result = tmod._round_frac(val, precision=precision)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_cut_return_intervals():
|
||||
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
|
||||
result = cut(ser, 3)
|
||||
|
||||
exp_bins = np.linspace(0, 8, num=4).round(3)
|
||||
exp_bins[0] -= 0.008
|
||||
|
||||
expected = Series(
|
||||
IntervalIndex.from_breaks(exp_bins, closed="right").take(
|
||||
[0, 0, 0, 1, 1, 1, 2, 2, 2]
|
||||
)
|
||||
).astype(CategoricalDtype(ordered=True))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_series_ret_bins():
|
||||
# see gh-8589
|
||||
ser = Series(np.arange(4))
|
||||
result, bins = cut(ser, 2, retbins=True)
|
||||
|
||||
expected = Series(
|
||||
IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
|
||||
).astype(CategoricalDtype(ordered=True))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,msg",
|
||||
[
|
||||
({"duplicates": "drop"}, None),
|
||||
({}, "Bin edges must be unique"),
|
||||
({"duplicates": "raise"}, "Bin edges must be unique"),
|
||||
({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
|
||||
],
|
||||
)
|
||||
def test_cut_duplicates_bin(kwargs, msg):
|
||||
# see gh-20947
|
||||
bins = [0, 2, 4, 6, 10, 10]
|
||||
values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
|
||||
|
||||
if msg is not None:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
cut(values, bins, **kwargs)
|
||||
else:
|
||||
result = cut(values, bins, **kwargs)
|
||||
expected = cut(values, pd.unique(np.asarray(bins)))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
|
||||
@pytest.mark.parametrize("length", [1, 2])
|
||||
def test_single_bin(data, length):
|
||||
# see gh-14652, gh-15428
|
||||
ser = Series([data] * length)
|
||||
result = cut(ser, 1, labels=False)
|
||||
|
||||
expected = Series([0] * length, dtype=np.intp)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
|
||||
)
|
||||
def test_cut_read_only(array_1_writeable, array_2_writeable):
|
||||
# issue 18773
|
||||
array_1 = np.arange(0, 100, 10)
|
||||
array_1.flags.writeable = array_1_writeable
|
||||
|
||||
array_2 = np.arange(0, 100, 10)
|
||||
array_2.flags.writeable = array_2_writeable
|
||||
|
||||
hundred_elements = np.arange(100)
|
||||
tm.assert_categorical_equal(
|
||||
cut(hundred_elements, array_1), cut(hundred_elements, array_2)
|
||||
)
|
||||
|
||||
|
||||
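# datetime bin edges may be given in any datetime-like representation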
@pytest.mark.parametrize(
    "conv",
    [
        lambda v: Timestamp(v),
        lambda v: to_datetime(v),
        lambda v: np.datetime64(v),
        lambda v: Timestamp(v).to_pydatetime(),
    ],
)
def test_datetime_bin(conv):
    data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
    bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]

    expected = Series(
        IntervalIndex(
            [
                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
            ]
        )
    ).astype(CategoricalDtype(ordered=True))

    bins = [conv(v) for v in bin_data]
    result = Series(cut(data, bins=bins))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("box", [Series, Index, np.array, list])
def test_datetime_cut(unit, box):
    # see gh-14714
    #
    # Testing time data when it comes in various collection types.
    data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
    data = box(data)
    result, _ = cut(data, 3, retbins=True)

    if box is list:
        # We don't (yet) do inference on these, so get nanos
        unit = "ns"

    if unit == "s":
        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
        # for why we round to 8 seconds instead of 7
        left = DatetimeIndex(
            ["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
            dtype=f"M8[{unit}]",
        )
    else:
        left = DatetimeIndex(
            [
                "2012-12-31 23:57:07.200000",
                "2013-01-01 16:00:00",
                "2013-01-02 08:00:00",
            ],
            dtype=f"M8[{unit}]",
        )
    right = DatetimeIndex(
        ["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
        dtype=f"M8[{unit}]",
    )

    exp_intervals = IntervalIndex.from_arrays(left, right)
    expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(Series(result), expected)


@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut_mismatched_tzawareness(box):
    # GH#54964
    bins = box(
        [
            Timestamp("2013-01-01 04:57:07.200000"),
            Timestamp("2013-01-01 21:00:00"),
            Timestamp("2013-01-02 13:00:00"),
            Timestamp("2013-01-03 05:00:00"),
        ]
    )
    ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))

    msg = "Cannot use timezone-naive bins with timezone-aware values"
    with pytest.raises(ValueError, match=msg):
        cut(ser, bins)


@pytest.mark.parametrize(
    "bins",
    [
        3,
        [
            Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
        ],
    ],
)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
    # see gh-19872
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz))

    if not isinstance(bins, int):
        bins = box(bins)

    result = cut(ser, bins)
    expected = Series(
        IntervalIndex(
            [
                Interval(
                    Timestamp("2012-12-31 23:57:07.200000", tz=tz),
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                    Timestamp("2013-01-03 00:00:00", tz=tz),
                ),
            ]
        )
    ).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(result, expected)


def test_datetime_nan_error():
    msg = "bins must be of datetime64 dtype"

    with pytest.raises(ValueError, match=msg):
        cut(date_range("20130101", periods=3), bins=[0, 2, 4])


def test_datetime_nan_mask():
    result = cut(
        date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
    )

    mask = result.categories.isna()
    tm.assert_numpy_array_equal(mask, np.array([False]))

    mask = result.isna()
    tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))


@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz, unit):
    # see gh-19891
    ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    if unit == "s":
        # TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
        # the first entry here raises in array_to_datetime. Should truncate
        # instead of raising?
        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
        # for why we round to 8 seconds instead of 7
        expected_bins = DatetimeIndex(
            ["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
            dtype=f"M8[{unit}]",
        )
    else:
        expected_bins = DatetimeIndex(
            [
                "2017-12-31 23:57:07.200000",
                "2018-01-02 00:00:00",
                "2018-01-03 00:00:00",
            ],
            dtype=f"M8[{unit}]",
        )
    expected_bins = expected_bins.tz_localize(tz)
    tm.assert_index_equal(result_bins, expected_bins)


def test_timedelta_cut_roundtrip():
    # see gh-19891
    ser = Series(timedelta_range("1day", periods=3))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    expected_bins = TimedeltaIndex(
        ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
    )
    tm.assert_index_equal(result_bins, expected_bins)


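# boolean input is coerced to 0/1 integers before the bin edges are computed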
@pytest.mark.parametrize("bins", [6, 7])
|
||||
@pytest.mark.parametrize(
|
||||
"box, compare",
|
||||
[
|
||||
(Series, tm.assert_series_equal),
|
||||
(np.array, tm.assert_categorical_equal),
|
||||
(list, tm.assert_equal),
|
||||
],
|
||||
)
|
||||
def test_cut_bool_coercion_to_int(bins, box, compare):
|
||||
# issue 20303
|
||||
data_expected = box([0, 1, 1, 0, 1] * 10)
|
||||
data_result = box([False, True, True, False, True] * 10)
|
||||
expected = cut(data_expected, bins, duplicates="drop")
|
||||
result = cut(data_result, bins, duplicates="drop")
|
||||
compare(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", ["foo", 1, True])
|
||||
def test_cut_incorrect_labels(labels):
|
||||
# GH 13318
|
||||
values = range(5)
|
||||
msg = "Bin labels must either be False, None or passed in as a list-like argument"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
cut(values, 4, labels=labels)
|
||||
|
||||
|
||||
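# nullable Int64 input should bin exactly like the equivalent float array with NaNs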
@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
|
||||
@pytest.mark.parametrize("right", [True, False])
|
||||
@pytest.mark.parametrize("include_lowest", [True, False])
|
||||
def test_cut_nullable_integer(bins, right, include_lowest):
|
||||
a = np.random.default_rng(2).integers(0, 10, size=50).astype(float)
|
||||
a[::2] = np.nan
|
||||
result = cut(
|
||||
pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest
|
||||
)
|
||||
expected = cut(a, bins, right=right, include_lowest=include_lowest)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, bins, labels, expected_codes, expected_labels",
|
||||
[
|
||||
([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
|
||||
([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
|
||||
],
|
||||
)
|
||||
def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
|
||||
# GH 33141
|
||||
result = cut(data, bins=bins, labels=labels, ordered=False)
|
||||
expected = Categorical.from_codes(
|
||||
expected_codes, categories=expected_labels, ordered=False
|
||||
)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, bins, labels, expected_codes, expected_labels",
|
||||
[
|
||||
([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
|
||||
([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
|
||||
],
|
||||
)
|
||||
def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
|
||||
# GH 33141
|
||||
result = cut(data, bins=bins, labels=labels, ordered=False)
|
||||
expected = Categorical.from_codes(
|
||||
expected_codes, categories=expected_labels, ordered=False
|
||||
)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_unordered_with_missing_labels_raises_error():
|
||||
# GH 33141
|
||||
msg = "'labels' must be provided if 'ordered = False'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
cut([0.5, 3], bins=[0, 1, 2], ordered=False)
|
||||
|
||||
|
||||
def test_cut_unordered_with_series_labels():
|
||||
# https://github.com/pandas-dev/pandas/issues/36603
|
||||
ser = Series([1, 2, 3, 4, 5])
|
||||
bins = Series([0, 2, 4, 6])
|
||||
labels = Series(["a", "b", "c"])
|
||||
result = cut(ser, bins=bins, labels=labels, ordered=False)
|
||||
expected = Series(["a", "a", "b", "b", "c"], dtype="category")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_no_warnings():
|
||||
df = DataFrame({"value": np.random.default_rng(2).integers(0, 100, 20)})
|
||||
labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
|
||||
with tm.assert_produces_warning(False):
|
||||
df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
|
||||
|
||||
|
||||
def test_cut_with_duplicated_index_lowest_included():
|
||||
# GH 42185
|
||||
expected = Series(
|
||||
[Interval(-0.001, 2, closed="right")] * 3
|
||||
+ [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")],
|
||||
index=[0, 1, 2, 3, 0],
|
||||
dtype="category",
|
||||
).cat.as_ordered()
|
||||
|
||||
ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
|
||||
result = cut(ser, bins=[0, 2, 4], include_lowest=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_with_nonexact_categorical_indices():
|
||||
# GH 42424
|
||||
|
||||
ser = Series(range(100))
|
||||
ser1 = cut(ser, 10).value_counts().head(5)
|
||||
ser2 = cut(ser, 10).value_counts().tail(5)
|
||||
result = DataFrame({"1": ser1, "2": ser2})
|
||||
|
||||
index = pd.CategoricalIndex(
|
||||
[
|
||||
Interval(-0.099, 9.9, closed="right"),
|
||||
Interval(9.9, 19.8, closed="right"),
|
||||
Interval(19.8, 29.7, closed="right"),
|
||||
Interval(29.7, 39.6, closed="right"),
|
||||
Interval(39.6, 49.5, closed="right"),
|
||||
Interval(49.5, 59.4, closed="right"),
|
||||
Interval(59.4, 69.3, closed="right"),
|
||||
Interval(69.3, 79.2, closed="right"),
|
||||
Interval(79.2, 89.1, closed="right"),
|
||||
Interval(89.1, 99, closed="right"),
|
||||
],
|
||||
ordered=True,
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
def test_cut_with_timestamp_tuple_labels():
|
||||
# GH 40661
|
||||
labels = [(Timestamp(10),), (Timestamp(20),), (Timestamp(30),)]
|
||||
result = cut([2, 4, 6], bins=[1, 3, 5, 7], labels=labels)
|
||||
|
||||
expected = Categorical.from_codes([0, 1, 2], labels, ordered=True)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_bins_datetime_intervalindex():
|
||||
# https://github.com/pandas-dev/pandas/issues/46218
|
||||
bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
|
||||
# passing Series instead of list is important to trigger bug
|
||||
result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
|
||||
expected = Categorical.from_codes([0], bins, ordered=True)
|
||||
tm.assert_categorical_equal(result.array, expected)
|
||||
|
||||
|
||||
def test_cut_with_nullable_int64():
|
||||
# GH 30787
|
||||
series = Series([0, 1, 2, 3, 4, pd.NA, 6, 7], dtype="Int64")
|
||||
bins = [0, 2, 4, 6, 8]
|
||||
intervals = IntervalIndex.from_breaks(bins)
|
||||
|
||||
expected = Series(
|
||||
Categorical.from_codes([-1, 0, 0, 1, 1, -1, 2, 3], intervals, ordered=True)
|
||||
)
|
||||
|
||||
result = cut(series, bins=bins)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
@ -0,0 +1,447 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Series,
    from_dummies,
    get_dummies,
)
import pandas._testing as tm


@pytest.fixture
def dummies_basic():
    return DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [0, 1, 0],
            "col2_a": [0, 1, 0],
            "col2_b": [1, 0, 0],
            "col2_c": [0, 0, 1],
        },
    )


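# in this fixture, row 2 has no col1 dummy set and row 0 has no col2 dummy set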
@pytest.fixture
def dummies_with_unassigned():
    return DataFrame(
        {
            "col1_a": [1, 0, 0],
            "col1_b": [0, 1, 0],
            "col2_a": [0, 1, 0],
            "col2_b": [0, 0, 0],
            "col2_c": [0, 0, 1],
        },
    )


def test_error_wrong_data_type():
    dummies = [0, 1, 0]
    with pytest.raises(
        TypeError,
        match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list",
    ):
        from_dummies(dummies)


def test_error_no_prefix_contains_unassigned():
    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    with pytest.raises(
        ValueError,
        match=(
            r"Dummy DataFrame contains unassigned value\(s\); "
            r"First instance in row: 2"
        ),
    ):
        from_dummies(dummies)


def test_error_no_prefix_wrong_default_category_type():
    dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(
        TypeError,
        match=(
            r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
            r"Received 'default_category' of type: list"
        ),
    ):
        from_dummies(dummies, default_category=["c", "d"])


def test_error_no_prefix_multi_assignment():
    dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(
        ValueError,
        match=(
            r"Dummy DataFrame contains multi-assignment\(s\); "
            r"First instance in row: 2"
        ),
    ):
        from_dummies(dummies)


def test_error_no_prefix_contains_nan():
    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
    with pytest.raises(
        ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'"
    ):
        from_dummies(dummies)


def test_error_contains_non_dummies():
    dummies = DataFrame(
        {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
    )
    with pytest.raises(
        TypeError,
        match=r"Passed DataFrame contains non-dummy data",
    ):
        from_dummies(dummies)


def test_error_with_prefix_multiple_separators():
    dummies = DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [0, 1, 0],
            "col2-a": [0, 1, 0],
            "col2-b": [1, 0, 1],
        },
    )
    with pytest.raises(
        ValueError,
        match=(r"Separator not specified for column: col2-a"),
    ):
        from_dummies(dummies, sep="_")


def test_error_with_prefix_sep_wrong_type(dummies_basic):
    with pytest.raises(
        TypeError,
        match=(
            r"Expected 'sep' to be of type 'str' or 'None'; "
            r"Received 'sep' of type: list"
        ),
    ):
        from_dummies(dummies_basic, sep=["_"])


def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
    with pytest.raises(
        ValueError,
        match=(
            r"Dummy DataFrame contains unassigned value\(s\); "
            r"First instance in row: 2"
        ),
    ):
        from_dummies(dummies_with_unassigned, sep="_")


def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
    with pytest.raises(
        TypeError,
        match=(
            r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
            r"Received 'default_category' of type: list"
        ),
    ):
        from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])


def test_error_with_prefix_default_category_dict_not_complete(
    dummies_with_unassigned,
):
    with pytest.raises(
        ValueError,
        match=(
            r"Length of 'default_category' \(1\) did not match "
            r"the length of the columns being encoded \(2\)"
        ),
    ):
        from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"})


def test_error_with_prefix_contains_nan(dummies_basic):
    # Set float64 dtype to avoid upcast when setting np.nan
    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype("float64")
    dummies_basic.loc[2, "col2_c"] = np.nan
    with pytest.raises(
        ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
    ):
        from_dummies(dummies_basic, sep="_")


def test_error_with_prefix_contains_non_dummies(dummies_basic):
    # Set object dtype to avoid upcast when setting "str"
    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype(object)
    dummies_basic.loc[2, "col2_c"] = "str"
    with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
        from_dummies(dummies_basic, sep="_")


def test_error_with_prefix_double_assignment():
    dummies = DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [1, 1, 0],
            "col2_a": [0, 1, 0],
            "col2_b": [1, 0, 0],
            "col2_c": [0, 0, 1],
        },
    )
    with pytest.raises(
        ValueError,
        match=(
            r"Dummy DataFrame contains multi-assignment\(s\); "
            r"First instance in row: 0"
        ),
    ):
        from_dummies(dummies, sep="_")


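# round-trips: from_dummies should invert get_dummies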
def test_roundtrip_series_to_dataframe():
    categories = Series(["a", "b", "c", "a"])
    dummies = get_dummies(categories)
    result = from_dummies(dummies)
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    tm.assert_frame_equal(result, expected)


def test_roundtrip_single_column_dataframe():
    categories = DataFrame({"": ["a", "b", "c", "a"]})
    dummies = get_dummies(categories)
    result = from_dummies(dummies, sep="_")
    expected = categories
    tm.assert_frame_equal(result, expected)


def test_roundtrip_with_prefixes():
    categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
    dummies = get_dummies(categories)
    result = from_dummies(dummies, sep="_")
    expected = categories
    tm.assert_frame_equal(result, expected)


def test_no_prefix_string_cats_basic():
    dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_string_cats_basic_bool_values():
    dummies = DataFrame(
        {
            "a": [True, False, False, True],
            "b": [False, True, False, False],
            "c": [False, False, True, False],
        }
    )
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_string_cats_basic_mixed_bool_values():
    dummies = DataFrame(
        {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]}
    )
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_int_cats_basic():
    dummies = DataFrame(
        {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
    )
    expected = DataFrame({"": [1, 25, 2, 5]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_float_cats_basic():
    dummies = DataFrame(
        {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
    )
    expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_mixed_cats_basic():
    dummies = DataFrame(
        {
            1.23: [1, 0, 0, 0, 0],
            "c": [0, 1, 0, 0, 0],
            2: [0, 0, 1, 0, 0],
            False: [0, 0, 0, 1, 0],
            None: [0, 0, 0, 0, 1],
        }
    )
    expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
    expected = DataFrame({"": ["a", "b", "NaN"]})
    result = from_dummies(dummies)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "default_category, expected",
    [
        pytest.param(
            "c",
            DataFrame({"": ["a", "b", "c"]}),
            id="default_category is a str",
        ),
        pytest.param(
            1,
            DataFrame({"": ["a", "b", 1]}),
            id="default_category is a int",
        ),
        pytest.param(
            1.25,
            DataFrame({"": ["a", "b", 1.25]}),
            id="default_category is a float",
        ),
        pytest.param(
            0,
            DataFrame({"": ["a", "b", 0]}),
            id="default_category is a 0",
        ),
        pytest.param(
            False,
            DataFrame({"": ["a", "b", False]}),
            id="default_category is a bool",
        ),
        pytest.param(
            (1, 2),
            DataFrame({"": ["a", "b", (1, 2)]}),
            id="default_category is a tuple",
        ),
    ],
)
def test_no_prefix_string_cats_default_category(
    default_category, expected, using_infer_string
):
    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    result = from_dummies(dummies, default_category=default_category)
    if using_infer_string:
        expected[""] = expected[""].astype("string[pyarrow_numpy]")
    tm.assert_frame_equal(result, expected)


def test_with_prefix_basic(dummies_basic):
    expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
    result = from_dummies(dummies_basic, sep="_")
    tm.assert_frame_equal(result, expected)


def test_with_prefix_contains_get_dummies_NaN_column():
    dummies = DataFrame(
        {
            "col1_a": [1, 0, 0],
            "col1_b": [0, 1, 0],
            "col1_NaN": [0, 0, 1],
            "col2_a": [0, 1, 0],
            "col2_b": [0, 0, 0],
            "col2_c": [0, 0, 1],
            "col2_NaN": [1, 0, 0],
        },
    )
    expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
    result = from_dummies(dummies, sep="_")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "default_category, expected",
    [
        pytest.param(
            "x",
            DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}),
            id="default_category is a str",
        ),
        pytest.param(
            0,
            DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}),
            id="default_category is a 0",
        ),
        pytest.param(
            False,
            DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}),
            id="default_category is a False",
        ),
        pytest.param(
            {"col2": 1, "col1": 2.5},
            DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}),
            id="default_category is a dict with int and float values",
        ),
        pytest.param(
            {"col2": None, "col1": False},
            DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}),
            id="default_category is a dict with bool and None values",
        ),
        pytest.param(
            {"col2": (1, 2), "col1": [1.25, False]},
            DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}),
            id="default_category is a dict with list and tuple values",
        ),
    ],
)
def test_with_prefix_default_category(
    dummies_with_unassigned, default_category, expected
):
    result = from_dummies(
        dummies_with_unassigned, sep="_", default_category=default_category
    )
    tm.assert_frame_equal(result, expected)


def test_ea_categories():
    # GH 54300
    df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
    df.columns = df.columns.astype("string[python]")
    result = from_dummies(df)
    expected = DataFrame({"": Series(list("abca"), dtype="string[python]")})
    tm.assert_frame_equal(result, expected)


def test_ea_categories_with_sep():
    # GH 54300
    df = DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [0, 1, 0],
            "col2_a": [0, 1, 0],
            "col2_b": [1, 0, 0],
            "col2_c": [0, 0, 1],
        }
    )
    df.columns = df.columns.astype("string[python]")
    result = from_dummies(df, sep="_")
    expected = DataFrame(
        {
            "col1": Series(list("aba"), dtype="string[python]"),
            "col2": Series(list("bac"), dtype="string[python]"),
        }
    )
    expected.columns = expected.columns.astype("string[python]")
    tm.assert_frame_equal(result, expected)


def test_maintain_original_index():
    # GH 54300
    df = DataFrame(
        {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd")
    )
    result = from_dummies(df)
    expected = DataFrame({"": list("abca")}, index=list("abcd"))
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,743 @@
import re
import unicodedata

import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer_dtype

import pandas as pd
from pandas import (
    ArrowDtype,
    Categorical,
    CategoricalDtype,
    CategoricalIndex,
    DataFrame,
    Index,
    RangeIndex,
    Series,
    SparseDtype,
    get_dummies,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray

try:
    import pyarrow as pa
except ImportError:
    pa = None


class TestGetDummies:
    @pytest.fixture
    def df(self):
        return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})

    @pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
    def dtype(self, request):
        return np.dtype(request.param)

    @pytest.fixture(params=["dense", "sparse"])
    def sparse(self, request):
        # params are strings to simplify reading test results,
        # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
        return request.param == "sparse"

    def effective_dtype(self, dtype):
        if dtype is None:
            return np.uint8
        return dtype

    def test_get_dummies_raises_on_dtype_object(self, df):
        msg = "dtype=object is not a valid dtype for get_dummies"
        with pytest.raises(ValueError, match=msg):
            get_dummies(df, dtype="object")

    def test_get_dummies_basic(self, sparse, dtype):
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
            dtype=self.effective_dtype(dtype),
        )
        if sparse:
            if dtype.kind == "b":
                expected = expected.apply(SparseArray, fill_value=False)
            else:
                expected = expected.apply(SparseArray, fill_value=0.0)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string):
        # GH 10531
        s_list = list("abc")
        s_series = Series(s_list)
        s_df = DataFrame(
            {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
        )

        expected = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
            dtype=self.effective_dtype(dtype),
            columns=list("abc"),
        )
        if sparse:
            if is_integer_dtype(dtype):
                fill_value = 0
            elif dtype == bool:
                fill_value = False
            else:
                fill_value = 0.0

            expected = expected.apply(SparseArray, fill_value=fill_value)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
        if sparse:
            dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
        else:
            dtype_name = self.effective_dtype(dtype).name

        expected = Series({dtype_name: 8}, name="count")
        result = result.dtypes.value_counts()
        result.index = [str(i) for i in result.index]
        tm.assert_series_equal(result, expected)

        result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)

        key = "string" if using_infer_string else "object"
        expected_counts = {"int64": 1, key: 1}
        expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)

        expected = Series(expected_counts, name="count").sort_index()
        result = result.dtypes.value_counts()
        result.index = [str(i) for i in result.index]
        result = result.sort_index()
        tm.assert_series_equal(result, expected)

    def test_get_dummies_just_na(self, sparse):
        just_na_list = [np.nan]
        just_na_series = Series(just_na_list)
        just_na_series_index = Series(just_na_list, index=["A"])

        res_list = get_dummies(just_na_list, sparse=sparse)
        res_series = get_dummies(just_na_series, sparse=sparse)
        res_series_index = get_dummies(just_na_series_index, sparse=sparse)

        assert res_list.empty
        assert res_series.empty
        assert res_series_index.empty

        assert res_list.index.tolist() == [0]
        assert res_series.index.tolist() == [0]
        assert res_series_index.index.tolist() == ["A"]

    def test_get_dummies_include_na(self, sparse, dtype):
        s = ["a", "b", np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
        )
        if sparse:
            if dtype.kind == "b":
                exp = exp.apply(SparseArray, fill_value=False)
            else:
                exp = exp.apply(SparseArray, fill_value=0.0)
        tm.assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame(
            {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
            dtype=self.effective_dtype(dtype),
        )
        exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            if dtype.kind == "b":
                exp_na = exp_na.apply(SparseArray, fill_value=False)
            else:
                exp_na = exp_na.apply(SparseArray, fill_value=0.0)
        tm.assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(
            Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
        )
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)

    def test_get_dummies_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        e = "e"
        eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix="letter", sparse=sparse)
        exp = DataFrame(
            {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
        )
        if sparse:
            exp = exp.apply(SparseArray, fill_value=False)
        tm.assert_frame_equal(res, exp)

    def test_dataframe_dummies_all_obj(self, df, sparse):
        df = df[["A", "B"]]
        result = get_dummies(df, sparse=sparse)
        expected = DataFrame(
            {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
            dtype=bool,
        )
        if sparse:
            expected = DataFrame(
                {
                    "A_a": SparseArray([1, 0, 1], dtype="bool"),
                    "A_b": SparseArray([0, 1, 0], dtype="bool"),
                    "B_b": SparseArray([1, 1, 0], dtype="bool"),
                    "B_c": SparseArray([0, 0, 1], dtype="bool"),
                }
            )

        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
        # GH44965
        df = df[["A", "B"]]
        df = df.astype({"A": "object", "B": "string"})
        result = get_dummies(df)
        expected = DataFrame(
            {
                "A_a": [1, 0, 1],
                "A_b": [0, 1, 0],
                "B_b": [1, 1, 0],
                "B_c": [0, 0, 1],
            },
            dtype=bool,
        )
        if not using_infer_string:
            # infer_string returns numpy bools
            expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
        result = get_dummies(df, sparse=sparse, dtype=dtype)
        if sparse:
            arr = SparseArray
            if dtype.kind == "b":
                typ = SparseDtype(dtype, False)
            else:
                typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A_a": arr([1, 0, 1], dtype=typ),
                "A_b": arr([0, 1, 0], dtype=typ),
                "B_b": arr([1, 1, 0], dtype=typ),
                "B_c": arr([0, 0, 1], dtype=typ),
            }
        )
        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_list(self, df, sparse):
        prefixes = ["from_A", "from_B"]
        result = get_dummies(df, prefix=prefixes, sparse=sparse)
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "from_A_a": [True, False, True],
                "from_A_b": [False, True, False],
                "from_B_b": [True, True, False],
                "from_B_c": [False, False, True],
            },
        )
        expected[["C"]] = df[["C"]]
        cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
        expected = expected[["C"] + cols]

        typ = SparseArray if sparse else Series
        expected[cols] = expected[cols].apply(lambda x: typ(x))
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_str(self, df, sparse):
        # not that you should do this...
        result = get_dummies(df, prefix="bad", sparse=sparse)
        bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
        expected = DataFrame(
            [
                [1, True, False, True, False],
                [2, False, True, True, False],
                [3, True, False, False, True],
            ],
            columns=["C"] + bad_columns,
        )
        expected = expected.astype({"C": np.int64})
        if sparse:
            # work around astyping & assigning with duplicate columns
            # https://github.com/pandas-dev/pandas/issues/14427
            expected = pd.concat(
                [
                    Series([1, 2, 3], name="C"),
                    Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
                    Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
                    Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
                    Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
                ],
                axis=1,
            )

        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_subset(self, df, sparse):
        result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
        expected = DataFrame(
            {
                "B": ["b", "b", "c"],
                "C": [1, 2, 3],
                "from_A_a": [1, 0, 1],
                "from_A_b": [0, 1, 0],
            },
        )
        cols = expected.columns
        expected[cols[1:]] = expected[cols[1:]].astype(bool)
        expected[["C"]] = df[["C"]]
        if sparse:
            cols = ["from_A_a", "from_A_b"]
            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_sep(self, df, sparse):
        result = get_dummies(df, prefix_sep="..", sparse=sparse)
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A..a": [True, False, True],
                "A..b": [False, True, False],
                "B..b": [True, True, False],
                "B..c": [False, False, True],
            },
        )
        expected[["C"]] = df[["C"]]
        expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
        if sparse:
            cols = ["A..a", "A..b", "B..b", "B..c"]
            expected[cols] = expected[cols].astype(SparseDtype("bool", False))

        tm.assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
        expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
        tm.assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
        msg = re.escape(
            "Length of 'prefix' (1) did not match the length of the columns being "
            "encoded (2)"
        )
        with pytest.raises(ValueError, match=msg):
            get_dummies(df, prefix=["too few"], sparse=sparse)

    def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
        msg = re.escape(
            "Length of 'prefix_sep' (1) did not match the length of the columns being "
            "encoded (2)"
        )
        with pytest.raises(ValueError, match=msg):
            get_dummies(df, prefix_sep=["bad"], sparse=sparse)

    def test_dataframe_dummies_prefix_dict(self, sparse):
        prefixes = {"A": "from_A", "B": "from_B"}
        df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
        result = get_dummies(df, prefix=prefixes, sparse=sparse)

        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "from_A_a": [1, 0, 1],
                "from_A_b": [0, 1, 0],
                "from_B_b": [1, 1, 0],
                "from_B_c": [0, 0, 1],
            }
        )

        columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
        expected[columns] = expected[columns].astype(bool)
        if sparse:
            expected[columns] = expected[columns].astype(SparseDtype("bool", False))

        tm.assert_frame_equal(result, expected)

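    # dummy_na=True adds an explicit NaN indicator column per encoded column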
    def test_dataframe_dummies_with_na(self, df, sparse, dtype):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
            axis=1
        )

        if sparse:
            arr = SparseArray
            if dtype.kind == "b":
                typ = SparseDtype(dtype, False)
            else:
                typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame(
            {
                "C": [1, 2, 3, np.nan],
                "A_a": arr([1, 0, 1, 0], dtype=typ),
                "A_b": arr([0, 1, 0, 0], dtype=typ),
                "A_nan": arr([0, 0, 0, 1], dtype=typ),
                "B_b": arr([1, 1, 0, 0], dtype=typ),
                "B_c": arr([0, 0, 1, 0], dtype=typ),
                "B_nan": arr([0, 0, 0, 1], dtype=typ),
            }
        ).sort_index(axis=1)

        tm.assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
        tm.assert_frame_equal(result, expected)

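    # Illustrative sketch (editor's addition; `_demo_dummy_na` is not part of
    # the vendored suite): dummy_na=True appends an extra indicator column
    # that flags missing values instead of silently dropping them.
    def _demo_dummy_na(self):
        import numpy as np
        from pandas import get_dummies

        # Produces indicator columns for "a", "b" plus a trailing NaN column.
        return get_dummies(["a", "b", np.nan], dummy_na=True)
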
    def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
        df["cat"] = Categorical(["x", "y", "y"])
        result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
        if sparse:
            arr = SparseArray
            if dtype.kind == "b":
                typ = SparseDtype(dtype, False)
            else:
                typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A_a": arr([1, 0, 1], dtype=typ),
                "A_b": arr([0, 1, 0], dtype=typ),
                "B_b": arr([1, 1, 0], dtype=typ),
                "B_c": arr([0, 0, 1], dtype=typ),
                "cat_x": arr([1, 0, 0], dtype=typ),
                "cat_y": arr([0, 1, 1], dtype=typ),
            }
        ).sort_index(axis=1)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "get_dummies_kwargs,expected",
        [
            (
                {"data": DataFrame({"ä": ["a"]})},
                DataFrame({"ä_a": [True]}),
            ),
            (
                {"data": DataFrame({"x": ["ä"]})},
                DataFrame({"x_ä": [True]}),
            ),
            (
                {"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
                DataFrame({"ä_a": [True]}),
            ),
            (
                {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
                DataFrame({"xäa": [True]}),
            ),
        ],
    )
    def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
        # GH22084 get_dummies incorrectly encodes unicode characters
        # in dataframe column names
        result = get_dummies(**get_dummies_kwargs)
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_basic_drop_first(self, sparse):
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        if sparse:
            expected = expected.apply(SparseArray, fill_value=False)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

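    # Illustrative sketch (editor's addition; `_demo_drop_first` is not part
    # of the vendored suite): drop_first removes the first dummy level so the
    # remaining indicators are not perfectly collinear (the dummy-variable
    # trap in regression design matrices).
    def _demo_drop_first(self):
        from pandas import get_dummies

        # ["a", "b", "c"] encodes to only "b" and "c" columns; "a" is
        # recoverable as the all-False row.
        return get_dummies(["a", "b", "c"], drop_first=True)
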
    def test_get_dummies_basic_drop_first_one_level(self, sparse):
        # Test the case where the categorical variable has only one level.
        s_list = list("aaa")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame(index=RangeIndex(3))

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(index=list("ABC"))
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_basic_drop_first_NA(self, sparse):
        # Test NA handling together with drop_first
        s_NA = ["a", "b", np.nan]
        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
        exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
        if sparse:
            exp = exp.apply(SparseArray, fill_value=False)

        tm.assert_frame_equal(res, exp)

        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
        exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
            ["b", np.nan], axis=1
        )
        if sparse:
            exp_na = exp_na.apply(SparseArray, fill_value=False)
        tm.assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies(
            [np.nan], dummy_na=True, drop_first=True, sparse=sparse
        )
        exp_just_na = DataFrame(index=RangeIndex(1))
        tm.assert_frame_equal(res_just_na, exp_just_na)

    def test_dataframe_dummies_drop_first(self, df, sparse):
        df = df[["A", "B"]]
        result = get_dummies(df, drop_first=True, sparse=sparse)
        expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
        if sparse:
            expected = expected.apply(SparseArray, fill_value=False)
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
        df["cat"] = Categorical(["x", "y", "y"])
        result = get_dummies(df, drop_first=True, sparse=sparse)
        expected = DataFrame(
            {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
        )
        cols = ["A_b", "B_c", "cat_y"]
        expected[cols] = expected[cols].astype(bool)
        expected = expected[["C", "A_b", "B_c", "cat_y"]]
        if sparse:
            for col in cols:
                expected[col] = SparseArray(expected[col])
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(
            df, dummy_na=True, drop_first=True, sparse=sparse
        ).sort_index(axis=1)
        expected = DataFrame(
            {
                "C": [1, 2, 3, np.nan],
                "A_b": [0, 1, 0, 0],
                "A_nan": [0, 0, 0, 1],
                "B_c": [0, 0, 1, 0],
                "B_nan": [0, 0, 0, 1],
            }
        )
        cols = ["A_b", "A_nan", "B_c", "B_nan"]
        expected[cols] = expected[cols].astype(bool)
        expected = expected.sort_index(axis=1)
        if sparse:
            for col in cols:
                expected[col] = SparseArray(expected[col])

        tm.assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
        expected = expected[["C", "A_b", "B_c"]]
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_int_int(self):
        data = Series([1, 2, 1])
        result = get_dummies(data)
        expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
        tm.assert_frame_equal(result, expected)

        data = Series(Categorical(["a", "b", "a"]))
        result = get_dummies(data)
        expected = DataFrame(
            [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
        )
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_int_df(self, dtype):
        data = DataFrame(
            {
                "A": [1, 2, 1],
                "B": Categorical(["a", "b", "a"]),
                "C": [1, 2, 1],
                "D": [1.0, 2.0, 1.0],
            }
        )
        columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
        expected = DataFrame(
            [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
            columns=columns,
        )
        expected[columns[2:]] = expected[columns[2:]].astype(dtype)
        result = get_dummies(data, columns=["A", "B"], dtype=dtype)
        tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("ordered", [True, False])
|
||||
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
|
||||
# GH13854
|
||||
cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
|
||||
result = get_dummies(cat, dtype=dtype)
|
||||
|
||||
data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
|
||||
cols = CategoricalIndex(
|
||||
cat.categories, categories=cat.categories, ordered=ordered
|
||||
)
|
||||
expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("sparse", [True, False])
|
||||
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
|
||||
# GH18914
|
||||
df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
|
||||
df = get_dummies(df, columns=["Nation"], sparse=sparse)
|
||||
df2 = df.reindex(columns=["GDP"])
|
||||
|
||||
tm.assert_frame_equal(df[["GDP"]], df2)
|
||||
|
||||
    def test_get_dummies_duplicate_columns(self, df):
        # GH20839
        df.columns = ["A", "A", "A"]
        result = get_dummies(df).sort_index(axis=1)

        expected = DataFrame(
            [
                [1, True, False, True, False],
                [2, False, True, True, False],
                [3, True, False, False, True],
            ],
            columns=["A", "A_a", "A_b", "A_b", "A_c"],
        ).sort_index(axis=1)

        expected = expected.astype({"A": np.int64})

        tm.assert_frame_equal(result, expected)

    def test_get_dummies_all_sparse(self):
        df = DataFrame({"A": [1, 2]})
        result = get_dummies(df, columns=["A"], sparse=True)
        dtype = SparseDtype("bool", False)
        expected = DataFrame(
            {
                "A_1": SparseArray([1, 0], dtype=dtype),
                "A_2": SparseArray([0, 1], dtype=dtype),
            }
        )
        tm.assert_frame_equal(result, expected)

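    # Illustrative sketch (editor's addition; `_demo_sparse_dtypes` is not
    # part of the vendored suite): with sparse=True each dummy column is
    # backed by a SparseArray, so mostly-False indicators store only the
    # True positions.
    def _demo_sparse_dtypes(self):
        from pandas import Series, get_dummies

        # Every column dtype reads as Sparse[bool, False].
        return get_dummies(Series(list("abca")), sparse=True).dtypes
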
@pytest.mark.parametrize("values", ["baz"])
|
||||
def test_get_dummies_with_string_values(self, values):
|
||||
# issue #28383
|
||||
df = DataFrame(
|
||||
{
|
||||
"bar": [1, 2, 3, 4, 5, 6],
|
||||
"foo": ["one", "one", "one", "two", "two", "two"],
|
||||
"baz": ["A", "B", "C", "A", "B", "C"],
|
||||
"zoo": ["x", "y", "z", "q", "w", "t"],
|
||||
}
|
||||
)
|
||||
|
||||
msg = "Input must be a list-like for parameter `columns`"
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
get_dummies(df, columns=values)
|
||||
|
||||
    def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
        # GH#32430
        ser = Series(list("abca"))
        result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
        expected = DataFrame(
            {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
            dtype=any_numeric_ea_and_arrow_dtype,
        )
        tm.assert_frame_equal(result, expected)

    def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
        # GH#32430
        df = DataFrame({"x": list("abca")})
        result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
        expected = DataFrame(
            {"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
            dtype=any_numeric_ea_and_arrow_dtype,
        )
        tm.assert_frame_equal(result, expected)

@td.skip_if_no("pyarrow")
|
||||
def test_get_dummies_ea_dtype(self):
|
||||
# GH#56273
|
||||
for dtype, exp_dtype in [
|
||||
("string[pyarrow]", "boolean"),
|
||||
("string[pyarrow_numpy]", "bool"),
|
||||
(CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"),
|
||||
(CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"),
|
||||
]:
|
||||
df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
|
||||
result = get_dummies(df)
|
||||
expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@td.skip_if_no("pyarrow")
|
||||
def test_get_dummies_arrow_dtype(self):
|
||||
# GH#56273
|
||||
df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1})
|
||||
result = get_dummies(df)
|
||||
expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"name": Series(
|
||||
["a"],
|
||||
dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))),
|
||||
),
|
||||
"x": 1,
|
||||
}
|
||||
)
|
||||
result = get_dummies(df)
|
||||
tm.assert_frame_equal(result, expected)
|
lib/python3.13/site-packages/pandas/tests/reshape/test_melt.py (new file, 1252 lines): diff suppressed because it is too large
lib/python3.13/site-packages/pandas/tests/reshape/test_pivot.py (new file, 2714 lines): diff suppressed because it is too large
@ -0,0 +1,254 @@
import numpy as np
import pytest

from pandas._libs import lib

import pandas as pd
from pandas import (
    Index,
    MultiIndex,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "input_index, input_columns, input_values, "
    "expected_values, expected_columns, expected_index",
    [
        (
            ["lev4"],
            "lev3",
            "values",
            [
                [0.0, np.nan],
                [np.nan, 1.0],
                [2.0, np.nan],
                [np.nan, 3.0],
                [4.0, np.nan],
                [np.nan, 5.0],
                [6.0, np.nan],
                [np.nan, 7.0],
            ],
            Index([1, 2], name="lev3"),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev4"],
            "lev3",
            lib.no_default,
            [
                [1.0, np.nan, 1.0, np.nan, 0.0, np.nan],
                [np.nan, 1.0, np.nan, 1.0, np.nan, 1.0],
                [1.0, np.nan, 2.0, np.nan, 2.0, np.nan],
                [np.nan, 1.0, np.nan, 2.0, np.nan, 3.0],
                [2.0, np.nan, 1.0, np.nan, 4.0, np.nan],
                [np.nan, 2.0, np.nan, 1.0, np.nan, 5.0],
                [2.0, np.nan, 2.0, np.nan, 6.0, np.nan],
                [np.nan, 2.0, np.nan, 2.0, np.nan, 7.0],
            ],
            MultiIndex.from_tuples(
                [
                    ("lev1", 1),
                    ("lev1", 2),
                    ("lev2", 1),
                    ("lev2", 2),
                    ("values", 1),
                    ("values", 2),
                ],
                names=[None, "lev3"],
            ),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev1", "lev2"],
            "lev3",
            "values",
            [[0, 1], [2, 3], [4, 5], [6, 7]],
            Index([1, 2], name="lev3"),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
        (
            ["lev1", "lev2"],
            "lev3",
            lib.no_default,
            [[1, 2, 0, 1], [3, 4, 2, 3], [5, 6, 4, 5], [7, 8, 6, 7]],
            MultiIndex.from_tuples(
                [("lev4", 1), ("lev4", 2), ("values", 1), ("values", 2)],
                names=[None, "lev3"],
            ),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
    ],
)
def test_pivot_list_like_index(
    input_index,
    input_columns,
    input_values,
    expected_values,
    expected_columns,
    expected_index,
):
    # GH 21425, test when index is given a list
    df = pd.DataFrame(
        {
            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
            "values": [0, 1, 2, 3, 4, 5, 6, 7],
        }
    )

    result = df.pivot(index=input_index, columns=input_columns, values=input_values)
    expected = pd.DataFrame(
        expected_values, columns=expected_columns, index=expected_index
    )
    tm.assert_frame_equal(result, expected)


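# Illustrative sketch (editor's addition; `_demo_pivot_list_like_index` is
# not part of the vendored suite): passing a list to `index` keeps those
# columns as the (possibly Multi-) index of the pivoted frame, which is the
# behavior the parametrized cases above pin down.
def _demo_pivot_list_like_index():
    df = pd.DataFrame(
        {"row": [1, 1, 2, 2], "col": ["x", "y", "x", "y"], "val": [0, 1, 2, 3]}
    )
    # index=["row"] yields an index named "row"; columns come from "col".
    return df.pivot(index=["row"], columns="col", values="val")

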
@pytest.mark.parametrize(
    "input_index, input_columns, input_values, "
    "expected_values, expected_columns, expected_index",
    [
        (
            "lev4",
            ["lev3"],
            "values",
            [
                [0.0, np.nan],
                [np.nan, 1.0],
                [2.0, np.nan],
                [np.nan, 3.0],
                [4.0, np.nan],
                [np.nan, 5.0],
                [6.0, np.nan],
                [np.nan, 7.0],
            ],
            Index([1, 2], name="lev3"),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev1", "lev2"],
            ["lev3"],
            "values",
            [[0, 1], [2, 3], [4, 5], [6, 7]],
            Index([1, 2], name="lev3"),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
        (
            ["lev1"],
            ["lev2", "lev3"],
            "values",
            [[0, 1, 2, 3], [4, 5, 6, 7]],
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev2", "lev3"]
            ),
            Index([1, 2], name="lev1"),
        ),
        (
            ["lev1", "lev2"],
            ["lev3", "lev4"],
            "values",
            [
                [0.0, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, 2.0, 3.0, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 6.0, 7.0],
            ],
            MultiIndex.from_tuples(
                [(1, 1), (2, 2), (1, 3), (2, 4), (1, 5), (2, 6), (1, 7), (2, 8)],
                names=["lev3", "lev4"],
            ),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
    ],
)
def test_pivot_list_like_columns(
    input_index,
    input_columns,
    input_values,
    expected_values,
    expected_columns,
    expected_index,
):
    # GH 21425, test when columns is given a list
    df = pd.DataFrame(
        {
            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
            "values": [0, 1, 2, 3, 4, 5, 6, 7],
        }
    )

    result = df.pivot(index=input_index, columns=input_columns, values=input_values)
    expected = pd.DataFrame(
        expected_values, columns=expected_columns, index=expected_index
    )
    tm.assert_frame_equal(result, expected)


def test_pivot_multiindexed_rows_and_cols(using_array_manager):
    # GH 36360

    df = pd.DataFrame(
        data=np.arange(12).reshape(4, 3),
        columns=MultiIndex.from_tuples(
            [(0, 0), (0, 1), (0, 2)], names=["col_L0", "col_L1"]
        ),
        index=MultiIndex.from_tuples(
            [(0, 0, 0), (0, 0, 1), (1, 1, 1), (1, 0, 0)],
            names=["idx_L0", "idx_L1", "idx_L2"],
        ),
    )

    res = df.pivot_table(
        index=["idx_L0"],
        columns=["idx_L1"],
        values=[(0, 1)],
        aggfunc=lambda col: col.values.sum(),
    )

    expected = pd.DataFrame(
        data=[[5, np.nan], [10, 7.0]],
        columns=MultiIndex.from_tuples(
            [(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"]
        ),
        index=Index([0, 1], dtype="int64", name="idx_L0"),
    )
    if not using_array_manager:
        # BlockManager does not preserve the dtypes
        expected = expected.astype("float64")

    tm.assert_frame_equal(res, expected)


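# Illustrative sketch (editor's addition; `_demo_pivot_table_aggregates` is
# not part of the vendored suite): pivot_table aggregates duplicate
# index/column combinations, which is why the test above can use it on a
# frame where DataFrame.pivot would raise.
def _demo_pivot_table_aggregates():
    df = pd.DataFrame({"k": ["a", "a", "b"], "c": ["x", "x", "x"], "v": [1, 2, 3]})
    # The two ("a", "x") rows are combined by the default mean -> 1.5.
    return df.pivot_table(index="k", columns="c", values="v")

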
def test_pivot_df_multiindex_index_none():
    # GH 23955
    df = pd.DataFrame(
        [
            ["A", "A1", "label1", 1],
            ["A", "A2", "label2", 2],
            ["B", "A1", "label1", 3],
            ["B", "A2", "label2", 4],
        ],
        columns=["index_1", "index_2", "label", "value"],
    )
    df = df.set_index(["index_1", "index_2"])

    result = df.pivot(columns="label", values="value")
    expected = pd.DataFrame(
        [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]],
        index=df.index,
        columns=Index(["label1", "label2"], name="label"),
    )
    tm.assert_frame_equal(result, expected)
lib/python3.13/site-packages/pandas/tests/reshape/test_qcut.py (new file, 305 lines)
@ -0,0 +1,305 @@
import os

import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DatetimeIndex,
    Interval,
    IntervalIndex,
    NaT,
    Series,
    Timedelta,
    TimedeltaIndex,
    Timestamp,
    cut,
    date_range,
    isna,
    qcut,
    timedelta_range,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype

from pandas.tseries.offsets import Day


def test_qcut():
    arr = np.random.default_rng(2).standard_normal(1000)

    # We store the bins as an Index that has been
    # rounded, so comparisons are a bit tricky.
    labels, _ = qcut(arr, 4, retbins=True)
    ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])

    result = labels.categories.left.values
    assert np.allclose(result, ex_bins[:-1], atol=1e-2)

    result = labels.categories.right.values
    assert np.allclose(result, ex_bins[1:], atol=1e-2)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    tm.assert_categorical_equal(labels, ex_levels)


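# Illustrative sketch (editor's addition; `_demo_qcut_equal_frequency` is not
# part of the vendored suite): qcut bins by sample quantiles (equal-frequency
# bins), whereas cut bins by equal-width edges.
def _demo_qcut_equal_frequency():
    data = np.arange(12)
    # Four quantile bins over 12 evenly spaced values -> three per bin.
    return Series(qcut(data, 4)).value_counts()

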
def test_qcut_bounds():
    arr = np.random.default_rng(2).standard_normal(1000)

    factor = qcut(arr, 10, labels=False)
    assert len(np.unique(factor)) == 10


def test_qcut_specify_quantiles():
    arr = np.random.default_rng(2).standard_normal(100)
    factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])

    expected = qcut(arr, 4)
    tm.assert_categorical_equal(factor, expected)


def test_qcut_all_bins_same():
    with pytest.raises(ValueError, match="edges.*unique"):
        qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)


def test_qcut_include_lowest():
    values = np.arange(10)
    ii = qcut(values, 4)

    ex_levels = IntervalIndex(
        [
            Interval(-0.001, 2.25),
            Interval(2.25, 4.5),
            Interval(4.5, 6.75),
            Interval(6.75, 9),
        ]
    )
    tm.assert_index_equal(ii.categories, ex_levels)


def test_qcut_nas():
    arr = np.random.default_rng(2).standard_normal(100)
    arr[:20] = np.nan

    result = qcut(arr, 4)
    assert isna(result[:20]).all()


def test_qcut_index():
    result = qcut([0, 2], 2)
    intervals = [Interval(-0.001, 1), Interval(1, 2)]

    expected = Categorical(intervals, ordered=True)
    tm.assert_categorical_equal(result, expected)


def test_qcut_binning_issues(datapath):
    # see gh-1978, gh-1979
    cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
    arr = np.loadtxt(cut_file)
    result = qcut(arr, 20)

    starts = []
    ends = []

    for lev in np.unique(result):
        s = lev.left
        e = lev.right
        assert s != e

        starts.append(float(s))
        ends.append(float(e))

    for (sp, sn), (ep, en) in zip(
        zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
    ):
        assert sp < sn
        assert ep < en
        assert ep <= sn


def test_qcut_return_intervals():
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    res = qcut(ser, [0, 0.333, 0.666, 1])

    exp_levels = np.array(
        [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
    )
    exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
        CategoricalDtype(ordered=True)
    )
    tm.assert_series_equal(res, exp)


@pytest.mark.parametrize("labels", ["foo", 1, True])
|
||||
def test_qcut_incorrect_labels(labels):
|
||||
# GH 13318
|
||||
values = range(5)
|
||||
msg = "Bin labels must either be False, None or passed in as a list-like argument"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
qcut(values, 4, labels=labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
|
||||
def test_qcut_wrong_length_labels(labels):
|
||||
# GH 13318
|
||||
values = range(10)
|
||||
msg = "Bin labels must be one fewer than the number of bin edges"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
qcut(values, 4, labels=labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "labels, expected",
    [
        (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
        (list(range(3)), Categorical([0, 1, 2], ordered=True)),
    ],
)
def test_qcut_list_like_labels(labels, expected):
    # GH 13318
    values = range(3)
    result = qcut(values, 3, labels=labels)
    tm.assert_categorical_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"duplicates": "drop"}, None),
        ({}, "Bin edges must be unique"),
        ({"duplicates": "raise"}, "Bin edges must be unique"),
        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
    ],
)
def test_qcut_duplicates_bin(kwargs, msg):
    # see gh-7751
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
    else:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)


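# Illustrative sketch (editor's addition; `_demo_qcut_duplicate_edges` is not
# part of the vendored suite): when repeated values collapse quantile edges,
# duplicates="drop" merges the identical edges instead of raising ValueError.
def _demo_qcut_duplicate_edges():
    values = [0, 0, 0, 0, 1, 2, 3]
    # Only two distinct bins survive after the duplicate edge is dropped.
    return qcut(values, 3, duplicates="drop")

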
@pytest.mark.parametrize(
    "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
    # see gh-15431
    ser = Series([data] * length)
    result = qcut(ser, 1, labels=labels)

    if labels is None:
        intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
        expected = Series(intervals).astype(CategoricalDtype(ordered=True))
    else:
        expected = Series([0] * length, dtype=np.intp)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "ser",
    [
        Series(DatetimeIndex(["20180101", NaT, "20180103"])),
        Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
    ],
    ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser, unit):
    # see gh-19768
    ser = ser.dt.as_unit(unit)
    td = Timedelta(1, unit=unit).as_unit(unit)

    left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
    right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
    intervals = IntervalIndex.from_arrays(left, right)
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(ser, 2)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
|
||||
def test_datetime_tz_qcut(bins):
|
||||
# see gh-19872
|
||||
tz = "US/Eastern"
|
||||
ser = Series(date_range("20130101", periods=3, tz=tz))
|
||||
|
||||
result = qcut(ser, bins)
|
||||
expected = Series(
|
||||
IntervalIndex(
|
||||
[
|
||||
Interval(
|
||||
Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
|
||||
Timestamp("2013-01-01 16:00:00", tz=tz),
|
||||
),
|
||||
Interval(
|
||||
Timestamp("2013-01-01 16:00:00", tz=tz),
|
||||
Timestamp("2013-01-02 08:00:00", tz=tz),
|
||||
),
|
||||
Interval(
|
||||
Timestamp("2013-01-02 08:00:00", tz=tz),
|
||||
Timestamp("2013-01-03 00:00:00", tz=tz),
|
||||
),
|
||||
]
|
||||
)
|
||||
).astype(CategoricalDtype(ordered=True))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg,expected_bins",
    [
        [
            timedelta_range("1day", periods=3),
            TimedeltaIndex(["1 days", "2 days", "3 days"]),
        ],
        [
            date_range("20180101", periods=3),
            DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
        ],
    ],
)
def test_date_like_qcut_bins(arg, expected_bins):
    # see gh-19891
    ser = Series(arg)
    result, result_bins = qcut(ser, 2, retbins=True)
    tm.assert_index_equal(result_bins, expected_bins)


@pytest.mark.parametrize("bins", [6, 7])
|
||||
@pytest.mark.parametrize(
|
||||
"box, compare",
|
||||
[
|
||||
(Series, tm.assert_series_equal),
|
||||
(np.array, tm.assert_categorical_equal),
|
||||
(list, tm.assert_equal),
|
||||
],
|
||||
)
|
||||
def test_qcut_bool_coercion_to_int(bins, box, compare):
|
||||
# issue 20303
|
||||
data_expected = box([0, 1, 1, 0, 1] * 10)
|
||||
data_result = box([False, True, True, False, True] * 10)
|
||||
expected = qcut(data_expected, bins, duplicates="drop")
|
||||
result = qcut(data_result, bins, duplicates="drop")
|
||||
compare(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [2, 5, 10])
|
||||
def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
|
||||
arr = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
|
||||
arr[::2] = pd.NA
|
||||
|
||||
result = qcut(arr, q)
|
||||
expected = qcut(arr.astype(float), q)
|
||||
|
||||
tm.assert_categorical_equal(result, expected)
|
@ -0,0 +1,365 @@
import numpy as np
import pytest

from pandas.core.dtypes.concat import union_categoricals

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    Series,
)
import pandas._testing as tm


class TestUnionCategoricals:
    @pytest.mark.parametrize(
        "a, b, combined",
        [
            (list("abc"), list("abd"), list("abcabd")),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (
                ["b", "b", np.nan, "a"],
                ["a", np.nan, "c"],
                ["b", "b", np.nan, "a", "a", np.nan, "c"],
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05"),
                pd.date_range("2014-01-06", "2014-01-07"),
                pd.date_range("2014-01-01", "2014-01-07"),
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
            ),
            (
                pd.period_range("2014-01-01", "2014-01-05"),
                pd.period_range("2014-01-06", "2014-01-07"),
                pd.period_range("2014-01-01", "2014-01-07"),
            ),
        ],
    )
    @pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
    def test_union_categorical(self, a, b, combined, box):
        # GH 13361
        result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
        expected = Categorical(combined)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_ordered_appearance(self):
        # new categories ordered by appearance
        s = Categorical(["x", "y", "z"])
        s2 = Categorical(["a", "b", "c"])
        result = union_categoricals([s, s2])
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

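    # Illustrative sketch (editor's addition; `_demo_union_appearance_order`
    # is not part of the vendored suite): the union's categories are taken in
    # order of first appearance across the inputs unless sort_categories=True
    # is passed.
    def _demo_union_appearance_order(self):
        left = Categorical(["x", "y"])
        right = Categorical(["a", "x"])
        # Categories come out as ["x", "y", "a"].
        return union_categoricals([left, right]).categories
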
    def test_union_categorical_ordered_true(self):
        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_match_types(self):
        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = "dtype of categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([s, s2])

    def test_union_categorical_empty(self):
        msg = "No Categoricals to union"
        with pytest.raises(ValueError, match=msg):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals(
            [Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
        )
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
        )
        exp = Categorical(["A", "B", "B", "B", np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
        val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]

        res = union_categoricals([Categorical(val1), Categorical(val2)])
        exp = Categorical(
            val1 + val2,
            categories=[
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-03-01"),
                pd.Timestamp("2011-02-01"),
            ],
        )
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals(
            [
                Categorical(np.array([np.nan, np.nan], dtype=object)),
                Categorical(["X"], categories=pd.Index(["X"], dtype=object)),
            ]
        )
        exp = Categorical([np.nan, np.nan, "X"])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
        )
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)

@pytest.mark.parametrize("val", [[], ["1"]])
|
||||
def test_union_categoricals_empty(self, val, request, using_infer_string):
|
||||
# GH 13759
|
||||
if using_infer_string and val == ["1"]:
|
||||
request.applymarker(pytest.mark.xfail("object and strings dont match"))
|
||||
res = union_categoricals([Categorical([]), Categorical(val)])
|
||||
exp = Categorical(val)
|
||||
tm.assert_categorical_equal(res, exp)
|
||||
|
||||
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_category_str(self):
        c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
        c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
        res = union_categoricals([c1, c2])
        exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/19096
        c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
        c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2])
        expected = Categorical(
            ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = "Categorical.ordered must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_ignore_order(self):
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = "Categorical.ordered must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

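    # Illustrative sketch (editor's addition; `_demo_union_ignore_order` is
    # not part of the vendored suite): ignore_order=True treats ordered
    # categoricals as unordered for the union, so inputs whose categories are
    # ordered differently can still be combined.
    def _demo_union_ignore_order(self):
        c1 = Categorical([1, 2], ordered=True)
        c2 = Categorical([2, 1], categories=[2, 1], ordered=True)
        # Without ignore_order=True this raises TypeError.
        return union_categoricals([c1, c2], ignore_order=True)
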
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
        )
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
        c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        msg = "Cannot use sort_categories=True with ordered Categoricals"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_fastpath(self):
        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_skipresort(self):
        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_one_nan(self):
        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_only_nan(self):
        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_empty(self):
        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_ordered_true(self):
        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(
            ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(["a", "b"])
        c2 = Series(["b", "c"], dtype="category")
        result = union_categoricals([c1, c2])
        expected = Categorical(["a", "b", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        msg = "all components to combine must be Categorical"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, ["a", "b", "c"]])
@ -0,0 +1,79 @@
import numpy as np
import pytest

from pandas import (
    Index,
    date_range,
)
import pandas._testing as tm
from pandas.core.reshape.util import cartesian_product


class TestCartesianProduct:
    def test_simple(self):
        x, y = list("ABC"), [1, 22]
        result1, result2 = cartesian_product([x, y])
        expected1 = np.array(["A", "A", "B", "B", "C", "C"])
        expected2 = np.array([1, 22, 1, 22, 1, 22])
        tm.assert_numpy_array_equal(result1, expected1)
        tm.assert_numpy_array_equal(result2, expected2)

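    # Illustrative sketch (editor's addition; `_demo_cartesian_product` is
    # not part of the vendored suite): cartesian_product returns one
    # broadcast array per input factor; zipping them enumerates every
    # combination in row-major order.
    def _demo_cartesian_product(self):
        letters, numbers = cartesian_product([["A", "B"], [1, 2]])
        # [('A', 1), ('A', 2), ('B', 1), ('B', 2)]
        return list(zip(letters, numbers))
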
    def test_datetimeindex(self):
        # regression test for GitHub issue #6439
        # make sure that the ordering on datetimeindex is consistent
        x = date_range("2000-01-01", periods=2)
        result1, result2 = (Index(y).day for y in cartesian_product([x, x]))
        expected1 = Index([1, 1, 2, 2], dtype=np.int32)
        expected2 = Index([1, 2, 1, 2], dtype=np.int32)
        tm.assert_index_equal(result1, expected1)
        tm.assert_index_equal(result2, expected2)

    def test_tzaware_retained(self):
        x = date_range("2000-01-01", periods=2, tz="US/Pacific")
        y = np.array([3, 4])
        result1, result2 = cartesian_product([x, y])

        expected = x.repeat(2)
        tm.assert_index_equal(result1, expected)

    def test_tzaware_retained_categorical(self):
        x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category")
        y = np.array([3, 4])
        result1, result2 = cartesian_product([x, y])

        expected = x.repeat(2)
        tm.assert_index_equal(result1, expected)

    @pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]])
    def test_empty(self, x, y):
        # product of empty factors
        expected1 = np.array([], dtype=np.asarray(x).dtype)
        expected2 = np.array([], dtype=np.asarray(y).dtype)
        result1, result2 = cartesian_product([x, y])
        tm.assert_numpy_array_equal(result1, expected1)
        tm.assert_numpy_array_equal(result2, expected2)

    def test_empty_input(self):
        # empty product (empty input):
        result = cartesian_product([])
        expected = []
        assert result == expected

    @pytest.mark.parametrize(
        "X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]]
    )
    def test_invalid_input(self, X):
        msg = "Input must be a list-like of list-likes"

        with pytest.raises(TypeError, match=msg):
            cartesian_product(X=X)

    def test_exceed_product_space(self):
        # GH31355: raise a useful error when the product space is too large
        msg = "Product space too large to allocate arrays!"

        with pytest.raises(ValueError, match=msg):
            dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [
                (np.arange(15128, dtype=np.int16)),
            ]
            cartesian_product(X=dims)