Updated script that can be controlled by Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,7 @@
import pytest
@pytest.fixture(params=[True, False])
def sort(request):
    """
    Boolean value for the ``sort`` keyword of concat/append operations.

    Parametrized over ``True`` and ``False``.
    """
    flag = request.param
    return flag

View File

@ -0,0 +1,389 @@
import datetime as dt
from itertools import combinations
import dateutil
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
Timestamp,
concat,
isna,
)
import pandas._testing as tm
class TestAppend:
def test_append(self, sort, float_frame):
mixed_frame = float_frame.copy()
mixed_frame["foo"] = "bar"
begin_index = float_frame.index[:5]
end_index = float_frame.index[5:]
begin_frame = float_frame.reindex(begin_index)
end_frame = float_frame.reindex(end_index)
appended = begin_frame._append(end_frame)
tm.assert_almost_equal(appended["A"], float_frame["A"])
del end_frame["A"]
partial_appended = begin_frame._append(end_frame, sort=sort)
assert "A" in partial_appended
partial_appended = end_frame._append(begin_frame, sort=sort)
assert "A" in partial_appended
# mixed type handling
appended = mixed_frame[:5]._append(mixed_frame[5:])
tm.assert_frame_equal(appended, mixed_frame)
# what to test here
mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)
# all equal except 'foo' column
tm.assert_frame_equal(
mixed_appended.reindex(columns=["A", "B", "C", "D"]),
mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
)
def test_append_empty(self, float_frame):
empty = DataFrame()
appended = float_frame._append(empty)
tm.assert_frame_equal(float_frame, appended)
assert appended is not float_frame
appended = empty._append(float_frame)
tm.assert_frame_equal(float_frame, appended)
assert appended is not float_frame
def test_append_overlap_raises(self, float_frame):
    """``verify_integrity=True`` rejects appending a frame onto itself."""
    expected_msg = "Indexes have overlapping values"
    with pytest.raises(ValueError, match=expected_msg):
        float_frame._append(float_frame, verify_integrity=True)
def test_append_new_columns(self):
# see gh-6129: new columns
df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
expected = DataFrame(
{
"a": {"x": 1, "y": 2, "z": 5},
"b": {"x": 3, "y": 4, "z": 6},
"c": {"z": 7},
}
)
result = df._append(row)
tm.assert_frame_equal(result, expected)
def test_append_length0_frame(self, sort):
df = DataFrame(columns=["A", "B", "C"])
df3 = DataFrame(index=[0, 1], columns=["A", "B"])
df5 = df._append(df3, sort=sort)
expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
tm.assert_frame_equal(df5, expected)
def test_append_records(self):
arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
df1 = DataFrame(arr1)
df2 = DataFrame(arr2)
result = df1._append(df2, ignore_index=True)
expected = DataFrame(np.concatenate((arr1, arr2)))
tm.assert_frame_equal(result, expected)
# rewrite sort fixture, since we also want to test default of None
def test_append_sorts(self, sort):
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
result = df1._append(df2, sort=sort)
# for None / True
expected = DataFrame(
{"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
columns=["a", "b", "c"],
)
if sort is False:
expected = expected[["b", "a", "c"]]
tm.assert_frame_equal(result, expected)
def test_append_different_columns(self, sort):
df = DataFrame(
{
"bools": np.random.default_rng(2).standard_normal(10) > 0,
"ints": np.random.default_rng(2).integers(0, 10, 10),
"floats": np.random.default_rng(2).standard_normal(10),
"strings": ["foo", "bar"] * 5,
}
)
a = df[:5].loc[:, ["bools", "ints", "floats"]]
b = df[5:].loc[:, ["strings", "ints", "floats"]]
appended = a._append(b, sort=sort)
assert isna(appended["strings"][0:4]).all()
assert isna(appended["bools"][5:]).all()
def test_append_many(self, sort, float_frame):
chunks = [
float_frame[:5],
float_frame[5:10],
float_frame[10:15],
float_frame[15:],
]
result = chunks[0]._append(chunks[1:])
tm.assert_frame_equal(result, float_frame)
chunks[-1] = chunks[-1].copy()
chunks[-1]["foo"] = "bar"
result = chunks[0]._append(chunks[1:], sort=sort)
tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
assert (result["foo"][15:] == "bar").all()
assert result["foo"][:15].isna().all()
def test_append_preserve_index_name(self):
# #980
df1 = DataFrame(columns=["A", "B", "C"])
df1 = df1.set_index(["A"])
df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
df2 = df2.set_index(["A"])
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df1._append(df2)
assert result.index.name == "A"
# Three-element indexes of different Index types; used to parametrize
# test_append_same_columns_type and test_append_different_columns_types.
indexes_can_append = [
    pd.RangeIndex(3),
    Index([4, 5, 6]),
    Index([4.5, 5.5, 6.5]),
    Index(list("abc")),
    pd.CategoricalIndex("A B C".split()),
    pd.CategoricalIndex("D E F".split(), ordered=True),
    pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
    pd.DatetimeIndex(
        [
            dt.datetime(2013, 1, 3, 0, 0),
            dt.datetime(2013, 1, 3, 6, 10),
            dt.datetime(2013, 1, 3, 7, 12),
        ]
    ),
    pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
]
@pytest.mark.parametrize(
    "index", indexes_can_append, ids=lambda x: type(x).__name__
)
def test_append_same_columns_type(self, index):
    """
    Append a Series whose index has the same type as the frame's columns.

    Exercised once with the frame wider than the Series and once with the
    Series wider than the frame.
    """
    # GH18359
    row_labels = [0, 1, 2]

    # frame wider than series
    wide_frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
    narrow_ser = Series([7, 8], index=index[:2], name=2)
    result = wide_frame._append(narrow_ser)
    expected = DataFrame(
        [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=row_labels, columns=index
    )
    # integer dtype is preserved for columns present in the series' index
    for position in (0, 1):
        assert expected.dtypes.iloc[position].kind == "i"
    tm.assert_frame_equal(result, expected)

    # series wider than frame
    narrow_frame = DataFrame([[1, 2], [4, 5]], columns=index[:2])
    wide_ser = Series([7, 8, 9], index=index, name=2)
    result = narrow_frame._append(wide_ser)
    expected = DataFrame(
        [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
        index=row_labels,
        columns=index,
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "df_columns, series_index",
    combinations(indexes_can_append, r=2),
    ids=lambda x: type(x).__name__,
)
def test_append_different_columns_types(self, df_columns, series_index):
    """
    Append a Series whose index type differs from the frame's column type.

    The expected columns are the frame's labels followed by the Series
    labels the frame lacks; cells without a source value are NaN.
    """
    # GH18359
    frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
    row = Series([7, 8, 9], index=series_index, name=2)

    result = frame._append(row)

    extra_labels = row.index.difference(df_columns)
    all_columns = Index(df_columns.tolist()).append(extra_labels)
    nan = np.nan
    expected = DataFrame(
        [
            [1.0, 2.0, 3.0, nan, nan, nan],
            [4, 5, 6, nan, nan, nan],
            [nan, nan, nan, 7, 8, 9],
        ],
        index=[0, 1, 2],
        columns=all_columns,
    )
    tm.assert_frame_equal(result, expected)
def test_append_dtype_coerce(self, sort):
# GH 4993
# appending with datetime will incorrectly convert datetime64
df1 = DataFrame(
index=[1, 2],
data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
columns=["start_time"],
)
df2 = DataFrame(
index=[4, 5],
data=[
[dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
[dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
],
columns=["start_time", "end_time"],
)
expected = concat(
[
Series(
[
pd.NaT,
pd.NaT,
dt.datetime(2013, 1, 3, 6, 10),
dt.datetime(2013, 1, 4, 7, 10),
],
name="end_time",
),
Series(
[
dt.datetime(2013, 1, 1, 0, 0),
dt.datetime(2013, 1, 2, 0, 0),
dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 4, 0, 0),
],
name="start_time",
),
],
axis=1,
sort=sort,
)
result = df1._append(df2, ignore_index=True, sort=sort)
if sort:
expected = expected[["end_time", "start_time"]]
else:
expected = expected[["start_time", "end_time"]]
tm.assert_frame_equal(result, expected)
def test_append_missing_column_proper_upcast(self, sort):
df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
appended = df1._append(df2, ignore_index=True, sort=sort)
assert appended["A"].dtype == "f8"
assert appended["B"].dtype == "O"
def test_append_empty_frame_to_series_with_dateutil_tz(self):
    """Append a Series holding a dateutil-tz Timestamp onto an empty frame."""
    # GH 23682
    date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
    ser = Series({"a": 1.0, "b": 2.0, "date": date})
    empty = DataFrame(columns=["c", "d"])

    def _expected(n_rows):
        # n_rows identical rows; "c" and "d" get cast to object after append
        frame = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]] * n_rows,
            columns=["c", "d", "a", "b", "date"],
        )
        for name in ("c", "d"):
            frame[name] = frame[name].astype(object)
        return frame

    one_row = empty._append(ser, ignore_index=True)
    tm.assert_frame_equal(one_row, _expected(1))

    two_rows = _expected(2)
    appended_twice = one_row._append(ser, ignore_index=True)
    tm.assert_frame_equal(appended_twice, two_rows)

    appended_list = empty._append([ser, ser], ignore_index=True)
    tm.assert_frame_equal(appended_list, two_rows)
def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
# pd.NaT gets inferred as tz-naive, so append result is tz-naive
result = df._append({"a": pd.NaT}, ignore_index=True)
if using_array_manager:
expected = DataFrame({"a": [pd.NaT]}, dtype=object)
else:
expected = DataFrame({"a": [np.nan]}, dtype=object)
tm.assert_frame_equal(result, expected)
# also test with typed value to append
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
result = df._append(other, ignore_index=True)
tm.assert_frame_equal(result, expected)
# mismatched tz
other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
result = df._append(other, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_empty_frame_with_timedelta64ns_nat(
    self, dtype_str, val, using_array_manager
):
    """Append a timedelta64 value onto an empty frame of another dtype."""
    # https://github.com/pandas-dev/pandas/issues/35460
    empty = DataFrame(columns=["a"]).astype(dtype_str)
    other = DataFrame({"a": [np.timedelta64(val, "ns")]})

    result = empty._append(other, ignore_index=True)

    nat_becomes_nan = (
        isinstance(val, str) and dtype_str != "int64" and not using_array_manager
    )
    if nat_becomes_nan:
        # TODO: expected used to be `other.astype(object)` which is a more
        #  reasonable result.  This was changed when tightening
        #  assert_frame_equal's treatment of mismatched NAs to match the
        #  existing behavior.
        expected = DataFrame({"a": [np.nan]}, dtype=object)
    else:
        expected = other.astype(object)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
    """Append a timedelta64 value onto a one-row frame of another dtype."""
    # https://github.com/pandas-dev/pandas/issues/35460
    base = DataFrame({"a": pd.array([1], dtype=dtype_str)})
    other = DataFrame({"a": [np.timedelta64(val, "ns")]})

    result = base._append(other, ignore_index=True)

    # both original scalars are expected unchanged in an object column
    first_value, second_value = base.iloc[0, 0], other.iloc[0, 0]
    expected = DataFrame({"a": [first_value, second_value]}, dtype=object)
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,753 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.fixture(
    params=list(
        {
            "bool": [True, False, True],
            "int64": [1, 2, 3],
            "float64": [1.1, np.nan, 3.3],
            "category": Categorical(["X", "Y", "Z"]),
            "object": ["a", "b", "c"],
            "datetime64[ns]": [
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-01-02"),
                pd.Timestamp("2011-01-03"),
            ],
            "datetime64[ns, US/Eastern]": [
                pd.Timestamp("2011-01-01", tz="US/Eastern"),
                pd.Timestamp("2011-01-02", tz="US/Eastern"),
                pd.Timestamp("2011-01-03", tz="US/Eastern"),
            ],
            "timedelta64[ns]": [
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
                pd.Timedelta("3 days"),
            ],
            "period[M]": [
                pd.Period("2011-01", freq="M"),
                pd.Period("2011-02", freq="M"),
                pd.Period("2011-03", freq="M"),
            ],
        }.items()
    )
)
def item(request):
    """Return one ``(dtype name, three sample values)`` pair per parametrization."""
    dtype_name, values = request.param
    return dtype_name, values
@pytest.fixture
def item2(item):
    """
    Second ``(dtype name, sample values)`` pair for two-operand tests.

    NOTE(review): this returns the value of the ``item`` fixture unchanged,
    so a test requesting both ``item`` and ``item2`` appears to receive the
    same pair twice (sending test_concatlike_dtypes_coercion down its
    ``typ1 == typ2`` skip branch) -- confirm whether an independent
    parametrization was intended.
    """
    second = item
    return second
class TestConcatAppendCommon:
"""
Test common dtype coercion rules between concat and append.
"""
def test_dtypes(self, item, index_or_series, using_infer_string):
# to confirm test case covers intended dtypes
typ, vals = item
obj = index_or_series(vals)
if typ == "object" and using_infer_string:
typ = "string"
if isinstance(obj, Index):
assert obj.dtype == typ
elif isinstance(obj, Series):
if typ.startswith("period"):
assert obj.dtype == "Period[M]"
else:
assert obj.dtype == typ
def test_concatlike_same_dtypes(self, item):
    """
    Concatenate sample data with itself, as Index and as Series.

    Exercises ``Index.append``, ``Series._append`` and ``pd.concat`` with two
    and three operands.  Mismatched names are expected to give an unnamed
    result, matching names are expected to be kept, and operands that are
    not an Index / Series are expected to raise ``TypeError``.
    """
    # GH 13660
    typ1, vals1 = item

    # every operand holds the same sample values
    vals2 = vals1
    vals3 = vals1

    if typ1 == "category":
        # the Categorical samples are converted to lists before joining
        exp_data = Categorical(list(vals1) + list(vals2))
        exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3))
    else:
        exp_data = vals1 + vals2
        exp_data3 = vals1 + vals2 + vals3

    # ----- Index ----- #

    # index.append
    res = Index(vals1).append(Index(vals2))
    exp = Index(exp_data)
    tm.assert_index_equal(res, exp)

    # 3 elements
    res = Index(vals1).append([Index(vals2), Index(vals3)])
    exp = Index(exp_data3)
    tm.assert_index_equal(res, exp)

    # index.append name mismatch
    i1 = Index(vals1, name="x")
    i2 = Index(vals2, name="y")
    res = i1.append(i2)
    exp = Index(exp_data)
    tm.assert_index_equal(res, exp)

    # index.append name match
    i1 = Index(vals1, name="x")
    i2 = Index(vals2, name="x")
    res = i1.append(i2)
    exp = Index(exp_data, name="x")
    tm.assert_index_equal(res, exp)

    # cannot append non-index
    with pytest.raises(TypeError, match="all inputs must be Index"):
        Index(vals1).append(vals2)

    with pytest.raises(TypeError, match="all inputs must be Index"):
        Index(vals1).append([Index(vals2), vals3])

    # ----- Series ----- #

    # series.append
    res = Series(vals1)._append(Series(vals2), ignore_index=True)
    exp = Series(exp_data)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # concat
    res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # 3 elements
    res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
    exp = Series(exp_data3)
    tm.assert_series_equal(res, exp)

    res = pd.concat(
        [Series(vals1), Series(vals2), Series(vals3)],
        ignore_index=True,
    )
    tm.assert_series_equal(res, exp)

    # name mismatch
    s1 = Series(vals1, name="x")
    s2 = Series(vals2, name="y")
    res = s1._append(s2, ignore_index=True)
    exp = Series(exp_data)
    tm.assert_series_equal(res, exp, check_index_type=True)

    res = pd.concat([s1, s2], ignore_index=True)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # name match
    s1 = Series(vals1, name="x")
    s2 = Series(vals2, name="x")
    res = s1._append(s2, ignore_index=True)
    exp = Series(exp_data, name="x")
    tm.assert_series_equal(res, exp, check_index_type=True)

    res = pd.concat([s1, s2], ignore_index=True)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # cannot append/concat operands that are not Series or DataFrame
    msg = (
        r"cannot concatenate object of type '.+'; "
        "only Series and DataFrame objs are valid"
    )
    with pytest.raises(TypeError, match=msg):
        Series(vals1)._append(vals2)

    with pytest.raises(TypeError, match=msg):
        Series(vals1)._append([Series(vals2), vals3])

    with pytest.raises(TypeError, match=msg):
        pd.concat([Series(vals1), vals2])

    with pytest.raises(TypeError, match=msg):
        pd.concat([Series(vals1), Series(vals2), vals3])
def test_concatlike_dtypes_coercion(self, item, item2, request):
    """
    Concatenate two differently-typed samples, as Index and as Series.

    Equal dtypes and any pairing that involves "category" are skipped.  The
    expected dtype is left to inference (``None``) unless one of the
    branches below pins it.

    NOTE(review): the ``item2`` fixture in this file returns ``item``
    itself, so ``typ1 == typ2`` looks always true here and every case would
    take the first skip -- confirm against the fixture definition.
    """
    # GH 13660
    typ1, vals1 = item
    typ2, vals2 = item2

    # the third operand repeats the second sample
    vals3 = vals2

    # basically infer
    exp_index_dtype = None
    exp_series_dtype = None

    if typ1 == typ2:
        pytest.skip("same dtype is tested in test_concatlike_same_dtypes")
    elif typ1 == "category" or typ2 == "category":
        pytest.skip("categorical type tested elsewhere")

    # specify expected dtype
    if typ1 == "bool" and typ2 in ("int64", "float64"):
        # series coerces to numeric based on numpy rule
        # index doesn't because bool is object dtype
        exp_series_dtype = typ2
        # this pairing is marked as an expected failure
        mark = pytest.mark.xfail(reason="GH#39187 casting to object")
        request.applymarker(mark)
    elif typ2 == "bool" and typ1 in ("int64", "float64"):
        # mirror image of the branch above
        exp_series_dtype = typ1
        mark = pytest.mark.xfail(reason="GH#39187 casting to object")
        request.applymarker(mark)
    elif typ1 in {"datetime64[ns, US/Eastern]", "timedelta64[ns]"} or typ2 in {
        "datetime64[ns, US/Eastern]",
        "timedelta64[ns]",
    }:
        # tz-aware datetimes / timedeltas mixed with another dtype: object
        exp_index_dtype = object
        exp_series_dtype = object

    exp_data = vals1 + vals2
    exp_data3 = vals1 + vals2 + vals3

    # ----- Index ----- #

    # index.append
    # GH#39817
    res = Index(vals1).append(Index(vals2))
    exp = Index(exp_data, dtype=exp_index_dtype)
    tm.assert_index_equal(res, exp)

    # 3 elements
    res = Index(vals1).append([Index(vals2), Index(vals3)])
    exp = Index(exp_data3, dtype=exp_index_dtype)
    tm.assert_index_equal(res, exp)

    # ----- Series ----- #

    # series._append
    # GH#39817
    res = Series(vals1)._append(Series(vals2), ignore_index=True)
    exp = Series(exp_data, dtype=exp_series_dtype)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # concat
    # GH#39817
    res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # 3 elements
    # GH#39817
    res = Series(vals1)._append([Series(vals2), Series(vals3)], ignore_index=True)
    exp = Series(exp_data3, dtype=exp_series_dtype)
    tm.assert_series_equal(res, exp)

    # GH#39817
    res = pd.concat(
        [Series(vals1), Series(vals2), Series(vals3)],
        ignore_index=True,
    )
    tm.assert_series_equal(res, exp)
def test_concatlike_common_coerce_to_pandas_object(self):
# GH 13626
# result must be Timestamp/Timedelta, not datetime.datetime/timedelta
dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"])
tdi = pd.TimedeltaIndex(["1 days", "2 days"])
exp = Index(
[
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-01-02"),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
]
)
res = dti.append(tdi)
tm.assert_index_equal(res, exp)
assert isinstance(res[0], pd.Timestamp)
assert isinstance(res[-1], pd.Timedelta)
dts = Series(dti)
tds = Series(tdi)
res = dts._append(tds)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
assert isinstance(res.iloc[0], pd.Timestamp)
assert isinstance(res.iloc[-1], pd.Timedelta)
res = pd.concat([dts, tds])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
assert isinstance(res.iloc[0], pd.Timestamp)
assert isinstance(res.iloc[-1], pd.Timedelta)
def test_concatlike_datetimetz(self, tz_aware_fixture):
tz = tz_aware_fixture
# GH 7795
dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz)
exp = pd.DatetimeIndex(
["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz
)
res = dti1.append(dti2)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts2 = Series(dti2)
res = dts1._append(dts2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
@pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"])
def test_concatlike_datetimetz_short(self, tz):
    """Frames indexed by same-tz DatetimeIndex rows append into one tz-aware index."""
    # GH#7795
    range_idx = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz)
    explicit_idx = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz)
    left = DataFrame(0, index=range_idx, columns=["A", "B"])
    right = DataFrame(0, index=explicit_idx, columns=["A", "B"])

    wanted_idx = pd.DatetimeIndex(
        ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"],
        tz=tz,
    ).as_unit("ns")
    wanted = DataFrame(0, index=wanted_idx, columns=["A", "B"])

    tm.assert_frame_equal(left._append(right), wanted)
    tm.assert_frame_equal(pd.concat([left, right]), wanted)
def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
tz = tz_aware_fixture
# GH 13660
# different tz coerces to object
dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"])
exp = Index(
[
pd.Timestamp("2011-01-01", tz=tz),
pd.Timestamp("2011-01-02", tz=tz),
pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-02"),
],
dtype=object,
)
res = dti1.append(dti2)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts2 = Series(dti2)
res = dts1._append(dts2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
# different tz
dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific")
exp = Index(
[
pd.Timestamp("2011-01-01", tz=tz),
pd.Timestamp("2011-01-02", tz=tz),
pd.Timestamp("2012-01-01", tz="US/Pacific"),
pd.Timestamp("2012-01-02", tz="US/Pacific"),
],
dtype=object,
)
res = dti1.append(dti3)
tm.assert_index_equal(res, exp)
dts1 = Series(dti1)
dts3 = Series(dti3)
res = dts1._append(dts3)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([dts1, dts3])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period(self):
# GH 13660
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M")
exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M")
res = pi1.append(pi2)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
ps2 = Series(pi2)
res = ps1._append(ps2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, ps2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period_diff_freq_to_object(self):
# GH 13221
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D")
exp = Index(
[
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
pd.Period("2012-01-01", freq="D"),
pd.Period("2012-02-01", freq="D"),
],
dtype=object,
)
res = pi1.append(pi2)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
ps2 = Series(pi2)
res = ps1._append(ps2)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, ps2])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concatlike_common_period_mixed_dt_to_object(self):
# GH 13221
# different datetimelike
pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
tdi = pd.TimedeltaIndex(["1 days", "2 days"])
exp = Index(
[
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
],
dtype=object,
)
res = pi1.append(tdi)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
tds = Series(tdi)
res = ps1._append(tds)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([ps1, tds])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
# inverse
exp = Index(
[
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Period("2011-01", freq="M"),
pd.Period("2011-02", freq="M"),
],
dtype=object,
)
res = tdi.append(pi1)
tm.assert_index_equal(res, exp)
ps1 = Series(pi1)
tds = Series(tdi)
res = tds._append(ps1)
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
res = pd.concat([tds, ps1])
tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_concat_categorical(self):
# GH 13524
# same categories -> category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2], dtype="category")
exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# partially different categories => not-category
s1 = Series([3, 2], dtype="category")
s2 = Series([2, 1], dtype="category")
exp = Series([3, 2, 2, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# completely different categories (same dtype) => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series([np.nan, 1, 3, 2], dtype="category")
exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
def test_union_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/19096
a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"]))
b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"]))
result = pd.concat([a, b], ignore_index=True)
expected = Series(
Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"])
)
tm.assert_series_equal(result, expected)
def test_concat_categorical_coercion(self):
# GH 13524
# category + not-category => not-category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2])
exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# result shouldn't be affected by 1st elem dtype
exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# all values are not in category => not-category
s1 = Series([3, 2], dtype="category")
s2 = Series([2, 1])
exp = Series([3, 2, 2, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([2, 1, 3, 2])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# completely different categories => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series([1, 3, 2])
exp = Series([10, 11, np.nan, 1, 3, 2], dtype=np.float64)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([1, 3, 2, 10, 11, np.nan], dtype=np.float64)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# different dtype => not-category
s1 = Series([10, 11, np.nan], dtype="category")
s2 = Series(["a", "b", "c"])
exp = Series([10, 11, np.nan, "a", "b", "c"])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series(["a", "b", "c", 10, 11, np.nan])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# if normal series only contains NaN-likes => not-category
s1 = Series([10, 11], dtype="category")
s2 = Series([np.nan, np.nan, np.nan])
exp = Series([10, 11, np.nan, np.nan, np.nan])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([np.nan, np.nan, np.nan, 10, 11])
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
def test_concat_categorical_3elem_coercion(self):
# GH 13524
# mixed dtypes => not-category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2], dtype="category")
s3 = Series([1, 2, 1, 2, np.nan])
exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
# values are all in either category => not-category
s1 = Series([4, 5, 6], dtype="category")
s2 = Series([1, 2, 3], dtype="category")
s3 = Series([1, 3, 4])
exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
# values are all in either category => not-category
s1 = Series([4, 5, 6], dtype="category")
s2 = Series([1, 2, 3], dtype="category")
s3 = Series([10, 11, 12])
exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)
exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)
def test_concat_categorical_multi_coercion(self):
# GH 13524
s1 = Series([1, 3], dtype="category")
s2 = Series([3, 4], dtype="category")
s3 = Series([2, 3])
s4 = Series([2, 2], dtype="category")
s5 = Series([1, np.nan])
s6 = Series([1, 3, 2], dtype="category")
# mixed dtype, values are all in categories => not-category
exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
tm.assert_series_equal(res, exp)
res = s1._append([s2, s3, s4, s5, s6], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
tm.assert_series_equal(res, exp)
res = s6._append([s5, s4, s3, s2, s1], ignore_index=True)
tm.assert_series_equal(res, exp)
def test_concat_categorical_ordered(self):
# GH 13524
s1 = Series(Categorical([1, 2, np.nan], ordered=True))
s2 = Series(Categorical([2, 1, 2], ordered=True))
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True))
tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s1], ignore_index=True), exp)
def test_concat_categorical_coercion_nan(self):
# GH 13524
# some edge cases
# category + not-category => not category
s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category")
s2 = Series([np.nan, 1])
exp = Series([np.nan, np.nan, np.nan, 1])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
s1 = Series([1, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])
exp = Series([1, np.nan, np.nan, np.nan], dtype="float")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
# mixed dtype, all nan-likes => not-category
s1 = Series([np.nan, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])
exp = Series([np.nan, np.nan, np.nan, np.nan])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
# all category nan-likes => category
s1 = Series([np.nan, np.nan], dtype="category")
s2 = Series([np.nan, np.nan], dtype="category")
exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
def test_concat_categorical_empty(self):
# GH 13524
s1 = Series([], dtype="category")
s2 = Series([1, 2], dtype="category")
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([], dtype="category")
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([], dtype="object")
# different dtype => not-category
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
s1 = Series([], dtype="category")
s2 = Series([np.nan, np.nan])
# empty Series is ignored
exp = Series([np.nan, np.nan])
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
def test_categorical_concat_append(self):
cat = Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
vals2 = [1, 2, 1, 2]
exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1]))
tm.assert_frame_equal(pd.concat([df, df]), exp)
tm.assert_frame_equal(df._append(df), exp)
# GH 13524 can concat different categories
cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
vals3 = [1, 2]
df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
res = pd.concat([df, df_different_categories], ignore_index=True)
exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]})
tm.assert_frame_equal(res, exp)
res = df._append(df_different_categories, ignore_index=True)
tm.assert_frame_equal(res, exp)

View File

@ -0,0 +1,273 @@
from datetime import datetime
import numpy as np
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
)
import pandas._testing as tm
class TestCategoricalConcat:
def test_categorical_concat(self, sort):
# See GH 10177
df1 = DataFrame(
np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
)
df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
df2["h"] = Series(Categorical(cat_values))
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
exp = DataFrame(
{
"a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
"b": [
1,
4,
7,
10,
13,
16,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
],
"c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
"h": [None] * 6 + cat_values,
}
)
exp["h"] = exp["h"].astype(df2["h"].dtype)
tm.assert_frame_equal(res, exp)
def test_categorical_concat_dtypes(self, using_infer_string):
# GH8143
index = ["cat", "obj", "num"]
cat = Categorical(["a", "b", "c"])
obj = Series(["a", "b", "c"])
num = Series([1, 2, 3])
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
result = df.dtypes == (
object if not using_infer_string else "string[pyarrow_numpy]"
)
expected = Series([False, True, False], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == "int64"
expected = Series([False, False, True], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == "category"
expected = Series([True, False, False], index=index)
tm.assert_series_equal(result, expected)
def test_concat_categoricalindex(self):
# GH 16111, categories that aren't lexsorted
categories = [9, 0, 1, 2, 3]
a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
result = pd.concat([a, b, c], axis=1)
exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
exp = DataFrame(
{
0: [1, 1, np.nan, np.nan],
1: [np.nan, 2, 2, np.nan],
2: [np.nan, np.nan, 3, 3],
},
columns=[0, 1, 2],
index=exp_idx,
)
tm.assert_frame_equal(result, exp)
def test_categorical_concat_preserve(self):
# GH 8641 series concat not preserving category dtype
# GH 13524 can concat different categories
s = Series(list("abc"), dtype="category")
s2 = Series(list("abd"), dtype="category")
exp = Series(list("abcabd"))
res = pd.concat([s, s2], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list("abcabc"), dtype="category")
res = pd.concat([s, s], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
res = pd.concat([s, s])
tm.assert_series_equal(res, exp)
a = Series(np.arange(6, dtype="int64"))
b = Series(list("aabbca"))
df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
res = pd.concat([df2, df2])
exp = DataFrame(
{
"A": pd.concat([a, a]),
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
}
)
tm.assert_frame_equal(res, exp)
def test_categorical_index_preserver(self):
a = Series(np.arange(6, dtype="int64"))
b = Series(list("aabbca"))
df2 = DataFrame(
{"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
).set_index("B")
result = pd.concat([df2, df2])
expected = DataFrame(
{
"A": pd.concat([a, a]),
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
}
).set_index("B")
tm.assert_frame_equal(result, expected)
# wrong categories -> uses concat_compat, which casts to object
df3 = DataFrame(
{"A": a, "B": Categorical(b, categories=list("abe"))}
).set_index("B")
result = pd.concat([df2, df3])
expected = pd.concat(
[
df2.set_axis(df2.index.astype(object), axis=0),
df3.set_axis(df3.index.astype(object), axis=0),
]
)
tm.assert_frame_equal(result, expected)
def test_concat_categorical_tz(self):
# GH-23816
a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
b = Series(["a", "b"], dtype="category")
result = pd.concat([a, b], ignore_index=True)
expected = Series(
[
pd.Timestamp("2017-01-01", tz="US/Pacific"),
pd.Timestamp("2017-01-02", tz="US/Pacific"),
"a",
"b",
]
)
tm.assert_series_equal(result, expected)
def test_concat_categorical_datetime(self):
# GH-39443
df1 = DataFrame(
{"x": Series(datetime(2021, 1, 1), index=[0], dtype="category")}
)
df2 = DataFrame(
{"x": Series(datetime(2021, 1, 2), index=[1], dtype="category")}
)
result = pd.concat([df1, df2])
expected = DataFrame(
{"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}
)
tm.assert_equal(result, expected)
def test_concat_categorical_unchanged(self):
# GH-12007
# test fix for when concat on categorical and float
# coerces dtype categorical -> float
df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
result = pd.concat([df, ser], axis=1)
expected = DataFrame(
{
"A": Series(["a", "b", "c", np.nan], dtype="category"),
"B": Series([0, 1, np.nan, 2], dtype="float"),
}
)
tm.assert_equal(result, expected)
def test_categorical_concat_gh7864(self):
# GH 7864
# make sure ordering is preserved
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
df["grade"] = Categorical(df["raw_grade"])
df["grade"].cat.set_categories(["e", "a", "b"])
df1 = df[0:3]
df2 = df[3:]
tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
dfx = pd.concat([df1, df2])
tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
dfa = df1._append(df2)
tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)
def test_categorical_index_upcast(self):
# GH 17629
# test upcasting to object when concatenating on categorical indexes
# with non-identical categories
a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))
res = pd.concat([a, b])
exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])
tm.assert_equal(res, exp)
a = Series([1, 2], index=Categorical(["foo", "bar"]))
b = Series([4, 3], index=Categorical(["baz", "bar"]))
res = pd.concat([a, b])
exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])
tm.assert_equal(res, exp)
def test_categorical_missing_from_one_frame(self):
# GH 25412
df1 = DataFrame({"f1": [1, 2, 3]})
df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
result = pd.concat([df1, df2], sort=True)
dtype = CategoricalDtype([4])
expected = DataFrame(
{
"f1": [1, 2, 3, 2, 3, 1],
"f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
},
index=[0, 1, 2, 0, 1, 2],
)
tm.assert_frame_equal(result, expected)
def test_concat_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/24845
c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
c3 = pd.CategoricalIndex(
["a", "a", "b", "b"], categories=["a", "b"], ordered=False
)
df1 = DataFrame({"A": [1, 2]}, index=c1)
df2 = DataFrame({"A": [3, 4]}, index=c2)
result = pd.concat((df1, df2))
expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,912 @@
from collections import (
abc,
deque,
)
from collections.abc import Iterator
from datetime import datetime
from decimal import Decimal
import numpy as np
import pytest
from pandas.errors import InvalidIndexError
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
from pandas.tests.extension.decimal import to_decimal
class TestConcatenate:
def test_append_concat(self):
# GH#1815
d1 = date_range("12/31/1990", "12/31/1999", freq="YE-DEC")
d2 = date_range("12/31/2000", "12/31/2009", freq="YE-DEC")
s1 = Series(np.random.default_rng(2).standard_normal(10), d1)
s2 = Series(np.random.default_rng(2).standard_normal(10), d2)
s1 = s1.to_period()
s2 = s2.to_period()
# drops index
result = concat([s1, s2])
assert isinstance(result.index, PeriodIndex)
assert result.index[0] == s1.index[0]
def test_concat_copy(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
df3 = DataFrame({5: "foo"}, index=range(4))
# These are actual copies.
result = concat([df, df2, df3], axis=1, copy=True)
if not using_copy_on_write:
for arr in result._mgr.arrays:
assert not any(
np.shares_memory(arr, y)
for x in [df, df2, df3]
for y in x._mgr.arrays
)
else:
for arr in result._mgr.arrays:
assert arr.base is not None
# These are the same.
result = concat([df, df2, df3], axis=1, copy=False)
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
assert arr.base is df._mgr.arrays[0].base
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
if using_array_manager:
# we get the same array object, which has no base
assert arr is df3._mgr.arrays[0]
else:
assert arr.base is not None
# Float block was consolidated.
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
result = concat([df, df2, df3, df4], axis=1, copy=False)
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
if using_array_manager or using_copy_on_write:
# this is a view on some array in either df or df4
assert any(
np.shares_memory(arr, other)
for other in df._mgr.arrays + df4._mgr.arrays
)
else:
# the block was consolidated, so we got a copy anyway
assert arr.base is None
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
# this is a view on df3
assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
def test_concat_with_group_keys(self):
# axis=0
df = DataFrame(np.random.default_rng(2).standard_normal((3, 4)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
result = concat([df, df2], keys=[0, 1])
exp_index = MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]]
)
expected = DataFrame(np.r_[df.values, df2.values], index=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1])
exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
expected = DataFrame(np.r_[df.values, df.values], index=exp_index2)
tm.assert_frame_equal(result, expected)
# axis=1
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
result = concat([df, df2], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2)
tm.assert_frame_equal(result, expected)
def test_concat_keys_specific_levels(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
level = ["three", "two", "one", "zero"]
result = concat(
pieces,
axis=1,
keys=["one", "two", "three"],
levels=[level],
names=["group_key"],
)
tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key"))
tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3]))
assert result.columns.names == ["group_key", None]
@pytest.mark.parametrize("mapping", ["mapping", "dict"])
def test_concat_mapping(self, mapping, non_dict_mapping_subclass):
constructor = dict if mapping == "dict" else non_dict_mapping_subclass
frames = constructor(
{
"foo": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"bar": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"baz": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
"qux": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
}
)
sorted_keys = list(frames.keys())
result = concat(frames)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
tm.assert_frame_equal(result, expected)
result = concat(frames, axis=1)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1)
tm.assert_frame_equal(result, expected)
keys = ["baz", "foo", "bar"]
result = concat(frames, keys=keys)
expected = concat([frames[k] for k in keys], keys=keys)
tm.assert_frame_equal(result, expected)
def test_concat_keys_and_levels(self):
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)))
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)))
levels = [["foo", "baz"], ["one", "two"]]
names = ["first", "second"]
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
levels=levels,
names=names,
)
expected = concat([df, df2, df, df2])
exp_index = MultiIndex(
levels=levels + [[0]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]],
names=names + [None],
)
expected.index = exp_index
tm.assert_frame_equal(result, expected)
# no names
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
levels=levels,
)
assert result.index.names == (None,) * 3
# no levels
result = concat(
[df, df2, df, df2],
keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
names=["first", "second"],
)
assert result.index.names == ("first", "second", None)
tm.assert_index_equal(
result.index.levels[0], Index(["baz", "foo"], name="first")
)
def test_concat_keys_levels_no_overlap(self):
# GH #1406
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
msg = "Values not found in passed level"
with pytest.raises(ValueError, match=msg):
concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
msg = "Key one not in level"
with pytest.raises(ValueError, match=msg):
concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
def test_crossed_dtypes_weird_corner(self):
columns = ["A", "B", "C", "D"]
df1 = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="f8"),
"B": np.array([1, 2, 3, 4], dtype="i8"),
"C": np.array([1, 2, 3, 4], dtype="f8"),
"D": np.array([1, 2, 3, 4], dtype="i8"),
},
columns=columns,
)
df2 = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="i8"),
"B": np.array([1, 2, 3, 4], dtype="f8"),
"C": np.array([1, 2, 3, 4], dtype="i8"),
"D": np.array([1, 2, 3, 4], dtype="f8"),
},
columns=columns,
)
appended = concat([df1, df2], ignore_index=True)
expected = DataFrame(
np.concatenate([df1.values, df2.values], axis=0), columns=columns
)
tm.assert_frame_equal(appended, expected)
df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
result = concat([df, df2], keys=["one", "two"], names=["first", "second"])
assert result.index.names == ("first", "second")
def test_with_mixed_tuples(self, sort):
# 10697
# columns have mixed tuples, so handle properly
df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2))
df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2))
# it works
concat([df1, df2], sort=sort)
def test_concat_mixed_objs_columns(self):
# Test column-wise concat for mixed series/frames (axis=1)
# G2385
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0]
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1]
)
result = concat([s1, s2], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
)
result = concat([s1, s2, s1], axis=1)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3]
)
result = concat([s1, df, s2, s2, s1], axis=1)
tm.assert_frame_equal(result, expected)
# with names
s1.name = "foo"
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0]
)
result = concat([s1, df, s2], axis=1)
tm.assert_frame_equal(result, expected)
s2.name = "bar"
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"]
)
result = concat([s1, df, s2], axis=1)
tm.assert_frame_equal(result, expected)
# ignore index
expected = DataFrame(
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
)
result = concat([s1, df, s2], axis=1, ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_concat_mixed_objs_index(self):
# Test row-wise concat for mixed series/frames with a common name
# GH2385, GH15047
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0]
)
result = concat([s1, df, s2])
tm.assert_frame_equal(result, expected)
def test_concat_mixed_objs_index_names(self):
# Test row-wise concat for mixed series/frames with distinct names
# GH2385, GH15047
index = date_range("01-Jan-2013", periods=10, freq="h")
arr = np.arange(10, dtype="int64")
s1 = Series(arr, index=index, name="foo")
s2 = Series(arr, index=index, name="bar")
df = DataFrame(arr.reshape(-1, 1), index=index)
expected = DataFrame(
np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T,
index=index.tolist() * 3,
columns=["foo", 0, "bar"],
)
result = concat([s1, df, s2])
tm.assert_frame_equal(result, expected)
# Rename all series to 0 when ignore_index=True
expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
result = concat([s1, df, s2], ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_dtype_coercion(self):
# 12411
df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
# 12045
df = DataFrame({"date": [datetime(2012, 1, 1), datetime(1012, 1, 2)]})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
# 11594
df = DataFrame({"text": ["some words"] + [None] * 9})
result = concat([df.iloc[[0]], df.iloc[[1]]])
tm.assert_series_equal(result.dtypes, df.dtypes)
def test_concat_single_with_key(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
result = concat([df], keys=["foo"])
expected = concat([df, df], keys=["foo", "bar"])
tm.assert_frame_equal(result, expected[:10])
def test_concat_no_items_raises(self):
with pytest.raises(ValueError, match="No objects to concatenate"):
concat([])
def test_concat_exclude_none(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
pieces = [df[:5], None, None, df[5:]]
result = concat(pieces)
tm.assert_frame_equal(result, df)
with pytest.raises(ValueError, match="All objects passed were None"):
concat([None, None])
def test_concat_keys_with_none(self):
# #1649
df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
result = concat({"a": None, "b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
expected = concat({"b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
tm.assert_frame_equal(result, expected)
result = concat(
[None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"]
)
expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"])
tm.assert_frame_equal(result, expected)
def test_concat_bug_1719(self):
ts1 = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
ts2 = ts1.copy()[::2]
# to join with union
# these two are of different length!
left = concat([ts1, ts2], join="outer", axis=1)
right = concat([ts2, ts1], join="outer", axis=1)
assert len(left) == len(right)
def test_concat_bug_2972(self):
ts0 = Series(np.zeros(5))
ts1 = Series(np.ones(5))
ts0.name = ts1.name = "same name"
result = concat([ts0, ts1], axis=1)
expected = DataFrame({0: ts0, 1: ts1})
expected.columns = ["same name", "same name"]
tm.assert_frame_equal(result, expected)
def test_concat_bug_3602(self):
# GH 3602, duplicate columns
df1 = DataFrame(
{
"firmNo": [0, 0, 0, 0],
"prc": [6, 6, 6, 6],
"stringvar": ["rrr", "rrr", "rrr", "rrr"],
}
)
df2 = DataFrame(
{"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]}
)
expected = DataFrame(
[
[0, 6, "rrr", 9, 1, 6],
[0, 6, "rrr", 10, 2, 6],
[0, 6, "rrr", 11, 3, 6],
[0, 6, "rrr", 12, 4, 6],
]
)
expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"]
result = concat([df1, df2], axis=1)
tm.assert_frame_equal(result, expected)
def test_concat_iterables(self):
# GH8645 check concat works with tuples, list, generators, and weird
# stuff like deque and custom iterables
df1 = DataFrame([1, 2, 3])
df2 = DataFrame([4, 5, 6])
expected = DataFrame([1, 2, 3, 4, 5, 6])
tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
tm.assert_frame_equal(
concat((df for df in (df1, df2)), ignore_index=True), expected
)
tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
class CustomIterator1:
def __len__(self) -> int:
return 2
def __getitem__(self, index):
try:
return {0: df1, 1: df2}[index]
except KeyError as err:
raise IndexError from err
tm.assert_frame_equal(concat(CustomIterator1(), ignore_index=True), expected)
class CustomIterator2(abc.Iterable):
def __iter__(self) -> Iterator:
yield df1
yield df2
tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected)
def test_concat_order(self):
# GH 17344, GH#47331
dfs = [DataFrame(index=range(3), columns=["a", 1, None])]
dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)]
result = concat(dfs, sort=True).columns
expected = Index([1, "a", None])
tm.assert_index_equal(result, expected)
def test_concat_different_extension_dtypes_upcasts(self):
a = Series(pd.array([1, 2], dtype="Int64"))
b = Series(to_decimal([1, 2]))
result = concat([a, b], ignore_index=True)
expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object)
tm.assert_series_equal(result, expected)
def test_concat_ordered_dict(self):
# GH 21510
expected = concat(
[Series(range(3)), Series(range(4))], keys=["First", "Another"]
)
result = concat({"First": Series(range(3)), "Another": Series(range(4))})
tm.assert_series_equal(result, expected)
def test_concat_duplicate_indices_raise(self):
# GH 45888: test raise for concat DataFrames with duplicate indices
# https://github.com/pandas-dev/pandas/issues/36263
df1 = DataFrame(
np.random.default_rng(2).standard_normal(5),
index=[0, 1, 2, 3, 3],
columns=["a"],
)
df2 = DataFrame(
np.random.default_rng(2).standard_normal(5),
index=[0, 1, 2, 2, 4],
columns=["b"],
)
msg = "Reindexing only valid with uniquely valued Index objects"
with pytest.raises(InvalidIndexError, match=msg):
concat([df1, df2], axis=1)
def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series):
# GH 13247
dims = frame_or_series(dtype=object).ndim
dt = float_numpy_dtype
dfs = [
frame_or_series(np.array([1], dtype=dt, ndmin=dims)),
frame_or_series(np.array([np.nan], dtype=dt, ndmin=dims)),
frame_or_series(np.array([5], dtype=dt, ndmin=dims)),
]
x = concat(dfs)
assert x.values.dtype == dt
@pytest.mark.parametrize("pdt", [Series, DataFrame])
def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype):
dt = any_signed_int_numpy_dtype
dims = pdt().ndim
dfs = [
pdt(np.array([1], dtype=dt, ndmin=dims)),
pdt(np.array([np.nan], ndmin=dims)),
pdt(np.array([5], dtype=dt, ndmin=dims)),
]
x = concat(dfs)
assert x.values.dtype == "float64"
def test_concat_empty_and_non_empty_frame_regression():
    """Concat of a one-row int frame with an empty frame; a float column is expected."""
    # GH 18178 regression test
    df1 = DataFrame({"foo": [1]})
    df2 = DataFrame({"foo": []})
    expected = DataFrame({"foo": [1.0]})
    result = concat([df1, df2])
    tm.assert_frame_equal(result, expected)
def test_concat_sparse():
    """Column-wise concat of sparse Series gives a frame with sparse columns."""
    # GH 23557
    a = Series(SparseArray([0, 1, 2]))
    expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
        pd.SparseDtype(np.int64, 0)
    )
    result = concat([a, a], axis=1)
    tm.assert_frame_equal(result, expected)
def test_concat_dense_sparse():
    """Row-wise concat of a sparse and a dense float Series gives a sparse result."""
    # GH 30668
    dtype = pd.SparseDtype(np.float64, None)
    a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
    b = Series([1], dtype=float)
    expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
    result = concat([a, b], axis=0)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]])
def test_duplicate_keys(keys):
    """Repeated ``keys`` are allowed for an axis=1 concat of a frame and two Series."""
    # GH 33654
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    s1 = Series([7, 8, 9], name="c")
    s2 = Series([10, 11, 12], name="d")
    result = concat([df, s1, s2], axis=1, keys=keys)
    expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
    # The frame contributes two columns under keys[0]; each Series one column.
    expected_columns = MultiIndex.from_tuples(
        [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")]
    )
    expected = DataFrame(expected_values, columns=expected_columns)
    tm.assert_frame_equal(result, expected)
def test_duplicate_keys_same_frame():
    """The same frame concatenated twice on axis=1 under one repeated key."""
    # GH 43595
    keys = ["e", "e"]
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = concat([df, df], axis=1, keys=keys)
    expected_values = [[1, 4, 1, 4], [2, 5, 2, 5], [3, 6, 3, 6]]
    expected_columns = MultiIndex.from_tuples(
        [(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")]
    )
    expected = DataFrame(expected_values, columns=expected_columns)
    tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
def test_concat_preserves_subclass(obj):
    """The result of concat has the same (sub)class as its inputs."""
    # GH28330 -- preserve subclass

    result = concat([obj, obj])
    assert isinstance(result, type(obj))
def test_concat_frame_axis0_extension_dtypes():
    """Int64 extension dtype wins over a NumPy integer column, in either order."""
    # preserve extension dtype (through common_dtype mechanism)
    df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
    df2 = DataFrame({"a": np.array([4, 5, 6])})

    result = concat([df1, df2], ignore_index=True)
    expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)

    result = concat([df2, df1], ignore_index=True)
    expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)
def test_concat_preserves_extension_int64_dtype():
    """Int64 columns stay Int64 when concat fills in missing values."""
    # GH 24768
    df_a = DataFrame({"a": [-1]}, dtype="Int64")
    df_b = DataFrame({"b": [1]}, dtype="Int64")
    result = concat([df_a, df_b], ignore_index=True)
    expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype1,dtype2,expected_dtype",
    [
        ("bool", "bool", "bool"),
        ("boolean", "bool", "boolean"),
        ("bool", "boolean", "boolean"),
        ("boolean", "boolean", "boolean"),
    ],
)
def test_concat_bool_types(dtype1, dtype2, expected_dtype):
    """Result dtype for each combination of NumPy ``bool`` and nullable ``boolean``."""
    # GH 42800
    ser1 = Series([True, False], dtype=dtype1)
    ser2 = Series([False, True], dtype=dtype2)
    result = concat([ser1, ser2], ignore_index=True)
    expected = Series([True, False, False, True], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    ("keys", "integrity"),
    [
        (["red"] * 3, True),
        (["red"] * 3, False),
        (["red", "blue", "red"], False),
        (["red", "blue", "red"], True),
    ],
)
def test_concat_repeated_keys(keys, integrity):
    """Repeated ``keys`` are accepted with and without ``verify_integrity``."""
    # GH: 20816
    series_list = [Series({"a": 1}), Series({"b": 2}), Series({"c": 3})]
    result = concat(series_list, keys=keys, verify_integrity=integrity)
    # Each key is paired with the single label of its Series.
    tuples = list(zip(keys, ["a", "b", "c"]))
    expected = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples))
    tm.assert_series_equal(result, expected)
def test_concat_null_object_with_dti():
    """axis="columns" concat of a frame indexed by [None] with a tz-aware datetime-indexed frame."""
    # GH#40841
    dti = pd.DatetimeIndex(
        ["2021-04-08 21:21:14+00:00"], dtype="datetime64[ns, UTC]", name="Time (UTC)"
    )
    right = DataFrame(data={"C": [0.5274]}, index=dti)

    idx = Index([None], dtype="object", name="Maybe Time (UTC)")
    left = DataFrame(data={"A": [None], "B": [np.nan]}, index=idx)

    result = concat([left, right], axis="columns")

    # The expected row index is object dtype holding None and the timestamp.
    exp_index = Index([None, dti[0]], dtype=object)
    expected = DataFrame(
        {
            "A": np.array([None, np.nan], dtype=object),
            "B": [np.nan, np.nan],
            "C": [np.nan, 0.5274],
        },
        index=exp_index,
    )
    tm.assert_frame_equal(result, expected)
def test_concat_multiindex_with_empty_rangeindex():
    """MultiIndex columns are kept when the other frame has empty RangeIndex columns."""
    # GH#41234
    mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
    df1 = DataFrame([[1, 2]], columns=mi)
    df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))

    result = concat([df1, df2])
    expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        Series(data=[1, 2]),
        DataFrame(
            data={
                "col1": [1, 2],
            }
        ),
        DataFrame(dtype=float),
        Series(dtype=float),
    ],
)
def test_concat_drop_attrs(data):
    """``attrs`` that differ between the inputs are not carried to the result."""
    # GH#41828
    df1 = data.copy()
    df1.attrs = {1: 1}
    df2 = data.copy()
    df2.attrs = {1: 2}
    df = concat([df1, df2])
    assert len(df.attrs) == 0
@pytest.mark.parametrize(
    "data",
    [
        Series(data=[1, 2]),
        DataFrame(
            data={
                "col1": [1, 2],
            }
        ),
        DataFrame(dtype=float),
        Series(dtype=float),
    ],
)
def test_concat_retain_attrs(data):
    """``attrs`` that are identical on all inputs are carried to the result."""
    # GH#41828
    df1 = data.copy()
    df1.attrs = {1: 1}
    df2 = data.copy()
    df2.attrs = {1: 1}
    df = concat([df1, df2])
    assert df.attrs[1] == 1
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
    """Concat an empty frame of ``empty_dtype`` in front of a ``df_dtype`` frame.

    https://github.com/pandas-dev/pandas/issues/45637
    """
    populated = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
    blank = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    expects_warning = df_dtype == "datetime64[ns]" or (
        df_dtype == "float64" and empty_dtype != "float64"
    )
    warn = FutureWarning if expects_warning else None

    with tm.assert_produces_warning(warn, match=msg):
        result = concat([blank, populated])

    if df_dtype == "int64":
        # TODO what exact behaviour do we want for integer eventually?
        target_dtype = "float64" if empty_dtype == "float64" else "object"
        expected = populated.astype(target_dtype)
    else:
        expected = populated
    tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
    """Concat a one-row all-NaN frame in front of a ``df_dtype`` frame."""
    populated = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
    all_na = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)

    if df_dtype == "int64":
        # TODO what exact behaviour do we want for integer eventually?
        result_dtype = "object" if empty_dtype == "object" else "float64"
    else:
        result_dtype = df_dtype

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    dtype_mismatch = empty_dtype is not None and empty_dtype != result_dtype
    if dtype_mismatch or result_dtype == "datetime64[ns]":
        warn = FutureWarning
    else:
        warn = None

    with tm.assert_produces_warning(warn, match=msg):
        result = concat([all_na, populated], ignore_index=True)

    expected = DataFrame(
        {"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=result_dtype
    )
    tm.assert_frame_equal(result, expected)
@td.skip_array_manager_invalid_test
def test_concat_ignore_empty_from_reindex():
    """Concat a frame with another frame reindexed onto the same columns.

    https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856

    ``df2`` has no "b" column, so ``aligned`` is ``df2`` reindexed to the
    columns of ``df1``. The concat is expected to emit a FutureWarning and
    to produce the timestamp followed by ``NaT`` in column "b".
    """
    df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
    df2 = DataFrame({"a": [2]})

    aligned = df2.reindex(columns=df1.columns)

    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = concat([df1, aligned], ignore_index=True)

    # Bind only ``expected``; the input frame ``df1`` is not rebound.
    expected = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
    tm.assert_frame_equal(result, expected)
def test_concat_mismatched_keys_length():
# GH#43485
ser = Series(range(5))
sers = [ser + n for n in range(4)]
keys = ["A", "B", "C"]
msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
concat(sers, keys=keys, axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat(sers, keys=keys, axis=0)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat((x for x in sers), keys=(y for y in keys), axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
concat((x for x in sers), keys=(y for y in keys), axis=0)
def test_concat_multiindex_with_category():
    """Concat two frames indexed by a pair of categorical columns."""

    def _indexed_frame(c1, c2, i2):
        # Build the frame, then move both categorical columns into the index.
        frame = DataFrame(
            {
                "c1": Series(list(c1), dtype="category"),
                "c2": Series(list(c2), dtype="category"),
                "i2": Series(i2),
            }
        )
        return frame.set_index(["c1", "c2"])

    top = _indexed_frame("abc", "eee", [1, 2, 3])
    bottom = _indexed_frame("abc", "eee", [4, 5, 6])

    result = concat([top, bottom])

    expected = _indexed_frame("abcabc", "eeeeee", [1, 2, 3, 4, 5, 6])
    tm.assert_frame_equal(result, expected)
def test_concat_ea_upcast():
    """Concat a "string" dtype frame with an "Int64" dtype frame.

    GH#54848
    """
    text_frame = DataFrame(["a"], dtype="string")
    integer_frame = DataFrame([1], dtype="Int64")

    result = concat([text_frame, integer_frame])

    # Both inputs keep their own row label 0.
    expected = DataFrame(["a", 1], index=[0, 0])
    tm.assert_frame_equal(result, expected)
def test_concat_none_with_timezone_timestamp():
# GH#52093
df1 = DataFrame([{"A": None}])
df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df1, df2], ignore_index=True)
expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,230 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
concat,
)
import pandas._testing as tm
class TestDataFrameConcat:
    """Tests of ``concat`` on DataFrame inputs (plus a few Series cases)."""

    def test_concat_multiple_frames_dtypes(self):
        """Per-column dtypes of an axis=1 concat of float64 and float32 frames.

        GH#2759
        """
        float64_frame = DataFrame(
            data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64
        )
        float32_frame = DataFrame(data=np.ones((10, 2)), dtype=np.float32)

        results = concat((float64_frame, float32_frame), axis=1).dtypes

        expected_dtypes = [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2
        expected = Series(expected_dtypes, index=["foo", "bar", 0, 1])
        tm.assert_series_equal(results, expected)

    def test_concat_tuple_keys(self):
        """Tuple ``keys`` become the leading levels of the result index.

        GH#14438
        """
        ones = DataFrame(np.ones((2, 2)), columns=list("AB"))
        twos = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))

        results = concat((ones, twos), keys=[("bee", "bah"), ("bee", "boo")])

        # Columns "A" and "B" carry the same values under the same labels.
        column_values = {
            ("bee", "bah", 0): 1.0,
            ("bee", "bah", 1): 1.0,
            ("bee", "boo", 0): 2.0,
            ("bee", "boo", 1): 2.0,
            ("bee", "boo", 2): 2.0,
        }
        expected = DataFrame({"A": column_values, "B": dict(column_values)})
        tm.assert_frame_equal(results, expected)

    def test_concat_named_keys(self):
        """Level names come from a named ``keys`` Index or from ``names``.

        GH#14252
        """

        def _expected(level_names):
            return DataFrame(
                {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
                index=pd.MultiIndex.from_product(
                    (["a", "b"], [0, 1]), names=level_names
                ),
            )

        frame = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
        named_keys = Index(["a", "b"], name="baz")
        unnamed_keys = Index(["a", "b"], name=None)
        expected_named = _expected(["baz", None])

        # name taken from the keys Index itself
        from_keys = concat([frame, frame], keys=named_keys)
        tm.assert_frame_equal(from_keys, expected_named)

        # name supplied through ``names``
        from_names = concat([frame, frame], keys=unnamed_keys, names=["baz"])
        tm.assert_frame_equal(from_names, expected_named)

        # no name anywhere
        unnamed = concat([frame, frame], keys=unnamed_keys)
        tm.assert_frame_equal(unnamed, _expected([None, None]))

    def test_concat_axis_parameter(self):
        """String axis aliases and integer axes give the same results.

        GH#14369
        """
        left = DataFrame({"A": [0.1, 0.2]}, index=range(2))
        right = DataFrame({"A": [0.3, 0.4]}, index=range(2))

        # Index/row/0 DataFrame
        stacked_frame = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
        for row_axis in ("index", "rows", 0):
            stacked = concat([left, right], axis=row_axis)
            tm.assert_frame_equal(stacked, stacked_frame)

        # Columns/1 DataFrame
        paired_frame = DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
        )
        for column_axis in ("columns", 1):
            paired = concat([left, right], axis=column_axis)
            tm.assert_frame_equal(paired, paired_frame)

        series1 = Series([0.1, 0.2])
        series2 = Series([0.3, 0.4])

        # Index/row/0 Series
        stacked_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
        for row_axis in ("index", "rows", 0):
            stacked = concat([series1, series2], axis=row_axis)
            tm.assert_series_equal(stacked, stacked_series)

        # Columns/1 Series
        paired_series_frame = DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
        )
        for column_axis in ("columns", 1):
            paired = concat([series1, series2], axis=column_axis)
            tm.assert_frame_equal(paired, paired_series_frame)

        # Testing ValueError
        with pytest.raises(ValueError, match="No axis named"):
            concat([series1, series2], axis="something")

    def test_concat_numerical_names(self):
        """Integer level names are kept on the result index.

        GH#15262, GH#12223
        """
        full_index = pd.MultiIndex.from_product(
            [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
        )
        frame = DataFrame({"col": range(9)}, dtype="int32", index=full_index)

        # first two rows followed by last two rows
        result = concat((frame.iloc[:2, :], frame.iloc[-2:, :]))

        kept_labels = [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")]
        expected = DataFrame(
            {"col": [0, 1, 7, 8]},
            dtype="int32",
            index=pd.MultiIndex.from_tuples(kept_labels, names=[1, 2]),
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_astype_dup_col(self):
        """``astype("category")`` on a concat result with duplicate columns.

        GH#23049
        """
        single = DataFrame([{"a": "b"}])
        duplicated = concat([single, single], axis=1)

        result = duplicated.astype("category")

        raw = DataFrame(np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"])
        expected = raw.astype("category")
        tm.assert_frame_equal(result, expected)

    def test_concat_dataframe_keys_bug(self, sort):
        """Column keys pair with the column name for unequal-length frames."""
        long_ids = Index(["a", "b", "c"], name="id")
        short_ids = Index(["a", "b"], name="id")
        t1 = DataFrame({"value": Series([1, 2, 3], index=long_ids)})
        t2 = DataFrame({"value": Series([7, 8], index=short_ids)})

        # it works
        result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)

        assert list(result.columns) == [("t1", "value"), ("t2", "value")]

    def test_concat_bool_with_int(self):
        """Concat of bool and int64 frames equals concat after casting bool to int64."""
        # GH#42092 we may want to change this to return object, but that
        # would need a deprecation
        flags = DataFrame(Series([True, False, True, True], dtype="bool"))
        numbers = DataFrame(Series([1, 0, 1], dtype="int64"))

        result = concat([flags, numbers])

        expected = concat([flags.astype("int64"), numbers])
        tm.assert_frame_equal(result, expected)

    def test_concat_duplicates_in_index_with_keys(self):
        """Duplicate index labels are kept; the level holds unique values.

        GH#42651
        """
        index = [1, 1, 3]
        data = [1, 2, 3]
        frame = DataFrame(data=data, index=index)

        result = concat([frame], keys=["A"], names=["ID", "date"])

        expected_index = pd.MultiIndex.from_product(
            [["A"], index], names=["ID", "date"]
        )
        expected = DataFrame(data=data, index=expected_index)
        tm.assert_frame_equal(result, expected)

        inner_level = result.index.levels[1]
        tm.assert_index_equal(inner_level, Index([1, 3], name="date"))

    @pytest.mark.parametrize("ignore_index", [True, False])
    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
        """With ``copy=True`` no result column shares memory with the input."""
        # based on asv ConcatDataFrames
        source = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order))
        res = concat([source] * 5, axis=axis, ignore_index=ignore_index, copy=True)

        if using_copy_on_write:
            # memory sharing is only checked without copy-on-write
            return

        for result_array in res._iter_column_arrays():
            for source_array in source._iter_column_arrays():
                assert not np.shares_memory(result_array, source_array)

    def test_outer_sort_columns(self):
        """``sort=True`` with an outer join on mixed int/str column labels.

        GH#47127
        """
        wide = DataFrame({"A": [0], "B": [1], 0: 1})
        narrow = DataFrame({"A": [100]})

        result = concat([wide, narrow], ignore_index=True, join="outer", sort=True)

        expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
        tm.assert_frame_equal(result, expected)

    def test_inner_sort_columns(self):
        """``sort=True`` with an inner join on mixed int/str column labels.

        GH#47127
        """
        wide = DataFrame({"A": [0], "B": [1], 0: 1})
        narrow = DataFrame({"A": [100], 0: 2})

        result = concat([wide, narrow], ignore_index=True, join="inner", sort=True)

        expected = DataFrame({0: [1, 2], "A": [0, 100]})
        tm.assert_frame_equal(result, expected)

    def test_sort_columns_one_df(self):
        """``sort=True`` is applied even when a single frame is passed.

        GH#47127
        """
        only = DataFrame({"A": [100], 0: 2})

        result = concat([only], ignore_index=True, join="inner", sort=True)

        expected = DataFrame({0: [2], "A": [100]})
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,606 @@
import datetime as dt
from datetime import datetime
import dateutil
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
concat,
date_range,
to_timedelta,
)
import pandas._testing as tm
class TestDatetimeConcat:
    """Tests of ``concat`` on datetime-like data, indexes and NaT values."""

    def test_concat_datetime64_block(self):
        """Both halves of a doubled datetime frame equal the source range."""
        stamps = date_range("1/1/2000", periods=10)
        frame = DataFrame({"time": stamps})

        result = concat([frame, frame])

        first_half = result.iloc[:10]
        second_half = result.iloc[10:]
        assert (first_half["time"] == stamps).all()
        assert (second_half["time"] == stamps).all()

    def test_concat_datetime_datetime64_frame(self):
        """Smoke test: datetime64 frame plus a frame built from datetime records.

        GH#2624
        """
        records = [
            [datetime(2010, 1, 1), 1],
            [datetime(2010, 1, 2), "hi"],
        ]
        df2_obj = DataFrame.from_records(records, columns=["date", "test"])

        days = date_range(start="2000/1/1", freq="D", periods=10)
        df1 = DataFrame({"date": days, "test": range(10)})

        # it works!
        concat([df1, df2_obj])

    def test_concat_datetime_timezone(self):
        """axis=1 concat of frames with tz-aware indexes; concat after resample.

        GH 18523
        """
        paris = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris")
        same_span = date_range(start=paris[0], end=paris[-1], freq="h")
        df1 = DataFrame({"a": [1, 2, 3]}, index=paris)
        df2 = DataFrame({"b": [1, 2, 3]}, index=same_span)

        result = concat([df1, df2], axis=1)

        paris_labels = [
            "2011-01-01 00:00:00+01:00",
            "2011-01-01 01:00:00+01:00",
            "2011-01-01 02:00:00+01:00",
        ]
        exp_idx = DatetimeIndex(paris_labels, dtype="M8[ns, Europe/Paris]", freq="h")
        expected = DataFrame(
            [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"]
        )
        tm.assert_frame_equal(result, expected)

        # second frame in a different timezone
        tokyo = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo")
        df3 = DataFrame({"b": [1, 2, 3]}, index=tokyo)

        result = concat([df1, df3], axis=1)

        utc_labels = [
            "2010-12-31 15:00:00+00:00",
            "2010-12-31 16:00:00+00:00",
            "2010-12-31 17:00:00+00:00",
            "2010-12-31 23:00:00+00:00",
            "2011-01-01 00:00:00+00:00",
            "2011-01-01 01:00:00+00:00",
        ]
        exp_idx = DatetimeIndex(utc_labels).as_unit("ns")
        expected_rows = [
            [np.nan, 1],
            [np.nan, 2],
            [np.nan, 3],
            [1, np.nan],
            [2, np.nan],
            [3, np.nan],
        ]
        expected = DataFrame(expected_rows, index=exp_idx, columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

        # GH 13783: Concat after resample
        resampled = [df1.resample("h").mean(), df2.resample("h").mean()]
        result = concat(resampled, sort=True)
        expected = DataFrame(
            {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
            index=paris.append(paris),
        )
        tm.assert_frame_equal(result, expected)

    def test_concat_datetimeindex_freq(self):
        """Concat of slices of a frame indexed by a regular tz-aware range.

        GH 3232
        """
        # Monotonic index result
        dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC")
        data = list(range(100))
        full = DataFrame(data, index=dr)
        head, tail = full[:50], full[50:]

        result = concat([head, tail])
        tm.assert_frame_equal(result, full)

        # Non-monotonic index result
        result = concat([tail, head])
        swapped_index = dr[50:].append(dr[:50])
        expected = DataFrame(data[50:] + data[:50], index=swapped_index)
        expected.index._data.freq = None
        tm.assert_frame_equal(result, expected)

    def test_concat_multiindex_datetime_object_index(self):
        """axis=1 concat of Series whose MultiIndex has an object level of dates."""
        # https://github.com/pandas-dev/pandas/issues/11058
        dates = Index(
            [dt.date(2013, 1, 1), dt.date(2014, 1, 1), dt.date(2015, 1, 1)],
            dtype="object",
        )
        level_names = ["first", "second"]

        first_index = MultiIndex.from_arrays([[1, 2], dates[:-1]], names=level_names)
        s = Series(["a", "b"], index=first_index)

        second_index = MultiIndex.from_arrays([[1, 2], dates[::2]], names=level_names)
        s2 = Series(["a", "b"], index=second_index)

        mi = MultiIndex.from_arrays([[1, 2, 2], dates], names=level_names)
        assert mi.levels[1].dtype == object

        expected = DataFrame(
            [["a", "a"], ["b", np.nan], [np.nan, "b"]],
            index=mi,
        )
        result = concat([s, s2], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_concat_NaT_series(self):
        """Concat a tz-aware datetime Series with an all-NaT Series of that tz.

        GH 11693
        """
        # test for merging NaT series with datetime series.
        hours = date_range(
            "20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern"
        )
        x = Series(hours)
        y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")

        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT with tz
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
        result = concat([y, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_NaT_series2(self):
        """Same as ``test_concat_NaT_series`` but with tz-naive data."""
        # without tz
        x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h"))
        y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h"))
        y[:] = pd.NaT

        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

        # all NaT without tz
        x[:] = pd.NaT
        expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_concat_NaT_dataframes(self, tz):
        """Concat an all-NaT frame with a frame of Timestamps in the same tz.

        GH 12396
        """
        stamps = [Timestamp("2015/01/01", tz=tz), Timestamp("2016/01/01", tz=tz)]

        first = DataFrame({0: DatetimeIndex([pd.NaT, pd.NaT], tz=tz)})
        second = DataFrame([[stamp] for stamp in stamps], index=[2, 3])

        expected = DataFrame([pd.NaT, pd.NaT, *stamps])
        result = concat([first, second], axis=0)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")])
    def test_concat_NaT_dataframes_all_NaT_axis_0(
        self, tz1, tz2, item, using_array_manager
    ):
        """Row-wise concat of an all-NaT frame (``tz1``) with ``item`` (``tz2``).

        GH 12396
        """

        def _localize(frame, tz):
            return frame.apply(lambda col: col.dt.tz_localize(tz))

        first = _localize(DataFrame([[pd.NaT], [pd.NaT]]), tz1)
        second = _localize(DataFrame([item]), tz2)

        result = concat([first, second], axis=0)

        expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0]))
        expected = _localize(expected, tz2)
        if tz1 != tz2:
            expected = expected.astype(object)
            if item is pd.NaT and not using_array_manager:
                # GH#18463
                # TODO: setting nan here is to keep the test passing as we
                #  make assert_frame_equal stricter, but is nan really the
                #  ideal behavior here?
                rows = -1 if tz1 is not None else slice(None, -1)
                expected.iloc[rows, 0] = np.nan

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
        """Column-wise concat of all-NaT frames localized to ``tz1`` / ``tz2``.

        GH 12396
        """

        def _nat_series(length, tz):
            return Series([pd.NaT] * length).dt.tz_localize(tz)

        first = DataFrame(_nat_series(2, tz1))
        second = DataFrame(_nat_series(1, tz2), columns=[1])

        expected = DataFrame({0: _nat_series(2, tz1), 1: _nat_series(2, tz2)})
        result = concat([first, second], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz1", [None, "UTC"])
    @pytest.mark.parametrize("tz2", [None, "UTC"])
    def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
        """Concat an all-NaT Series (``tz1``) with a Timestamp frame (``tz2``).

        GH 12396
        """
        stamps = [Timestamp("2015/01/01", tz=tz2), Timestamp("2016/01/01", tz=tz2)]

        first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
        second = DataFrame([[stamp] for stamp in stamps], index=[2, 3])

        expected = DataFrame([pd.NaT, pd.NaT, *stamps])
        if tz1 != tz2:
            expected = expected.astype(object)

        result = concat([first, second])
        tm.assert_frame_equal(result, expected)
class TestTimezoneConcat:
    """Tests of ``concat`` on timezone-aware data."""

    @staticmethod
    def _localized_frame(moments, unit, tz):
        """Build a one-column ``M8[unit]`` frame and localize column 0 to ``tz``."""
        frame = DataFrame([[moment] for moment in moments], dtype=f"M8[{unit}]")
        frame[0] = frame[0].dt.tz_localize(tz)
        return frame

    def test_concat_tz_series(self):
        """tz-aware and tz-naive Series concat to object dtype."""
        # gh-11755: tz and no tz
        aware = Series(
            date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")
        )
        naive = Series(date_range("2012-01-01", "2012-01-02"))

        expected = Series([aware[0], aware[1], naive[0], naive[1]], dtype="object")
        result = concat([aware, naive], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series2(self):
        """tz-aware Series and a string Series concat to object dtype."""
        # gh-11887: concat tz and object
        aware = Series(
            date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")
        )
        letters = Series(["a", "b"])

        expected = Series([aware[0], aware[1], letters[0], letters[1]], dtype="object")
        result = concat([aware, letters], ignore_index=True)
        tm.assert_series_equal(result, expected)

    def test_concat_tz_series3(self, unit, unit2):
        """Result unit is the finest of the two inputs (UTC)."""
        # see gh-12217 and gh-12306
        # Concatenating two UTC times
        first = self._localized_frame([datetime(2016, 1, 1)], unit, "UTC")
        second = self._localized_frame([datetime(2016, 1, 2)], unit2, "UTC")

        result = concat([first, second])

        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, UTC]"

    def test_concat_tz_series4(self, unit, unit2):
        """Result unit is the finest of the two inputs (Europe/London)."""
        # Concatenating two London times
        first = self._localized_frame([datetime(2016, 1, 1)], unit, "Europe/London")
        second = self._localized_frame([datetime(2016, 1, 2)], unit2, "Europe/London")

        result = concat([first, second])

        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series5(self, unit, unit2):
        """Same as series4 with two rows in the first frame."""
        # Concatenating 2+1 London times
        first = self._localized_frame(
            [datetime(2016, 1, 1), datetime(2016, 1, 2)], unit, "Europe/London"
        )
        second = self._localized_frame([datetime(2016, 1, 3)], unit2, "Europe/London")

        result = concat([first, second])

        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series6(self, unit, unit2):
        """Same as series4 with two rows in the second frame."""
        # Concatenating 1+2 London times
        first = self._localized_frame([datetime(2016, 1, 1)], unit, "Europe/London")
        second = self._localized_frame(
            [datetime(2016, 1, 2), datetime(2016, 1, 3)], unit2, "Europe/London"
        )

        result = concat([first, second])

        exp_unit = tm.get_finest_unit(unit, unit2)
        assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"

    def test_concat_tz_series_tzlocal(self):
        """Series of ``tzlocal`` Timestamps keep the ``tzlocal()`` dtype."""
        # see gh-13583
        x = [
            Timestamp(day, tz=dateutil.tz.tzlocal())
            for day in ("2011-01-01", "2011-02-01")
        ]
        y = [
            Timestamp(day, tz=dateutil.tz.tzlocal())
            for day in ("2012-01-01", "2012-02-01")
        ]

        result = concat([Series(x), Series(y)], ignore_index=True)

        tm.assert_series_equal(result, Series(x + y))
        assert result.dtype == "datetime64[ns, tzlocal()]"

    def test_concat_tz_series_with_datetimelike(self):
        """tz-aware Timestamps with Timedeltas or Periods concat to object."""
        # see gh-12620: tz and timedelta
        stamps = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-02-01", tz="US/Eastern"),
        ]
        timedeltas = [pd.Timedelta("1 day"), pd.Timedelta("2 day")]
        # tz and period
        periods = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")]

        for others in (timedeltas, periods):
            result = concat([Series(stamps), Series(others)], ignore_index=True)
            tm.assert_series_equal(result, Series(stamps + others, dtype="object"))

    def test_concat_tz_frame(self):
        """Splitting a two-timezone frame by column and re-joining round-trips."""
        df2 = DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        )

        # concat
        column_frames = [df2.A.to_frame(), df2.B.to_frame()]
        df3 = concat(column_frames, axis=1)
        tm.assert_frame_equal(df2, df3)

    def test_concat_multiple_tzs(self):
        """Pairwise concat of naive, UTC and EST single-row frames.

        GH#12467
        """
        # combining datetime tz-aware and naive DataFrames
        ts_naive = Timestamp("2015-01-01", tz=None)
        ts_utc = Timestamp("2015-01-01", tz="UTC")
        ts_est = Timestamp("2015-01-01", tz="EST")

        df_naive = DataFrame({"time": [ts_naive]})
        df_utc = DataFrame({"time": [ts_utc]})
        df_est = DataFrame({"time": [ts_est]})

        # The naive pairings are built with an explicit object dtype; the
        # UTC/EST pairing leaves the dtype to be inferred.
        cases = [
            (df_naive, df_utc, DataFrame({"time": [ts_naive, ts_utc]}, dtype=object)),
            (df_naive, df_est, DataFrame({"time": [ts_naive, ts_est]}, dtype=object)),
            (df_utc, df_est, DataFrame({"time": [ts_utc, ts_est]})),
        ]
        for upper, lower, expected in cases:
            results = concat([upper, lower]).reset_index(drop=True)
            tm.assert_frame_equal(results, expected)

    def test_concat_multiindex_with_tz(self):
        """Concat a frame with itself when one index level is tz-aware.

        GH 6606
        """
        pacific = DatetimeIndex(
            [datetime(2014, 1, day) for day in (1, 2, 3)],
            dtype="M8[ns, US/Pacific]",
        )
        frame = DataFrame(
            {
                "dt": pacific,
                "b": ["A", "B", "C"],
                "c": [1, 2, 3],
                "d": [4, 5, 6],
            }
        )
        frame = frame.set_index(["dt", "b"])

        exp_dates = DatetimeIndex(
            ["2014-01-01", "2014-01-02", "2014-01-03"] * 2,
            dtype="M8[ns, US/Pacific]",
            name="dt",
        )
        exp_letters = Index(["A", "B", "C"] * 2, name="b")
        exp_idx = MultiIndex.from_arrays([exp_dates, exp_letters])
        expected = DataFrame(
            {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"]
        )

        result = concat([frame, frame])
        tm.assert_frame_equal(result, expected)

    def test_concat_tz_not_aligned(self):
        """A tz-aware column missing from one input is filled with NaT.

        GH#22796
        """
        ts = pd.to_datetime([1, 2]).tz_localize("UTC")
        one_column = DataFrame({"A": ts})
        two_columns = DataFrame({"A": ts, "B": ts})

        result = concat([one_column, two_columns], sort=True, ignore_index=True)

        stamps = list(ts)
        expected = DataFrame(
            {"A": stamps + stamps, "B": [pd.NaT, pd.NaT] + stamps}
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "t1",
        [
            "2015-01-01",
            pytest.param(
                pd.NaT,
                marks=pytest.mark.xfail(
                    reason="GH23037 incorrect dtype when concatenating"
                ),
            ),
        ],
    )
    def test_concat_tz_NaT(self, t1):
        """Concat tz-aware frames with a different number of columns.

        GH#22796
        """
        # Concatenating tz-aware multicolumn DataFrames
        ts1 = Timestamp(t1, tz="UTC")
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="UTC")

        two_columns = DataFrame([[ts1, ts2]])
        one_column = DataFrame([[ts3]])

        result = concat([two_columns, one_column])

        expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
        tm.assert_frame_equal(result, expected)

    def test_concat_tz_with_empty(self):
        """Concat a tz-aware frame with an empty frame returns the former.

        GH 9188
        """
        aware = DataFrame(date_range("2000", periods=1, tz="UTC"))
        result = concat([aware, DataFrame()])

        expected = DataFrame(date_range("2000", periods=1, tz="UTC"))
        tm.assert_frame_equal(result, expected)
class TestPeriodConcat:
    """Tests of ``concat`` on Series where at least one input is Period dtype."""

    @staticmethod
    def _period_series(dates, freq):
        """Return a Series built from a PeriodIndex of ``dates`` at ``freq``."""
        return Series(pd.PeriodIndex(dates, freq=freq))

    @staticmethod
    def _check_concat(first, second, dtype):
        """Concat two 2-element Series and compare with their values as ``dtype``.

        Returns the concatenated Series so callers can add assertions.
        """
        expected = Series([first[0], first[1], second[0], second[1]], dtype=dtype)
        result = concat([first, second], ignore_index=True)
        tm.assert_series_equal(result, expected)
        return result

    def test_concat_period_series(self):
        """Two daily Period Series keep the ``Period[D]`` dtype."""
        first = self._period_series(["2015-11-01", "2015-12-01"], "D")
        second = self._period_series(["2015-10-01", "2016-01-01"], "D")
        self._check_concat(first, second, "Period[D]")

    def test_concat_period_multiple_freq_series(self):
        """Daily and monthly Period Series concat to object dtype."""
        first = self._period_series(["2015-11-01", "2015-12-01"], "D")
        second = self._period_series(["2015-10-01", "2016-01-01"], "M")
        result = self._check_concat(first, second, "object")
        assert result.dtype == "object"

    def test_concat_period_other_series(self):
        """Same dates at daily and monthly frequency concat to object dtype."""
        first = self._period_series(["2015-11-01", "2015-12-01"], "D")
        second = self._period_series(["2015-11-01", "2015-12-01"], "M")
        result = self._check_concat(first, second, "object")
        assert result.dtype == "object"

    def test_concat_period_other_series2(self):
        """A Period Series and a datetime Series concat to object dtype."""
        # non-period
        first = self._period_series(["2015-11-01", "2015-12-01"], "D")
        second = Series(DatetimeIndex(["2015-11-01", "2015-12-01"]))
        result = self._check_concat(first, second, "object")
        assert result.dtype == "object"

    def test_concat_period_other_series3(self):
        """A Period Series and a string Series concat to object dtype."""
        first = self._period_series(["2015-11-01", "2015-12-01"], "D")
        second = Series(["A", "B"])
        result = self._check_concat(first, second, "object")
        assert result.dtype == "object"
def test_concat_timedelta64_block():
    """Both halves of a doubled timedelta frame equal the source frame."""
    deltas = to_timedelta(np.arange(10), unit="s")
    frame = DataFrame({"time": deltas})

    result = concat([frame, frame])

    first_half = result.iloc[:10]
    second_half = result.iloc[10:]
    tm.assert_frame_equal(first_half, frame)
    tm.assert_frame_equal(second_half, frame)
def test_concat_multiindex_datetime_nat():
    """Column-wise concat of frames whose MultiIndex contains NaT.

    GH#44900
    """

    def _nat_index(first_level_values):
        # pair every first-level value with NaT
        return MultiIndex.from_tuples(
            [(value, pd.NaT) for value in first_level_values]
        )

    left = DataFrame({"a": 1}, index=_nat_index([1]))
    right = DataFrame({"b": 2}, index=_nat_index([1, 2]))

    result = concat([left, right], axis="columns")

    expected = DataFrame({"a": [1.0, np.nan], "b": 2}, index=_nat_index([1, 2]))
    tm.assert_frame_equal(result, expected)
def test_concat_float_datetime64(using_array_manager):
# GH#32934
df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
df_float = DataFrame({"A": pd.array([1.0], dtype="float64")})
expected = DataFrame(
{
"A": [
pd.array(["2000"], dtype="datetime64[ns]")[0],
pd.array([1.0], dtype="float64")[0],
]
},
index=[0, 0],
)
result = concat([df_time, df_float])
tm.assert_frame_equal(result, expected)
expected = DataFrame({"A": pd.array([], dtype="object")})
result = concat([df_time.iloc[:0], df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)
expected = DataFrame({"A": pd.array([1.0], dtype="object")})
result = concat([df_time.iloc[:0], df_float])
tm.assert_frame_equal(result, expected)
if not using_array_manager:
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df_time, df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)
else:
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
{"A": "object"}
)
result = concat([df_time, df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,295 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
RangeIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
class TestEmptyConcat:
def test_handle_empty_objects(self, sort, using_infer_string):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
)
dfcopy = df[:5].copy()
dfcopy["foo"] = "bar"
empty = df[5:5]
frames = [dfcopy, empty, empty, df[5:]]
concatted = concat(frames, axis=0, sort=sort)
expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
expected["foo"] = expected["foo"].astype(
object if not using_infer_string else "string[pyarrow_numpy]"
)
expected.loc[0:4, "foo"] = "bar"
tm.assert_frame_equal(concatted, expected)
# empty as first element with time series
# GH3259
df = DataFrame(
{"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
)
empty = DataFrame()
result = concat([df, empty], axis=1)
tm.assert_frame_equal(result, df)
result = concat([empty, df], axis=1)
tm.assert_frame_equal(result, df)
result = concat([df, empty])
tm.assert_frame_equal(result, df)
result = concat([empty, df])
tm.assert_frame_equal(result, df)
def test_concat_empty_series(self):
# GH 11082
s1 = Series([1, 2, 3], name="x")
s2 = Series(name="y", dtype="float64")
res = concat([s1, s2], axis=1)
exp = DataFrame(
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
index=RangeIndex(3),
)
tm.assert_frame_equal(res, exp)
s1 = Series([1, 2, 3], name="x")
s2 = Series(name="y", dtype="float64")
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = concat([s1, s2], axis=0)
# name will be reset
exp = Series([1, 2, 3])
tm.assert_series_equal(res, exp)
# empty Series with no name
s1 = Series([1, 2, 3], name="x")
s2 = Series(name=None, dtype="float64")
res = concat([s1, s2], axis=1)
exp = DataFrame(
{"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
columns=["x", 0],
index=RangeIndex(3),
)
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize("tz", [None, "UTC"])
@pytest.mark.parametrize("values", [[], [1, 2, 3]])
def test_concat_empty_series_timelike(self, tz, values):
    """axis=1 concat of an empty datetime Series with ``values``.

    GH 18447
    """
    empty_datetimes = Series([], dtype="M8[ns]").dt.tz_localize(tz)

    # an empty ``values`` list is given an explicit float64 dtype
    second_dtype = None if values else np.float64
    second = Series(values, dtype=second_dtype)

    nat_column = Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz)
    expected = DataFrame({0: nat_column, 1: values})

    result = concat([empty_datetimes, second], axis=1)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "left,right,expected",
    [
        # booleans
        (np.bool_, np.int32, np.object_),  # changed from int32 in 2.0 GH#39817
        (np.bool_, np.float32, np.object_),
        # datetime-like
        ("m8[ns]", np.bool_, np.object_),
        ("m8[ns]", np.int64, np.object_),
        ("M8[ns]", np.bool_, np.object_),
        ("M8[ns]", np.int64, np.object_),
        # categorical
        ("category", "category", "category"),
        ("category", "object", "object"),
    ],
)
def test_concat_empty_series_dtypes(self, left, right, expected):
    """Result dtype of concatenating two empty Series of the given dtypes.

    GH#39817, GH#45101
    """
    empties = [Series(dtype=left), Series(dtype=right)]
    combined = concat(empties)
    assert combined.dtype == expected
@pytest.mark.parametrize(
"dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
)
def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
dtype = np.dtype(dtype)
result = concat([Series(dtype=dtype)])
assert result.dtype == dtype
result = concat([Series(dtype=dtype), Series(dtype=dtype)])
assert result.dtype == dtype
@pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
@pytest.mark.parametrize(
"dtype2",
["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
)
def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
# round-tripping with self & like self
if dtype == dtype2:
pytest.skip("same dtype is not applicable for test")
def int_result_type(dtype, dtype2):
typs = {dtype.kind, dtype2.kind}
if not len(typs - {"i", "u", "b"}) and (
dtype.kind == "i" or dtype2.kind == "i"
):
return "i"
elif not len(typs - {"u", "b"}) and (
dtype.kind == "u" or dtype2.kind == "u"
):
return "u"
return None
def float_result_type(dtype, dtype2):
typs = {dtype.kind, dtype2.kind}
if not len(typs - {"f", "i", "u"}) and (
dtype.kind == "f" or dtype2.kind == "f"
):
return "f"
return None
def get_result_type(dtype, dtype2):
result = float_result_type(dtype, dtype2)
if result is not None:
return result
result = int_result_type(dtype, dtype2)
if result is not None:
return result
return "O"
dtype = np.dtype(dtype)
dtype2 = np.dtype(dtype2)
expected = get_result_type(dtype, dtype2)
result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
assert result.kind == expected
def test_concat_empty_series_dtypes_triple(self):
assert (
concat(
[Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
).dtype
== np.object_
)
def test_concat_empty_series_dtype_category_with_array(self):
# GH#18515
assert (
concat(
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
).dtype
== "float64"
)
def test_concat_empty_series_dtypes_sparse(self):
result = concat(
[
Series(dtype="float64").astype("Sparse"),
Series(dtype="float64").astype("Sparse"),
]
)
assert result.dtype == "Sparse[float64]"
result = concat(
[Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
)
expected = pd.SparseDtype(np.float64)
assert result.dtype == expected
result = concat(
[Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
)
expected = pd.SparseDtype("object")
assert result.dtype == expected
def test_concat_empty_df_object_dtype(self):
# GH 9149
df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
df_2 = DataFrame(columns=df_1.columns)
result = concat([df_1, df_2], axis=0)
expected = df_1.astype(object)
tm.assert_frame_equal(result, expected)
def test_concat_empty_dataframe_dtypes(self):
df = DataFrame(columns=list("abc"))
df["a"] = df["a"].astype(np.bool_)
df["b"] = df["b"].astype(np.int32)
df["c"] = df["c"].astype(np.float64)
result = concat([df, df])
assert result["a"].dtype == np.bool_
assert result["b"].dtype == np.int32
assert result["c"].dtype == np.float64
result = concat([df, df.astype(np.float64)])
assert result["a"].dtype == np.object_
assert result["b"].dtype == np.float64
assert result["c"].dtype == np.float64
def test_concat_inner_join_empty(self):
# GH 15328
df_empty = DataFrame()
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
result = concat([df_a, df_empty], axis=1, join="inner")
tm.assert_frame_equal(result, df_expected)
result = concat([df_a, df_empty], axis=1, join="outer")
tm.assert_frame_equal(result, df_a)
def test_empty_dtype_coerce(self):
# xref to #12411
# xref to #12045
# xref to #11594
# see below
# 10571
df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
result = concat([df1, df2])
expected = df1.dtypes
tm.assert_series_equal(result.dtypes, expected)
def test_concat_empty_dataframe(self):
# 39037
df1 = DataFrame(columns=["a", "b"])
df2 = DataFrame(columns=["b", "c"])
result = concat([df1, df2, df1])
expected = DataFrame(columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
df3 = DataFrame(columns=["a", "b"])
df4 = DataFrame(columns=["b"])
result = concat([df3, df4])
expected = DataFrame(columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
# 39037
df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
df2 = DataFrame({"a": [1, 2, 3]})
result = concat([df1[:0], df2[:0]])
assert result["a"].dtype == np.int64
assert result["b"].dtype == np.object_ if not using_infer_string else "string"
def test_concat_to_empty_ea(self):
"""48510 `concat` to an empty EA should maintain type EA dtype."""
df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
expected = df_new.copy()
result = concat([df_empty, df_new])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,472 @@
from copy import deepcopy
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
class TestIndexConcat:
def test_concat_ignore_index(self, sort):
frame1 = DataFrame(
{"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
)
frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
frame1.index = Index(["x", "y", "z"])
frame2.index = Index(["x", "y", "q"])
v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
nan = np.nan
expected = DataFrame(
[
[nan, nan, nan, 4.3],
["a", 1, 4.5, 5.2],
["b", 2, 3.2, 2.2],
["c", 3, 1.2, nan],
],
index=Index(["q", "x", "y", "z"]),
)
if not sort:
expected = expected.loc[["x", "y", "z", "q"]]
tm.assert_frame_equal(v1, expected)
@pytest.mark.parametrize(
"name_in1,name_in2,name_in3,name_out",
[
("idx", "idx", "idx", "idx"),
("idx", "idx", None, None),
("idx", None, None, None),
("idx1", "idx2", None, None),
("idx1", "idx1", "idx2", None),
("idx1", "idx2", "idx3", None),
(None, None, None, None),
],
)
def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
# GH13475
indices = [
Index(["a", "b", "c"], name=name_in1),
Index(["b", "c", "d"], name=name_in2),
Index(["c", "d", "e"], name=name_in3),
]
frames = [
DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
]
result = concat(frames, axis=1)
exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
expected = DataFrame(
{
"x": [0, 1, 2, np.nan, np.nan],
"y": [np.nan, 0, 1, 2, np.nan],
"z": [np.nan, np.nan, 0, 1, 2],
},
index=exp_ind,
)
tm.assert_frame_equal(result, expected)
def test_concat_rename_index(self):
a = DataFrame(
np.random.default_rng(2).random((3, 3)),
columns=list("ABC"),
index=Index(list("abc"), name="index_a"),
)
b = DataFrame(
np.random.default_rng(2).random((3, 3)),
columns=list("ABC"),
index=Index(list("abc"), name="index_b"),
)
result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
names = list(exp.index.names)
names[1] = "lvl1"
exp.index.set_names(names, inplace=True)
tm.assert_frame_equal(result, exp)
assert result.index.names == exp.index.names
def test_concat_copy_index_series(self, axis, using_copy_on_write):
# GH 29879
ser = Series([1, 2])
comb = concat([ser, ser], axis=axis, copy=True)
if not using_copy_on_write or axis in [0, "index"]:
assert comb.index is not ser.index
else:
assert comb.index is ser.index
def test_concat_copy_index_frame(self, axis, using_copy_on_write):
# GH 29879
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
comb = concat([df, df], axis=axis, copy=True)
if not using_copy_on_write:
assert not comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)
elif axis in [0, "index"]:
assert not comb.index.is_(df.index)
assert comb.columns.is_(df.columns)
elif axis in [1, "columns"]:
assert comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)
def test_default_index(self):
# is_series and ignore_index
s1 = Series([1, 2, 3], name="x")
s2 = Series([4, 5, 6], name="y")
res = concat([s1, s2], axis=1, ignore_index=True)
assert isinstance(res.columns, pd.RangeIndex)
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
# use check_index_type=True to check the result have
# RangeIndex (default index)
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
# is_series and all inputs have no names
s1 = Series([1, 2, 3])
s2 = Series([4, 5, 6])
res = concat([s1, s2], axis=1, ignore_index=False)
assert isinstance(res.columns, pd.RangeIndex)
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
exp.columns = pd.RangeIndex(2)
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
# is_dataframe and ignore_index
df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
res = concat([df1, df2], axis=0, ignore_index=True)
exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
res = concat([df1, df2], axis=1, ignore_index=True)
exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
def test_dups_index(self):
# GH 4771
# single dtypes
df = DataFrame(
np.random.default_rng(2).integers(0, 10, size=40).reshape(10, 4),
columns=["A", "A", "C", "C"],
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result.iloc[:, :4], df)
tm.assert_frame_equal(result.iloc[:, 4:], df)
result = concat([df, df], axis=0)
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
# multi dtypes
df = concat(
[
DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=["A", "A", "B", "B"],
),
DataFrame(
np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
columns=["A", "C"],
),
],
axis=1,
)
result = concat([df, df], axis=1)
tm.assert_frame_equal(result.iloc[:, :6], df)
tm.assert_frame_equal(result.iloc[:, 6:], df)
result = concat([df, df], axis=0)
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
# append
result = df.iloc[0:8, :]._append(df.iloc[8:])
tm.assert_frame_equal(result, df)
result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
tm.assert_frame_equal(result, df)
expected = concat([df, df], axis=0)
result = df._append(df)
tm.assert_frame_equal(result, expected)
class TestMultiIndexConcat:
def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
index = frame.index
result = concat([frame, frame], keys=[0, 1], names=["iteration"])
assert result.index.names == ("iteration",) + index.names
tm.assert_frame_equal(result.loc[0], frame)
tm.assert_frame_equal(result.loc[1], frame)
assert result.index.nlevels == 3
def test_concat_multiindex_with_none_in_index_names(self):
# GH 15787
index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
result = concat([df, df], keys=[1, 2], names=["level2"])
index = MultiIndex.from_product(
[[1, 2], [1], range(5)], names=["level2", "level1", None]
)
expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
tm.assert_frame_equal(result, expected)
result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
level2 = [1] * 5 + [2] * 2
level1 = [1] * 7
no_name = list(range(5)) + list(range(2))
tuples = list(zip(level2, level1, no_name))
index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
tm.assert_frame_equal(result, expected)
def test_concat_multiindex_rangeindex(self):
# GH13542
# when multi-index levels are RangeIndex objects
# there is a bug in concat with objects of len 1
df = DataFrame(np.random.default_rng(2).standard_normal((9, 2)))
df.index = MultiIndex(
levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
)
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
exp = df.iloc[[2, 3, 4, 5], :]
tm.assert_frame_equal(res, exp)
def test_concat_multiindex_dfs_with_deepcopy(self):
# GH 9967
example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
example_dataframe1 = DataFrame([0], index=example_multiindex1)
example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
example_dataframe2 = DataFrame([1], index=example_multiindex2)
example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
expected_index = MultiIndex(
levels=[["s1", "s2"], ["a"], ["b", "c"]],
codes=[[0, 1], [0, 0], [0, 1]],
names=["testname", None, None],
)
expected = DataFrame([[0], [1]], index=expected_index)
result_copy = concat(deepcopy(example_dict), names=["testname"])
tm.assert_frame_equal(result_copy, expected)
result_no_copy = concat(example_dict, names=["testname"])
tm.assert_frame_equal(result_no_copy, expected)
@pytest.mark.parametrize(
"mi1_list",
[
[["a"], range(2)],
[["b"], np.arange(2.0, 4.0)],
[["c"], ["A", "B"]],
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
],
)
@pytest.mark.parametrize(
"mi2_list",
[
[["a"], range(2)],
[["b"], np.arange(2.0, 4.0)],
[["c"], ["A", "B"]],
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
],
)
def test_concat_with_various_multiindex_dtypes(
self, mi1_list: list, mi2_list: list
):
# GitHub #23478
mi1 = MultiIndex.from_product(mi1_list)
mi2 = MultiIndex.from_product(mi2_list)
df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
if mi1_list[0] == mi2_list[0]:
expected_mi = MultiIndex(
levels=[mi1_list[0], list(mi1_list[1])],
codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
)
else:
expected_mi = MultiIndex(
levels=[
mi1_list[0] + mi2_list[0],
list(mi1_list[1]) + list(mi2_list[1]),
],
codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
)
expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
with tm.assert_produces_warning(None):
result_df = concat((df1, df2), axis=1)
tm.assert_frame_equal(expected_df, result_df)
def test_concat_multiindex_(self):
# GitHub #44786
df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
df = concat([df], keys=["X"])
iterables = [["X"], ["1", "2", "2"]]
result_index = df.index
expected_index = MultiIndex.from_product(iterables)
tm.assert_index_equal(result_index, expected_index)
result_df = df
expected_df = DataFrame(
{"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
)
tm.assert_frame_equal(result_df, expected_df)
def test_concat_with_key_not_unique(self):
# GitHub #46519
df1 = DataFrame({"name": [1]})
df2 = DataFrame({"name": [2]})
df3 = DataFrame({"name": [3]})
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
# the warning is caused by indexing unsorted multi-index
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_a = df_a.loc[("x", 0), :]
df_b = DataFrame(
{"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
)
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_b = df_b.loc[("x", 0)]
tm.assert_frame_equal(out_a, out_b)
df1 = DataFrame({"name": ["a", "a", "b"]})
df2 = DataFrame({"name": ["a", "b"]})
df3 = DataFrame({"name": ["c", "d"]})
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_a = df_a.loc[("x", 0), :]
df_b = DataFrame(
{
"a": ["x", "x", "x", "y", "y", "x", "x"],
"b": [0, 1, 2, 0, 1, 0, 1],
"name": list("aababcd"),
}
).set_index(["a", "b"])
df_b.index.names = [None, None]
with tm.assert_produces_warning(
PerformanceWarning, match="indexing past lexsort depth"
):
out_b = df_b.loc[("x", 0), :]
tm.assert_frame_equal(out_a, out_b)
def test_concat_with_duplicated_levels(self):
# keyword levels should be unique
df1 = DataFrame({"A": [1]}, index=["x"])
df2 = DataFrame({"A": [1]}, index=["y"])
msg = r"Level values not unique: \['x', 'y', 'y'\]"
with pytest.raises(ValueError, match=msg):
concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
@pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
def test_concat_with_levels_with_none_keys(self, levels):
df1 = DataFrame({"A": [1]}, index=["x"])
df2 = DataFrame({"A": [1]}, index=["y"])
msg = "levels supported only when keys is not None"
with pytest.raises(ValueError, match=msg):
concat([df1, df2], levels=levels)
def test_concat_range_index_result(self):
# GH#47501
df1 = DataFrame({"a": [1, 2]})
df2 = DataFrame({"b": [1, 2]})
result = concat([df1, df2], sort=True, axis=1)
expected = DataFrame({"a": [1, 2], "b": [1, 2]})
tm.assert_frame_equal(result, expected)
expected_index = pd.RangeIndex(0, 2)
tm.assert_index_equal(result.index, expected_index, exact=True)
def test_concat_index_keep_dtype(self):
# GH#47329
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype="object"))
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="object"))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
)
tm.assert_frame_equal(result, expected)
def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
# GH#47329
df1 = DataFrame(
[[0, 1, 1]], columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype)
)
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=any_numeric_ea_dtype))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]],
columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
def test_concat_index_find_common(self, dtype):
# GH#47329
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
expected = DataFrame(
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
)
tm.assert_frame_equal(result, expected)
def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string):
# GH 46675
s1 = Series(["a", "b", "c"])
s2 = Series(["a", "b"])
s3 = Series(["a", "b", "c", "d"])
s4 = Series(
[], dtype=object if not using_infer_string else "string[pyarrow_numpy]"
)
result = concat(
[s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
)
expected = DataFrame(
[
["a"] * 3 + [np.nan],
["b"] * 3 + [np.nan],
["c", np.nan] * 2,
[np.nan] * 2 + ["d"] + [np.nan],
],
dtype=object if not using_infer_string else "string[pyarrow_numpy]",
)
tm.assert_frame_equal(
result, expected, check_index_type=True, check_column_type=True
)

View File

@ -0,0 +1,54 @@
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
concat,
read_csv,
)
import pandas._testing as tm
class TestInvalidConcat:
@pytest.mark.parametrize("obj", [1, {}, [1, 2], (1, 2)])
def test_concat_invalid(self, obj):
# trying to concat a ndframe with a non-ndframe
df1 = DataFrame(range(2))
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
"only Series and DataFrame objs are valid"
)
with pytest.raises(TypeError, match=msg):
concat([df1, obj])
def test_concat_invalid_first_argument(self):
df1 = DataFrame(range(2))
msg = (
"first argument must be an iterable of pandas "
'objects, you passed an object of type "DataFrame"'
)
with pytest.raises(TypeError, match=msg):
concat(df1)
def test_concat_generator_obj(self):
# generator ok though
concat(DataFrame(np.random.default_rng(2).random((5, 5))) for _ in range(3))
def test_concat_textreader_obj(self):
# text reader ok
# GH6583
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
with read_csv(StringIO(data), chunksize=1) as reader:
result = concat(reader, ignore_index=True)
expected = read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,175 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
class TestSeriesConcat:
def test_concat_series(self):
ts = Series(
np.arange(20, dtype=np.float64),
index=date_range("2020-01-01", periods=20),
name="foo",
)
ts.name = "foo"
pieces = [ts[:5], ts[5:15], ts[15:]]
result = concat(pieces)
tm.assert_series_equal(result, ts)
assert result.name == ts.name
result = concat(pieces, keys=[0, 1, 2])
expected = ts.copy()
ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]"))
exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))]
exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes)
expected.index = exp_index
tm.assert_series_equal(result, expected)
def test_concat_empty_and_non_empty_series_regression(self):
# GH 18187 regression test
s1 = Series([1])
s2 = Series([], dtype=object)
expected = s1
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([s1, s2])
tm.assert_series_equal(result, expected)
def test_concat_series_axis1(self):
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
pieces = [ts[:-2], ts[2:], ts[2:-2]]
result = concat(pieces, axis=1)
expected = DataFrame(pieces).T
tm.assert_frame_equal(result, expected)
result = concat(pieces, keys=["A", "B", "C"], axis=1)
expected = DataFrame(pieces, index=["A", "B", "C"]).T
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_preserves_series_names(self):
# preserve series names, #2489
s = Series(np.random.default_rng(2).standard_normal(5), name="A")
s2 = Series(np.random.default_rng(2).standard_normal(5), name="B")
result = concat([s, s2], axis=1)
expected = DataFrame({"A": s, "B": s2})
tm.assert_frame_equal(result, expected)
s2.name = None
result = concat([s, s2], axis=1)
tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object"))
def test_concat_series_axis1_with_reindex(self, sort):
# must reindex, #2603
s = Series(
np.random.default_rng(2).standard_normal(3), index=["c", "a", "b"], name="A"
)
s2 = Series(
np.random.default_rng(2).standard_normal(4),
index=["d", "a", "b", "c"],
name="B",
)
result = concat([s, s2], axis=1, sort=sort)
expected = DataFrame({"A": s, "B": s2}, index=["c", "a", "b", "d"])
if sort:
expected = expected.sort_index()
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_names_applied(self):
# ensure names argument is not ignored on axis=1, #23490
s = Series([1, 2, 3])
s2 = Series([4, 5, 6])
result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"])
expected = DataFrame(
[[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A")
)
tm.assert_frame_equal(result, expected)
result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"])
expected = DataFrame(
[[1, 4], [2, 5], [3, 6]],
columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]),
)
tm.assert_frame_equal(result, expected)
def test_concat_series_axis1_same_names_ignore_index(self):
dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1]
s1 = Series(
np.random.default_rng(2).standard_normal(len(dates)),
index=dates,
name="value",
)
s2 = Series(
np.random.default_rng(2).standard_normal(len(dates)),
index=dates,
name="value",
)
result = concat([s1, s2], axis=1, ignore_index=True)
expected = Index(range(2))
tm.assert_index_equal(result.columns, expected, exact=True)
@pytest.mark.parametrize(
"s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]
)
def test_concat_series_name_npscalar_tuple(self, s1name, s2name):
# GH21015
s1 = Series({"a": 1, "b": 2}, name=s1name)
s2 = Series({"c": 5, "d": 6}, name=s2name)
result = concat([s1, s2])
expected = Series({"a": 1, "b": 2, "c": 5, "d": 6})
tm.assert_series_equal(result, expected)
def test_concat_series_partial_columns_names(self):
# GH10698
named_series = Series([1, 2], name="foo")
unnamed_series1 = Series([1, 2])
unnamed_series2 = Series([4, 5])
result = concat([named_series, unnamed_series1, unnamed_series2], axis=1)
expected = DataFrame(
{"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1]
)
tm.assert_frame_equal(result, expected)
result = concat(
[named_series, unnamed_series1, unnamed_series2],
axis=1,
keys=["red", "blue", "yellow"],
)
expected = DataFrame(
{"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]},
columns=["red", "blue", "yellow"],
)
tm.assert_frame_equal(result, expected)
result = concat(
[named_series, unnamed_series1, unnamed_series2], axis=1, ignore_index=True
)
expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
tm.assert_frame_equal(result, expected)
def test_concat_series_length_one_reversed(self, frame_or_series):
# GH39401
obj = frame_or_series([100])
result = concat([obj.iloc[::-1]])
tm.assert_equal(result, obj)

View File

@ -0,0 +1,118 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
class TestConcatSort:
def test_concat_sorts_columns(self, sort):
# GH-4588
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
df2 = DataFrame({"a": [3, 4], "c": [5, 6]})
# for sort=True/None
expected = DataFrame(
{"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]},
columns=["a", "b", "c"],
)
if sort is False:
expected = expected[["b", "a", "c"]]
# default
with tm.assert_produces_warning(None):
result = pd.concat([df1, df2], ignore_index=True, sort=sort)
tm.assert_frame_equal(result, expected)
def test_concat_sorts_index(self, sort):
df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"])
df2 = DataFrame({"b": [1, 2]}, index=["a", "b"])
# For True/None
expected = DataFrame(
{"a": [2, 3, 1], "b": [1, 2, None]},
index=["a", "b", "c"],
columns=["a", "b"],
)
if sort is False:
expected = expected.loc[["c", "a", "b"]]
# Warn and sort by default
with tm.assert_produces_warning(None):
result = pd.concat([df1, df2], axis=1, sort=sort)
tm.assert_frame_equal(result, expected)
def test_concat_inner_sort(self, sort):
# https://github.com/pandas-dev/pandas/pull/20613
df1 = DataFrame(
{"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]
)
df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4])
with tm.assert_produces_warning(None):
# unset sort should *not* warn for inner join
# since that never sorted
result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True)
expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"])
if sort is True:
expected = expected[["a", "b"]]
tm.assert_frame_equal(result, expected)
def test_concat_aligned_sort(self):
# GH-4588
df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"])
result = pd.concat([df, df], sort=True, ignore_index=True)
expected = DataFrame(
{"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]},
columns=["a", "b", "c"],
)
tm.assert_frame_equal(result, expected)
result = pd.concat(
[df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True
)
expected = expected[["b", "c"]]
tm.assert_frame_equal(result, expected)
def test_concat_aligned_sort_does_not_raise(self):
# GH-4588
# We catch TypeErrors from sorting internally and do not re-raise.
df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"])
expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"])
result = pd.concat([df, df], ignore_index=True, sort=True)
tm.assert_frame_equal(result, expected)
def test_concat_frame_with_sort_false(self):
# GH 43375
result = pd.concat(
[DataFrame({i: i}, index=[i]) for i in range(2, 0, -1)], sort=False
)
expected = DataFrame([[2, np.nan], [np.nan, 1]], index=[2, 1], columns=[2, 1])
tm.assert_frame_equal(result, expected)
# GH 37937
df1 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=[3, 1, 6])
result = pd.concat([df2, df1], axis=1, sort=False)
expected = DataFrame(
[
[7.0, 10.0, 3.0, 6.0],
[8.0, 11.0, 1.0, 4.0],
[9.0, 12.0, np.nan, np.nan],
[np.nan, np.nan, 2.0, 5.0],
],
index=[3, 1, 6, 2],
columns=["c", "d", "a", "b"],
)
tm.assert_frame_equal(result, expected)
def test_concat_sort_none_raises(self):
# GH#41518
df = DataFrame({1: [1, 2], "a": [3, 4]})
msg = "The 'sort' keyword only accepts boolean values; None was passed."
with pytest.raises(ValueError, match=msg):
pd.concat([df, df], sort=None)