Updated script that can be controlled by a Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,7 @@
"""
Test files dedicated to individual (stand-alone) DataFrame methods.

Ideally these files/tests should correspond 1-to-1 with tests.series.methods.

These may also present opportunities for sharing/de-duplicating test code.
"""

View File

@ -0,0 +1,49 @@
import pytest
from pandas import Index
import pandas._testing as tm
def test_add_prefix_suffix(float_frame):
with_prefix = float_frame.add_prefix("foo#")
expected = Index([f"foo#{c}" for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_suffix = float_frame.add_suffix("#foo")
expected = Index([f"{c}#foo" for c in float_frame.columns])
tm.assert_index_equal(with_suffix.columns, expected)
with_pct_prefix = float_frame.add_prefix("%")
expected = Index([f"%{c}" for c in float_frame.columns])
tm.assert_index_equal(with_pct_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix("%")
expected = Index([f"{c}%" for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_add_prefix_suffix_axis(float_frame):
# GH 47819
with_prefix = float_frame.add_prefix("foo#", axis=0)
expected = Index([f"foo#{c}" for c in float_frame.index])
tm.assert_index_equal(with_prefix.index, expected)
with_prefix = float_frame.add_prefix("foo#", axis=1)
expected = Index([f"foo#{c}" for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix("#foo", axis=0)
expected = Index([f"{c}#foo" for c in float_frame.index])
tm.assert_index_equal(with_pct_suffix.index, expected)
with_pct_suffix = float_frame.add_suffix("#foo", axis=1)
expected = Index([f"{c}#foo" for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_add_prefix_suffix_invalid_axis(float_frame):
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
float_frame.add_prefix("foo#", axis=2)
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
float_frame.add_suffix("foo#", axis=2)

View File

@ -0,0 +1,484 @@
from datetime import timezone
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameAlign:
def test_align_asfreq_method_raises(self):
df = DataFrame({"A": [1, np.nan, 2]})
msg = "Invalid fill method"
msg2 = "The 'method', 'limit', and 'fill_axis' keywords"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
df.align(df.iloc[::-1], method="asfreq")
def test_frame_align_aware(self):
idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern")
idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern")
df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1)
df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2)
new1, new2 = df1.align(df2)
assert df1.index.tz == new1.index.tz
assert df2.index.tz == new2.index.tz
# different timezones convert to UTC
# frame with frame
df1_central = df1.tz_convert("US/Central")
new1, new2 = df1.align(df1_central)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
# frame with Series
new1, new2 = df1.align(df1_central[0], axis=0)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
df1[0].align(df1_central, axis=0)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(float_frame)
assert af._mgr is not float_frame._mgr
af, bf = float_frame.align(float_frame, copy=False)
if not using_copy_on_write:
assert af._mgr is float_frame._mgr
else:
assert af._mgr is not float_frame._mgr
# axis = 0
other = float_frame.iloc[:-5, :3]
af, bf = float_frame.align(other, axis=0, fill_value=-1)
tm.assert_index_equal(bf.columns, other.columns)
# test fill value
join_idx = float_frame.index.join(other.index)
diff_a = float_frame.index.difference(join_idx)
diff_a_vals = af.reindex(diff_a).values
assert (diff_a_vals == -1).all()
af, bf = float_frame.align(other, join="right", axis=0)
tm.assert_index_equal(bf.columns, other.columns)
tm.assert_index_equal(bf.index, other.index)
tm.assert_index_equal(af.index, other.index)
# axis = 1
other = float_frame.iloc[:-5, :3].copy()
af, bf = float_frame.align(other, axis=1)
tm.assert_index_equal(bf.columns, float_frame.columns)
tm.assert_index_equal(bf.index, other.index)
# test fill value
join_idx = float_frame.index.join(other.index)
diff_a = float_frame.index.difference(join_idx)
diff_a_vals = af.reindex(diff_a).values
assert (diff_a_vals == -1).all()
af, bf = float_frame.align(other, join="inner", axis=1)
tm.assert_index_equal(bf.columns, other.columns)
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
tm.assert_index_equal(bf.columns, other.columns)
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
)
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
# Try to align DataFrame to Series along bad axis
msg = "No axis named 2 for object type DataFrame"
with pytest.raises(ValueError, match=msg):
float_frame.align(af.iloc[0, :3], join="inner", axis=2)
def test_align_frame_with_series(self, float_frame):
# align dataframe to series with broadcast or not
idx = float_frame.index
s = Series(range(len(idx)), index=idx)
left, right = float_frame.align(s, axis=0)
tm.assert_index_equal(left.index, float_frame.index)
tm.assert_index_equal(right.index, float_frame.index)
assert isinstance(right, Series)
msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
left, right = float_frame.align(s, broadcast_axis=1)
tm.assert_index_equal(left.index, float_frame.index)
expected = {c: s for c in float_frame.columns}
expected = DataFrame(
expected, index=float_frame.index, columns=float_frame.columns
)
tm.assert_frame_equal(right, expected)
def test_align_series_condition(self):
# see gh-9558
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result = df[df["a"] == 2]
expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
result = df.where(df["a"] == 2, 0)
expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
tm.assert_frame_equal(result, expected)
def test_align_int(self, int_frame):
# test other non-float types
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = int_frame.align(other, join="inner", axis=1, method="pad")
tm.assert_index_equal(bf.columns, other.columns)
def test_align_mixed_type(self, float_string_frame):
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_string_frame.align(
float_string_frame, join="inner", axis=1, method="pad"
)
tm.assert_index_equal(bf.columns, float_string_frame.columns)
def test_align_mixed_float(self, mixed_float_frame):
# mixed floats/ints
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = mixed_float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
def test_align_mixed_int(self, mixed_int_frame):
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = mixed_int_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
@pytest.mark.parametrize(
"l_ordered,r_ordered,expected",
[
[True, True, pd.CategoricalIndex],
[True, False, Index],
[False, True, Index],
[False, False, pd.CategoricalIndex],
],
)
def test_align_categorical(self, l_ordered, r_ordered, expected):
# GH-28397
df_1 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
"B": Series(list("aabbca")).astype(
pd.CategoricalDtype(list("cab"), ordered=l_ordered)
),
}
).set_index("B")
df_2 = DataFrame(
{
"A": np.arange(5, dtype="int64"),
"B": Series(list("babca")).astype(
pd.CategoricalDtype(list("cab"), ordered=r_ordered)
),
}
).set_index("B")
aligned_1, aligned_2 = df_1.align(df_2)
assert isinstance(aligned_1.index, expected)
assert isinstance(aligned_2.index, expected)
tm.assert_index_equal(aligned_1.index, aligned_2.index)
def test_align_multiindex(self):
# GH#10665
# same test cases as test_align_multiindex in test_series.py
midx = pd.MultiIndex.from_product(
[range(2), range(3), range(2)], names=("a", "b", "c")
)
idx = Index(range(2), name="b")
df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)
# these must be the same results (but flipped)
res1l, res1r = df1.align(df2, join="left")
res2l, res2r = df2.align(df1, join="right")
expl = df1
tm.assert_frame_equal(expl, res1l)
tm.assert_frame_equal(expl, res2r)
expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
tm.assert_frame_equal(expr, res1r)
tm.assert_frame_equal(expr, res2l)
res1l, res1r = df1.align(df2, join="right")
res2l, res2r = df2.align(df1, join="left")
exp_idx = pd.MultiIndex.from_product(
[range(2), range(2), range(2)], names=("a", "b", "c")
)
expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
tm.assert_frame_equal(expl, res1l)
tm.assert_frame_equal(expl, res2r)
expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
tm.assert_frame_equal(expr, res1r)
tm.assert_frame_equal(expr, res2l)
def test_align_series_combinations(self):
df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
s = Series([1, 2, 4], index=list("ABD"), name="x")
# frame + series
res1, res2 = df.align(s, axis=0)
exp1 = DataFrame(
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
index=list("ABCDE"),
)
exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
tm.assert_frame_equal(res1, exp1)
tm.assert_series_equal(res2, exp2)
# series + frame
res1, res2 = s.align(df)
tm.assert_series_equal(res1, exp2)
tm.assert_frame_equal(res2, exp1)
def test_multiindex_align_to_series_with_common_index_level(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series(
[1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2, 3, 4], name="bar")
series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
df = DataFrame(
{"col": np.arange(12)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series(
[1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
)
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 3, 4], name="bar")
series = Series(
[1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(9)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
np.arange(18).reshape(6, 3),
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
df.columns = ["cfoo", "cbar", "cfoo"]
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
result_left, result_right = df.align(series, axis=0)
tm.assert_series_equal(result_right, expected)
tm.assert_index_equal(result_left.columns, df.columns)
def test_missing_axis_specification_exception(self):
df = DataFrame(np.arange(50).reshape((10, 5)))
series = Series(np.arange(5))
with pytest.raises(ValueError, match=r"axis=0 or 1"):
df.align(series)
@pytest.mark.parametrize("method", ["pad", "bfill"])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("fill_axis", [0, 1])
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize(
"left_slice",
[
[slice(4), slice(10)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize(
"right_slice",
[
[slice(2, None), slice(6, None)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize("limit", [1, None])
def test_align_fill_method(
self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
):
frame = float_frame
left = frame.iloc[left_slice[0], left_slice[1]]
right = frame.iloc[right_slice[0], right_slice[1]]
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
aa, ab = left.align(
right,
axis=axis,
join=how,
method=method,
limit=limit,
fill_axis=fill_axis,
)
join_index, join_columns = None, None
ea, eb = left, right
if axis is None or axis == 0:
join_index = left.index.join(right.index, how=how)
ea = ea.reindex(index=join_index)
eb = eb.reindex(index=join_index)
if axis is None or axis == 1:
join_columns = left.columns.join(right.columns, how=how)
ea = ea.reindex(columns=join_columns)
eb = eb.reindex(columns=join_columns)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
eb = eb.fillna(axis=fill_axis, method=method, limit=limit)
tm.assert_frame_equal(aa, ea)
tm.assert_frame_equal(ab, eb)
def test_align_series_check_copy(self):
# GH#
df = DataFrame({0: [1, 2]})
ser = Series([1], name=0)
expected = ser.copy()
result, other = df.align(ser, axis=1)
ser.iloc[0] = 100
tm.assert_series_equal(other, expected)
def test_align_identical_different_object(self):
# GH#51032
df = DataFrame({"a": [1, 2]})
ser = Series([3, 4])
result, result2 = df.align(ser, axis=0)
tm.assert_frame_equal(result, df)
tm.assert_series_equal(result2, ser)
assert df is not result
assert ser is not result2
def test_align_identical_different_object_columns(self):
# GH#51032
df = DataFrame({"a": [1, 2]})
ser = Series([1], index=["a"])
result, result2 = df.align(ser, axis=1)
tm.assert_frame_equal(result, df)
tm.assert_series_equal(result2, ser)
assert df is not result
assert ser is not result2

View File

@ -0,0 +1,263 @@
from datetime import datetime
import numpy as np
import pytest
from pandas._libs.tslibs.offsets import MonthEnd
from pandas import (
DataFrame,
DatetimeIndex,
Series,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
from pandas.tseries import offsets
class TestAsFreq:
@pytest.fixture(params=["s", "ms", "us", "ns"])
def unit(self, request):
return request.param
def test_asfreq2(self, frame_or_series):
ts = frame_or_series(
[0.0, 1.0, 2.0],
index=DatetimeIndex(
[
datetime(2009, 10, 30),
datetime(2009, 11, 30),
datetime(2009, 12, 31),
],
dtype="M8[ns]",
freq="BME",
),
)
daily_ts = ts.asfreq("B")
monthly_ts = daily_ts.asfreq("BME")
tm.assert_equal(monthly_ts, ts)
daily_ts = ts.asfreq("B", method="pad")
monthly_ts = daily_ts.asfreq("BME")
tm.assert_equal(monthly_ts, ts)
daily_ts = ts.asfreq(offsets.BDay())
monthly_ts = daily_ts.asfreq(offsets.BMonthEnd())
tm.assert_equal(monthly_ts, ts)
result = ts[:0].asfreq("ME")
assert len(result) == 0
assert result is not ts
if frame_or_series is Series:
daily_ts = ts.asfreq("D", fill_value=-1)
result = daily_ts.value_counts().sort_index()
expected = Series(
[60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count"
).sort_index()
tm.assert_series_equal(result, expected)
def test_asfreq_datetimeindex_empty(self, frame_or_series):
# GH#14320
index = DatetimeIndex(["2016-09-29 11:00"])
expected = frame_or_series(index=index, dtype=object).asfreq("h")
result = frame_or_series([3], index=index.copy()).asfreq("h")
tm.assert_index_equal(expected.index, result.index)
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_tz_aware_asfreq_smoke(self, tz, frame_or_series):
dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz)
obj = frame_or_series(
np.random.default_rng(2).standard_normal(len(dr)), index=dr
)
# it works!
obj.asfreq("min")
def test_asfreq_normalize(self, frame_or_series):
rng = date_range("1/1/2000 09:30", periods=20)
norm = date_range("1/1/2000", periods=20)
vals = np.random.default_rng(2).standard_normal((20, 3))
obj = DataFrame(vals, index=rng)
expected = DataFrame(vals, index=norm)
if frame_or_series is Series:
obj = obj[0]
expected = expected[0]
result = obj.asfreq("D", normalize=True)
tm.assert_equal(result, expected)
def test_asfreq_keep_index_name(self, frame_or_series):
# GH#9854
index_name = "bar"
index = date_range("20130101", periods=20, name=index_name)
obj = DataFrame(list(range(20)), columns=["foo"], index=index)
obj = tm.get_obj(obj, frame_or_series)
assert index_name == obj.index.name
assert index_name == obj.asfreq("10D").index.name
def test_asfreq_ts(self, frame_or_series):
index = period_range(freq="Y", start="1/1/2001", end="12/31/2010")
obj = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 3)), index=index
)
obj = tm.get_obj(obj, frame_or_series)
result = obj.asfreq("D", how="end")
exp_index = index.asfreq("D", how="end")
assert len(result) == len(obj)
tm.assert_index_equal(result.index, exp_index)
result = obj.asfreq("D", how="start")
exp_index = index.asfreq("D", how="start")
assert len(result) == len(obj)
tm.assert_index_equal(result.index, exp_index)
def test_asfreq_resample_set_correct_freq(self, frame_or_series):
# GH#5613
# we test if .asfreq() and .resample() set the correct value for .freq
dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"])
obj = DataFrame({"col": [1, 2, 3]}, index=dti)
obj = tm.get_obj(obj, frame_or_series)
# testing the settings before calling .asfreq() and .resample()
assert obj.index.freq is None
assert obj.index.inferred_freq == "D"
# does .asfreq() set .freq correctly?
assert obj.asfreq("D").index.freq == "D"
# does .resample() set .freq correctly?
assert obj.resample("D").asfreq().index.freq == "D"
def test_asfreq_empty(self, datetime_frame):
# test does not blow up on length-0 DataFrame
zero_length = datetime_frame.reindex([])
result = zero_length.asfreq("BME")
assert result is not zero_length
def test_asfreq(self, datetime_frame):
offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd())
rule_monthly = datetime_frame.asfreq("BME")
tm.assert_frame_equal(offset_monthly, rule_monthly)
rule_monthly.asfreq("B", method="pad")
# TODO: actually check that this worked.
# don't forget!
rule_monthly.asfreq("B", method="pad")
def test_asfreq_datetimeindex(self):
df = DataFrame(
{"A": [1, 2, 3]},
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
)
df = df.asfreq("B")
assert isinstance(df.index, DatetimeIndex)
ts = df["A"].asfreq("B")
assert isinstance(ts.index, DatetimeIndex)
def test_asfreq_fillvalue(self):
# test for fill value during upsampling, related to issue 3715
# setup
rng = date_range("1/1/2016", periods=10, freq="2s")
# Explicit cast to 'float' to avoid implicit cast when setting None
ts = Series(np.arange(len(rng)), index=rng, dtype="float")
df = DataFrame({"one": ts})
# insert pre-existing missing value
df.loc["2016-01-01 00:00:08", "one"] = None
actual_df = df.asfreq(freq="1s", fill_value=9.0)
expected_df = df.asfreq(freq="1s").fillna(9.0)
expected_df.loc["2016-01-01 00:00:08", "one"] = None
tm.assert_frame_equal(expected_df, actual_df)
expected_series = ts.asfreq(freq="1s").fillna(9.0)
actual_series = ts.asfreq(freq="1s", fill_value=9.0)
tm.assert_series_equal(expected_series, actual_series)
def test_asfreq_with_date_object_index(self, frame_or_series):
rng = date_range("1/1/2000", periods=20)
ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng)
ts2 = ts.copy()
ts2.index = [x.date() for x in ts2.index]
result = ts2.asfreq("4h", method="ffill")
expected = ts.asfreq("4h", method="ffill")
tm.assert_equal(result, expected)
def test_asfreq_with_unsorted_index(self, frame_or_series):
# GH#39805
# Test that rows are not dropped when the datetime index is out of order
index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"])
result = frame_or_series(range(4), index=index)
expected = result.reindex(sorted(index))
expected.index = expected.index._with_freq("infer")
result = result.asfreq("D")
tm.assert_equal(result, expected)
def test_asfreq_after_normalize(self, unit):
# https://github.com/pandas-dev/pandas/issues/50727
result = DatetimeIndex(
date_range("2000", periods=2).as_unit(unit).normalize(), freq="D"
)
expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit)
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
"freq, freq_half",
[
("2ME", "ME"),
(MonthEnd(2), MonthEnd(1)),
],
)
def test_asfreq_2ME(self, freq, freq_half):
index = date_range("1/1/2000", periods=6, freq=freq_half)
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)})
expected = df.asfreq(freq=freq)
index = date_range("1/1/2000", periods=3, freq=freq)
result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"freq, freq_depr",
[
("2ME", "2M"),
("2QE", "2Q"),
("2QE-SEP", "2Q-SEP"),
("1BQE", "1BQ"),
("2BQE-SEP", "2BQ-SEP"),
("1YE", "1Y"),
("2YE-MAR", "2Y-MAR"),
("1YE", "1A"),
("2YE-MAR", "2A-MAR"),
("2BYE-MAR", "2BA-MAR"),
],
)
def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr):
# GH#9586, #55978
depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed "
f"in a future version, please use '{freq[1:]}' instead."
index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}")
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
expected = df.asfreq(freq=freq)
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
result = df.asfreq(freq=freq_depr)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,198 @@
import numpy as np
import pytest
from pandas._libs.tslibs import IncompatibleFrequency
from pandas import (
DataFrame,
Period,
Series,
Timestamp,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
@pytest.fixture
def date_range_frame():
    """
    Provide an integer DataFrame indexed by a regular DatetimeIndex.

    The frame has 50 rows spaced 53 seconds apart starting at 1990-01-01,
    with columns 'A' and 'B' each holding 0..49.
    """
    n_rows = 50
    index = date_range("1/1/1990", periods=n_rows, freq="53s")
    columns = {label: np.arange(n_rows) for label in ("A", "B")}
    return DataFrame(columns, index=index)
class TestFrameAsof:
def test_basic(self, date_range_frame):
# Explicitly cast to float to avoid implicit cast when setting np.nan
df = date_range_frame.astype({"A": "float"})
N = 50
df.loc[df.index[15:30], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
result = df.asof(dates)
assert result.notna().all(1).all()
lb = df.index[14]
ub = df.index[30]
dates = list(dates)
result = df.asof(dates)
assert result.notna().all(1).all()
mask = (result.index >= lb) & (result.index < ub)
rs = result[mask]
assert (rs == 14).all(1).all()
def test_subset(self, date_range_frame):
N = 10
# explicitly cast to float to avoid implicit upcast when setting to np.nan
df = date_range_frame.iloc[:N].copy().astype({"A": "float"})
df.loc[df.index[4:8], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
# with a subset of A should be the same
result = df.asof(dates, subset="A")
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# same with A/B
result = df.asof(dates, subset=["A", "B"])
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# B gives df.asof
result = df.asof(dates, subset="B")
expected = df.resample("25s", closed="right").ffill().reindex(dates)
expected.iloc[20:] = 9
# no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent)
expected["B"] = expected["B"].astype(df["B"].dtype)
tm.assert_frame_equal(result, expected)
def test_missing(self, date_range_frame):
# GH 15118
# no match found - `where` value before earliest date in index
N = 10
# Cast to 'float64' to avoid upcast when introducing nan in df.asof
df = date_range_frame.iloc[:N].copy().astype("float64")
result = df.asof("1989-12-31")
expected = Series(
index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64
)
tm.assert_series_equal(result, expected)
result = df.asof(to_datetime(["1989-12-31"]))
expected = DataFrame(
index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
)
tm.assert_frame_equal(result, expected)
# Check that we handle PeriodIndex correctly, dont end up with
# period.ordinal for series name
df = df.to_period("D")
result = df.asof("1989-12-31")
assert isinstance(result.name, Period)
def test_asof_all_nans(self, frame_or_series):
# GH 15713
# DataFrame/Series is all nans
result = frame_or_series([np.nan]).asof([0])
expected = frame_or_series([np.nan])
tm.assert_equal(result, expected)
def test_all_nans(self, date_range_frame):
# GH 15713
# DataFrame is all nans
# testing non-default indexes, multiple inputs
N = 150
rng = date_range_frame.index
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A"])
tm.assert_frame_equal(result, expected)
# testing multiple columns
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
# testing scalar input
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
tm.assert_frame_equal(result, expected)
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
expected = Series(np.nan, index=["A", "B"], name=3)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"stamp,expected",
[
(
Timestamp("2018-01-01 23:22:43.325+00:00"),
Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
),
(
Timestamp("2018-01-01 22:33:20.682+01:00"),
Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
),
],
)
def test_time_zone_aware_index(self, stamp, expected):
# GH21194
# Testing awareness of DataFrame index considering different
# UTC and timezone
df = DataFrame(
data=[1, 2],
index=[
Timestamp("2018-01-01 21:00:05.001+00:00"),
Timestamp("2018-01-01 22:35:10.550+00:00"),
],
)
result = df.asof(stamp)
tm.assert_series_equal(result, expected)
def test_is_copy(self, date_range_frame):
# GH-27357, GH-30784: ensure the result of asof is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = date_range_frame.astype({"A": "float"})
N = 50
df.loc[df.index[15:30], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
result = df.asof(dates)
with tm.assert_produces_warning(None):
result["C"] = 1
def test_asof_periodindex_mismatched_freq(self):
N = 50
rng = period_range("1/1/1990", periods=N, freq="h")
df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng)
# Mismatched freq
msg = "Input has different freq"
with pytest.raises(IncompatibleFrequency, match=msg):
df.asof(rng.asfreq("D"))
def test_asof_preserves_bool_dtype(self):
# GH#16063 was casting bools to floats
dti = date_range("2017-01-01", freq="MS", periods=4)
ser = Series([True, False, True], index=dti[:-1])
ts = dti[-1]
res = ser.asof([ts])
expected = Series([True], index=[ts])
tm.assert_series_equal(res, expected)

View File

@ -0,0 +1,84 @@
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestAssign:
    """Checks for ``DataFrame.assign``."""

    def test_assign(self):
        """Add and overwrite columns from Series, lists and callables."""
        frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        snapshot = frame.copy()

        want = frame.copy()
        want["C"] = [4, 2.5, 2]

        got = frame.assign(C=frame.B / frame.A)
        tm.assert_frame_equal(got, want)

        # lambda syntax
        got = frame.assign(C=lambda x: x.B / x.A)
        tm.assert_frame_equal(got, want)

        # original is unmodified
        tm.assert_frame_equal(frame, snapshot)

        # Non-Series array-like
        got = frame.assign(C=[4, 2.5, 2])
        tm.assert_frame_equal(got, want)
        # original is unmodified
        tm.assert_frame_equal(frame, snapshot)

        got = frame.assign(B=frame.B / frame.A)
        want = want.drop("B", axis=1).rename(columns={"C": "B"})
        tm.assert_frame_equal(got, want)

        # overwrite
        want = frame.copy()
        want["A"] = [5, 7, 9]
        got = frame.assign(A=frame.A + frame.B)
        tm.assert_frame_equal(got, want)

        # lambda
        got = frame.assign(A=lambda x: x.A + x.B)
        tm.assert_frame_equal(got, want)

    def test_assign_multiple(self):
        """Several keyword columns of different kinds in one call."""
        frame = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
        want = DataFrame(
            [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
        )
        got = frame.assign(C=[7, 8, 9], D=frame.A, E=lambda x: x.B)
        tm.assert_frame_equal(got, want)

    def test_assign_order(self):
        """New columns appear in keyword order (GH 9818)."""
        # GH 9818
        frame = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        total = frame.A + frame.B
        delta = frame.A - frame.B

        got = frame.assign(D=total, C=delta)
        want = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
        tm.assert_frame_equal(got, want)

        got = frame.assign(C=delta, D=total)
        want = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
        tm.assert_frame_equal(got, want)

    def test_assign_bad(self):
        """Positional arguments and references to missing columns raise."""
        frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # non-keyword argument
        positional_msg = r"assign\(\) takes 1 positional argument but 2 were given"
        with pytest.raises(TypeError, match=positional_msg):
            frame.assign(lambda x: x.A)

        # frame.C is evaluated eagerly, before assign creates the column
        missing_msg = "'DataFrame' object has no attribute 'C'"
        with pytest.raises(AttributeError, match=missing_msg):
            frame.assign(C=frame.A, D=frame.A + frame.C)

    def test_assign_dependent(self):
        """A callable can use a column created earlier in the same call."""
        frame = DataFrame({"A": [1, 2], "B": [3, 4]})
        want = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))

        got = frame.assign(C=frame.A, D=lambda x: x["A"] + x["C"])
        tm.assert_frame_equal(got, want)

        got = frame.assign(C=lambda x: x.A, D=lambda x: x["A"] + x["C"])
        tm.assert_frame_equal(got, want)

View File

@ -0,0 +1,911 @@
import re
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
DatetimeTZDtype,
Index,
Interval,
IntervalDtype,
NaT,
Series,
Timedelta,
Timestamp,
concat,
date_range,
option_context,
)
import pandas._testing as tm
def _check_cast(df, v):
    """
    Assert that every column of ``df`` has the dtype whose name is ``v``.
    """
    dtype_names = [column.dtype.name for _, column in df.items()]
    assert all(name == v for name in dtype_names)
class TestAstype:
    """Tests for ``DataFrame.astype``."""

    def test_astype_float(self, float_frame):
        """Casting to integer dtypes matches casting the underlying values."""
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        # add a column of numeric strings and cast again
        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Columns A and B can be cast to a single float dtype."""
        # mixed casting
        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self):
        """The numeric part of a mixed frame casts uniformly to each target."""
        # mixed casting
        df = DataFrame(
            {
                "a": 1.0,
                "b": 2,
                "c": "foo",
                "float32": np.array([1.0] * 10, dtype="float32"),
                "int32": np.array([1] * 10, dtype="int32"),
            },
            index=np.arange(10),
        )
        mn = df._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    def test_astype_with_exclude_string(self, float_frame):
        """With ``errors="ignore"`` a non-castable string column is left as is."""
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):
        """Smoke test: ``copy=False`` casts run without raising."""
        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        tf.astype(np.int64, copy=False)

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Smoke test: casting a mixed-float selection runs without raising."""
        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        tf.astype(np.int64)
        tf.astype(np.float32)

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """NaN and inf cannot be cast to an integer dtype."""
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` stringifies datetime-like and numeric columns."""
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame(
            {
                "a": list(map(str, (Timestamp(x)._date_repr for x in a._values))),
                "b": list(map(str, map(Timestamp, b._values))),
                "c": [Timedelta(x)._repr_base() for x in c._values],
                "d": list(map(str, d._values)),
                "e": list(map(str, e._values)),
            },
            dtype="object",
        )

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Floats (including NaN) are stringified by ``astype(str)``."""
        # see GH#11302
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"], dtype="object")

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        val = "1.1234567890123457"
        expected = DataFrame([val], dtype="object")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """A dict or Series mapping column -> dtype casts only those columns."""
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame(
            {
                "a": a,
                "b": Series(["0", "1", "2", "3", "4"], dtype="object"),
                "c": c,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame(
            {
                "a": a,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg_frame = (
            "Only a column name can be used for the key in a dtype mappings argument. "
            "'{}' not found in columns."
        )
        with pytest.raises(KeyError, match=msg_frame.format(2)):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg_frame.format("e")):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """Duplicate column labels are all cast, by scalar dtype or by mapping."""
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_astype_duplicate_col_series_arg(self):
        """A Series of dtypes aligned on duplicated labels is applied by position."""
        # GH#44417
        vals = np.random.default_rng(2).standard_normal((3, 4))
        df = DataFrame(vals, columns=["A", "B", "C", "A"])
        dtypes = df.dtypes
        dtypes.iloc[0] = str
        dtypes.iloc[2] = "Float64"

        result = df.astype(dtypes)
        expected = DataFrame(
            {
                0: Series(vals[:, 0].astype(str), dtype=object),
                1: vals[:, 1],
                2: pd.array(vals[:, 2], dtype="Float64"),
                3: vals[:, 3],
            }
        )
        expected.columns = df.columns
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """Each column is converted like ``Categorical(values, dtype=dtype)``."""
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(v, dtype=dtype) for k, v in d.items()})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class instead of an instance raises TypeError."""
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Round trips between float64, int64 and nullable integer dtypes."""
        # GH#22578
        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])

        expected1 = DataFrame(
            {
                "a": pd.array([1, 3, 5], dtype=dtype),
                "b": pd.array([2, 4, 6], dtype=dtype),
            }
        )
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the nullable integer round trips."""
        # GH#22578
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension dtypes are applied to every column sharing a label."""
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
    )
    def test_astype_column_metadata(self, dtype):
        """The columns Index (dtype and name) survives ``astype``."""
        # GH#19920
        columns = Index([100, 200, 300], dtype=np.uint64, name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
    def test_astype_from_object_to_datetime_unit(self, unit):
        """Object -> datetime64 with an unsupported unit raises ValueError."""
        vals = [
            ["2015-01-01", "2015-01-02", "2015-01-03"],
            ["2017-01-01", "2017-01-02", "2017-02-03"],
        ]
        df = DataFrame(vals, dtype=object)
        msg = (
            rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
            r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
            r"'datetime64\[ns\]' or DatetimeTZDtype"
        )
        with pytest.raises(ValueError, match=msg):
            df.astype(f"M8[{unit}]")

    @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
    def test_astype_from_object_to_timedelta_unit(self, unit):
        """Object -> timedelta64 with an unsupported unit raises ValueError."""
        vals = [
            ["1 Day", "2 Days", "3 Days"],
            ["4 Days", "5 Days", "6 Days"],
        ]
        df = DataFrame(vals, dtype=object)
        msg = (
            r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. "
            "Supported resolutions are 's', 'ms', 'us', 'ns'"
        )
        with pytest.raises(ValueError, match=msg):
            # TODO: this is ValueError while for DatetimeArray it is TypeError;
            #  get these consistent
            df.astype(f"m8[{unit}]")

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        """Datetime-like -> object yields Timestamp / Timedelta scalars."""
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric -> datetime-like matches the equivalent ndarray cast."""
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """datetime64 -> datetime64[unit] works only for s/ms/us/ns."""
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        ser = df.iloc[:, 0]
        idx = Index(ser)
        dta = ser._values

        if unit in ["ns", "us", "ms", "s"]:
            # GH#48928
            result = df.astype(dtype)
        else:
            # we use the nearest supported dtype (i.e. M8[s])
            msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
            with pytest.raises(TypeError, match=msg):
                df.astype(dtype)

            with pytest.raises(TypeError, match=msg):
                ser.astype(dtype)

            with pytest.raises(TypeError, match=msg.replace("Array", "Index")):
                idx.astype(dtype)

            with pytest.raises(TypeError, match=msg):
                dta.astype(dtype)

            return

        exp_df = DataFrame(arr.astype(dtype))
        assert (exp_df.dtypes == dtype).all()
        tm.assert_frame_equal(result, exp_df)

        res_ser = ser.astype(dtype)
        exp_ser = exp_df.iloc[:, 0]
        assert exp_ser.dtype == dtype
        tm.assert_series_equal(res_ser, exp_ser)

        exp_dta = exp_ser._values

        res_index = idx.astype(dtype)
        exp_index = Index(exp_ser)
        assert exp_index.dtype == dtype
        tm.assert_index_equal(res_index, exp_index)

        res_dta = dta.astype(dtype)
        assert exp_dta.dtype == dtype
        tm.assert_extension_array_equal(res_dta, exp_dta)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """timedelta64[ns] -> timedelta64[ns] matches the ndarray cast."""
        # preserve the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """timedelta64 -> timedelta64[unit] works only for s/ms/us."""
        # GH#19223 until 2.0 used to coerce to float
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        ser = df.iloc[:, 0]
        tdi = Index(ser)
        tda = tdi._values

        if unit in ["us", "ms", "s"]:
            assert (df.dtypes == dtype).all()
            result = df.astype(dtype)
        else:
            # We get the nearest supported unit, i.e. "s"
            assert (df.dtypes == "m8[s]").all()

            msg = (
                rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
                "Supported resolutions are 's', 'ms', 'us', 'ns'"
            )
            with pytest.raises(ValueError, match=msg):
                df.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                ser.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                tdi.astype(dtype)
            with pytest.raises(ValueError, match=msg):
                tda.astype(dtype)

            return

        result = df.astype(dtype)
        # The conversion is a no-op, so we just get a copy
        expected = df
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting between datetime64 and timedelta64 raises TypeError."""
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = "|".join(
            [
                # BlockManager path
                rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]",
                # ArrayManager path
                "cannot astype a datetimelike from "
                rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = "|".join(
            [
                # BlockManager path
                rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]",
                # ArrayManager path
                "cannot astype a timedelta from "
                rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]",
            ]
        )
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_astype_arg_for_errors(self):
        """An invalid ``errors`` value raises; ``errors="ignore"`` is accepted."""
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = (
            "Expected value of kwarg 'errors' to be one of "
            "['raise', 'ignore']. Supplied value is 'True'"
        )
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_astype_invalid_conversion(self):
        """The error message names the column that failed to cast."""
        # GH#47571
        df = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})

        msg = (
            "invalid literal for int() with base 10: 'text': "
            "Error while type casting for column 'a'"
        )

        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype({"a": int})

    def test_astype_arg_for_errors_dictlist(self):
        """With a dtype mapping and ``errors="ignore"`` only castable columns change."""
        # GH#25905
        df = DataFrame(
            [
                {"a": "1", "b": "16.5%", "c": "test"},
                {"a": "2.2", "b": "15.3", "c": "another_test"},
            ]
        )
        expected = DataFrame(
            [
                {"a": 1.0, "b": "16.5%", "c": "test"},
                {"a": 2.2, "b": "15.3", "c": "another_test"},
            ]
        )
        expected["c"] = expected["c"].astype("object")
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        """tz-aware columns cast to object; casting to naive datetime64 raises."""
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
        with pytest.raises(TypeError, match=msg):
            # dt64tz->dt64 deprecated
            timezone_frame.astype("datetime64[ns]")

    def test_astype_dt64tz_to_str(self, timezone_frame):
        """tz-aware frames stringify via ``astype(str)`` and via ``str(frame)``."""
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
            dtype="object",
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            # The frame repr pads each cell to its column width, so the NaT
            # cells are right-aligned under the 25-character tz-aware
            # timestamps, with one separating space in front. Build the row
            # from that width instead of hard-coding runs of spaces, which do
            # not survive whitespace normalisation of this file.
            tz_cell_width = len("2013-01-01 00:00:00-05:00")
            nat_cell = "NaT".rjust(tz_cell_width + 1)
            assert f"1 2013-01-02{nat_cell}{nat_cell}" in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result

    def test_astype_empty_dtype_dict(self):
        """An empty mapping on an empty frame returns an equal but new object."""
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        df = DataFrame()
        result = df.astype({})
        tm.assert_frame_equal(result, df)
        assert result is not df

    @pytest.mark.parametrize(
        "data, dtype",
        [
            (["x", "y", "z"], "string[python]"),
            pytest.param(
                ["x", "y", "z"],
                "string[pyarrow]",
                marks=td.skip_if_no("pyarrow"),
            ),
            (["x", "y", "z"], "category"),
            (3 * [Timestamp("2020-01-01", tz="UTC")], None),
            (3 * [Interval(0, 1)], None),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
        """``errors`` is honoured for extension-backed columns too."""
        # https://github.com/pandas-dev/pandas/issues/35471
        df = DataFrame(Series(data, dtype=dtype))
        if errors == "ignore":
            expected = df
            result = df.astype(float, errors=errors)
            tm.assert_frame_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)

    def test_astype_tz_conversion(self):
        """Casting tz-aware to another tz-aware dtype behaves like tz_convert."""
        # GH 35973
        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
        df = DataFrame(val)
        result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        # NOTE: ``expected`` aliases ``df``; ``result`` was computed above, so
        # the in-place column replacement below does not affect it.
        expected = df
        expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        """Object-dtype timestamps can be cast back to a tz-aware dtype."""
        # GH 35973
        val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
        expected = DataFrame(val)

        # convert expected to object dtype from other tz str (independently tested)
        result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        result = result.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = result.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(
        self, frame_or_series, tz_naive_fixture, using_infer_string
    ):
        """datetime64 -> "string" matches DatetimeArray.astype, NaT becomes NA."""
        # GH#41409
        tz = tz_naive_fixture

        dti = date_range("2016-01-01", periods=3, tz=tz)
        dta = dti._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        item = result.iloc[0]
        if frame_or_series is DataFrame:
            item = item.iloc[0]
        if using_infer_string:
            assert item is np.nan
        else:
            assert item is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        alt = obj.astype(str)
        assert np.all(alt.iloc[1:] == result.iloc[1:])

    def test_astype_td64_to_string(self, frame_or_series):
        """timedelta64 -> "string" renders values such as "1 days"."""
        # GH#41409
        tdi = pd.timedelta_range("1 Day", periods=3)
        obj = frame_or_series(tdi)

        expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
        result = obj.astype("string")
        tm.assert_equal(result, expected)

    def test_astype_bytes(self):
        """``astype(bytes)`` produces a fixed-width bytes dtype."""
        # GH#39474
        result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes[0] == np.dtype("S3")

    @pytest.mark.parametrize(
        "index_slice",
        [
            np.s_[:2, :2],
            np.s_[:1, :2],
            np.s_[:2, :1],
            np.s_[::2, ::2],
            np.s_[::1, ::2],
            np.s_[::2, ::1],
        ],
    )
    def test_astype_noncontiguous(self, index_slice):
        """Casting a sliced frame keeps the selected values."""
        # GH#42396
        data = np.arange(16).reshape(4, 4)
        df = DataFrame(data)

        result = df.iloc[index_slice].astype("int16")
        expected = df.iloc[index_slice]
        tm.assert_frame_equal(result, expected, check_dtype=False)

    def test_astype_retain_attrs(self, any_numpy_dtype):
        """``attrs`` set on the frame are carried over by ``astype``."""
        # GH#44414
        df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
        df.attrs["Location"] = "Michigan"

        result = df.astype({"a": any_numpy_dtype}).attrs
        expected = df.attrs

        tm.assert_dict_equal(expected, result)
class TestAstypeCategorical:
def test_astype_from_categorical3(self):
df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
cats = Categorical([1, 2, 3, 4, 5, 6])
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
def test_astype_from_categorical4(self):
df = DataFrame(
{"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
)
cats = Categorical(["a", "b", "b", "a", "a", "d"])
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
def test_categorical_astype_to_int(self, any_int_dtype):
# GH#39402
df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
df.col1 = df.col1.astype("category")
df.col1 = df.col1.astype(any_int_dtype)
expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
tm.assert_frame_equal(df, expected)
def test_astype_categorical_to_string_missing(self):
# https://github.com/pandas-dev/pandas/issues/41797
df = DataFrame(["a", "b", np.nan])
expected = df.astype(str)
cat = df.astype("category")
result = cat.astype(str)
tm.assert_frame_equal(result, expected)
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """
    IntegerArray whose ``copy`` always fails.

    Used to detect an unwanted copy of the underlying array (GH 42501).
    """

    def copy(self):
        """Always raise AssertionError; this array must never be copied."""
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed under ``python -O``, which would silently disable the guard
        # and make ``copy`` return None.
        raise AssertionError("IntegerArrayNoCopy.copy() must not be called")
class Int16DtypeNoCopy(pd.Int16Dtype):
    """Int16 extension dtype whose array type is ``IntegerArrayNoCopy`` (GH 42501)."""

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        array_type = IntegerArrayNoCopy
        return array_type
def test_frame_astype_no_copy():
    """With ``copy=False`` the untouched column shares memory with the input."""
    # GH 42501
    source = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)
    target_dtypes = {"a": Int16DtypeNoCopy()}
    converted = source.astype(target_dtypes, copy=False)

    assert converted.a.dtype == pd.Int16Dtype()
    assert np.shares_memory(source.b.values, converted.b.values)
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    """``copy=True`` yields a result unaffected by later writes to the source."""
    # GH#50984
    pytest.importorskip("pyarrow")
    source = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    converted = source.astype("int64[pyarrow]", copy=True)

    # mutate the source after the cast; the converted frame must not change
    source.iloc[0, 0] = 100

    untouched = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(converted, untouched)
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_not_modifying_input(string_storage, val):
    """Casting to "string" with ``copy=False`` leaves the source frame unchanged."""
    # GH#51073
    source = DataFrame({"a": ["a", "b", val]})
    before = source.copy()

    with option_context("mode.string_storage", string_storage):
        source.astype("string", copy=False)

    tm.assert_frame_equal(source, before)

View File

@ -0,0 +1,132 @@
from datetime import time
import numpy as np
import pytest
import pytz
from pandas._libs.tslibs import timezones
from pandas import (
DataFrame,
date_range,
)
import pandas._testing as tm
class TestAtTime:
    """Tests for ``DataFrame.at_time`` / ``Series.at_time``."""

    @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
    def test_localized_at_time(self, tzstr, frame_or_series):
        """Selecting after localizing equals localizing after selecting."""
        tz = timezones.maybe_get_tz(tzstr)

        hours = date_range("4/16/2012", "5/1/2012", freq="h")
        naive = frame_or_series(
            np.random.default_rng(2).standard_normal(len(hours)), index=hours
        )
        ten_oclock = time(10, 0)

        localized = naive.tz_localize(tzstr)

        result = localized.at_time(ten_oclock)
        expected = naive.at_time(time(10, 0)).tz_localize(tzstr)
        tm.assert_equal(result, expected)
        assert timezones.tz_compare(result.index.tz, tz)

    def test_at_time(self, frame_or_series):
        """Timestamps, strings and ``time`` objects are accepted as the key."""
        stamps = date_range("1/1/2000", "1/5/2000", freq="5min")
        values = np.random.default_rng(2).standard_normal((len(stamps), 2))
        obj = tm.get_obj(DataFrame(values, index=stamps), frame_or_series)

        probe = stamps[1]
        picked = obj.at_time(probe)
        assert (picked.index.hour == probe.hour).all()
        assert (picked.index.minute == probe.minute).all()
        assert (picked.index.second == probe.second).all()

        from_string = obj.at_time("9:30")
        from_time = obj.at_time(time(9, 30))
        tm.assert_equal(from_string, from_time)

    def test_at_time_midnight(self, frame_or_series):
        """On a daily index, midnight selects every row."""
        # midnight, everything
        days = date_range("1/1/2000", "1/31/2000")
        values = np.random.default_rng(2).standard_normal((len(days), 3))
        obj = tm.get_obj(DataFrame(values, index=days), frame_or_series)

        tm.assert_equal(obj.at_time(time(0, 0)), obj)

    def test_at_time_nonexistent(self, frame_or_series):
        """A time that never occurs in the index gives an empty result."""
        # time doesn't exist
        stamps = date_range("1/1/2012", freq="23Min", periods=384)
        values = np.random.default_rng(2).standard_normal(len(stamps))
        obj = tm.get_obj(DataFrame(values, index=stamps), frame_or_series)

        assert len(obj.at_time("16:00")) == 0

    @pytest.mark.parametrize(
        "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
    )
    def test_at_time_errors(self, hour):
        """A tz-aware time against a tz-naive index raises ValueError."""
        # GH#24043
        dti = date_range("2018", periods=3, freq="h")
        frame = DataFrame(list(range(len(dti))), index=dti)

        if getattr(hour, "tzinfo", None) is not None:
            with pytest.raises(ValueError, match="Index must be timezone"):
                frame.at_time(hour)
            return

        tm.assert_frame_equal(frame.at_time(hour), frame.iloc[1:2])

    def test_at_time_tz(self):
        """A tz-aware time is compared against a tz-aware index."""
        # GH#24043
        dti = date_range("2018", periods=3, freq="h", tz="US/Pacific")
        frame = DataFrame(list(range(len(dti))), index=dti)

        eastern_four = time(4, tzinfo=pytz.timezone("US/Eastern"))
        tm.assert_frame_equal(frame.at_time(eastern_four), frame.iloc[1:2])

    def test_at_time_raises(self, frame_or_series):
        """A non-datetime index raises TypeError."""
        # GH#20725
        obj = tm.get_obj(DataFrame([[1, 2, 3], [4, 5, 6]]), frame_or_series)
        msg = "Index must be DatetimeIndex"
        with pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
            obj.at_time("00:00")

    @pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
    def test_at_time_axis(self, axis):
        """``axis`` chooses whether rows or columns are filtered."""
        # issue 8839
        stamps = date_range("1/1/2000", "1/5/2000", freq="5min")
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((len(stamps), len(stamps)))
        )
        frame.index, frame.columns = stamps, stamps

        is_half_past_nine = (
            (stamps.hour == 9) & (stamps.minute == 30) & (stamps.second == 0)
        )
        indices = stamps[is_half_past_nine]

        if axis in ["index", 0]:
            expected = frame.loc[indices, :]
        elif axis in ["columns", 1]:
            expected = frame.loc[:, indices]

        result = frame.at_time("9:30", axis=axis)

        # Without clearing freq, result has freq 1440T and expected 5T
        result.index = result.index._with_freq(None)
        expected.index = expected.index._with_freq(None)
        tm.assert_frame_equal(result, expected)

    def test_at_time_datetimeindex(self):
        """``at_time`` agrees with ``.loc[time]`` and positional selection."""
        index = date_range("2012-01-01", "2012-01-05", freq="30min")
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((len(index), 5)), index=index
        )
        noon = time(12, 0, 0)
        noon_positions = [24, 72, 120, 168]

        result = frame.at_time(noon)
        by_label = frame.loc[noon]
        by_position = frame.iloc[noon_positions]
        tm.assert_frame_equal(result, by_label)
        tm.assert_frame_equal(result, by_position)
        assert len(result) == 4

View File

@ -0,0 +1,227 @@
from datetime import (
datetime,
time,
)
import numpy as np
import pytest
from pandas._libs.tslibs import timezones
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
class TestBetweenTime:
@td.skip_if_not_us_locale
def test_between_time_formats(self, frame_or_series):
    """Every supported time-string spelling selects the same number of rows."""
    # GH#11818
    stamps = date_range("1/1/2000", "1/5/2000", freq="5min")
    values = np.random.default_rng(2).standard_normal((len(stamps), 2))
    obj = tm.get_obj(DataFrame(values, index=stamps), frame_or_series)

    spellings = [
        ("2:00", "2:30"),
        ("0200", "0230"),
        ("2:00am", "2:30am"),
        ("0200am", "0230am"),
        ("2:00:00", "2:30:00"),
        ("020000", "023000"),
        ("2:00:00am", "2:30:00am"),
        ("020000am", "023000am"),
    ]
    expected_length = 28

    for start, end in spellings:
        selected = obj.between_time(start, end)
        assert len(selected) == expected_length
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_localized_between_time(self, tzstr, frame_or_series):
    """Selecting after localizing equals localizing after selecting."""
    tz = timezones.maybe_get_tz(tzstr)

    hours = date_range("4/16/2012", "5/1/2012", freq="h")
    naive = Series(np.random.default_rng(2).standard_normal(len(hours)), index=hours)
    if frame_or_series is DataFrame:
        naive = naive.to_frame()

    localized = naive.tz_localize(tzstr)

    start, end = time(10, 0), time(11, 0)
    result = localized.between_time(start, end)
    expected = naive.between_time(start, end).tz_localize(tzstr)
    tm.assert_equal(result, expected)
    assert timezones.tz_compare(result.index.tz, tz)
def test_between_time_types(self, frame_or_series):
    """``datetime`` objects are rejected as start/end arguments."""
    # GH11818
    stamps = date_range("1/1/2000", "1/5/2000", freq="5min")
    obj = tm.get_obj(DataFrame({"A": 0}, index=stamps), frame_or_series)

    start = datetime(2010, 1, 2, 1)
    end = datetime(2010, 1, 2, 5)
    msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time"
    with pytest.raises(ValueError, match=msg):
        obj.between_time(start, end)
def test_between_time(self, inclusive_endpoints_fixture, frame_or_series):
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
stime = time(0, 0)
etime = time(1, 0)
inclusive = inclusive_endpoints_fixture
filtered = ts.between_time(stime, etime, inclusive=inclusive)
exp_len = 13 * 4 + 1
if inclusive in ["right", "neither"]:
exp_len -= 5
if inclusive in ["left", "neither"]:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inclusive in ["left", "both"]:
assert t >= stime
else:
assert t > stime
if inclusive in ["right", "both"]:
assert t <= etime
else:
assert t < etime
result = ts.between_time("00:00", "01:00")
expected = ts.between_time(stime, etime)
tm.assert_equal(result, expected)
# across midnight
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
stime = time(22, 0)
etime = time(9, 0)
filtered = ts.between_time(stime, etime, inclusive=inclusive)
exp_len = (12 * 11 + 1) * 4 + 1
if inclusive in ["right", "neither"]:
exp_len -= 4
if inclusive in ["left", "neither"]:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inclusive in ["left", "both"]:
assert (t >= stime) or (t <= etime)
else:
assert (t > stime) or (t <= etime)
if inclusive in ["right", "both"]:
assert (t <= etime) or (t >= stime)
else:
assert (t < etime) or (t >= stime)
def test_between_time_raises(self, frame_or_series):
# GH#20725
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
obj = tm.get_obj(obj, frame_or_series)
msg = "Index must be DatetimeIndex"
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
obj.between_time(start_time="00:00", end_time="12:00")
def test_between_time_axis(self, frame_or_series):
# GH#8839
rng = date_range("1/1/2000", periods=100, freq="10min")
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
if frame_or_series is DataFrame:
ts = ts.to_frame()
stime, etime = ("08:00:00", "09:00:00")
expected_length = 7
assert len(ts.between_time(stime, etime)) == expected_length
assert len(ts.between_time(stime, etime, axis=0)) == expected_length
msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}"
with pytest.raises(ValueError, match=msg):
ts.between_time(stime, etime, axis=ts.ndim)
def test_between_time_axis_aliases(self, axis):
# GH#8839
rng = date_range("1/1/2000", periods=100, freq="10min")
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
stime, etime = ("08:00:00", "09:00:00")
exp_len = 7
if axis in ["index", 0]:
ts.index = rng
assert len(ts.between_time(stime, etime)) == exp_len
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
if axis in ["columns", 1]:
ts.columns = rng
selected = ts.between_time(stime, etime, axis=1).columns
assert len(selected) == exp_len
def test_between_time_axis_raises(self, axis):
# issue 8839
rng = date_range("1/1/2000", periods=100, freq="10min")
mask = np.arange(0, len(rng))
rand_data = np.random.default_rng(2).standard_normal((len(rng), len(rng)))
ts = DataFrame(rand_data, index=rng, columns=rng)
stime, etime = ("08:00:00", "09:00:00")
msg = "Index must be DatetimeIndex"
if axis in ["columns", 1]:
ts.index = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime)
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=0)
if axis in ["index", 0]:
ts.columns = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=1)
def test_between_time_datetimeindex(self):
index = date_range("2012-01-01", "2012-01-05", freq="30min")
df = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
)
bkey = slice(time(13, 0, 0), time(14, 0, 0))
binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
result = df.between_time(bkey.start, bkey.stop)
expected = df.loc[bkey]
expected2 = df.iloc[binds]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected2)
assert len(result) == 12
def test_between_time_incorrect_arg_inclusive(self):
# GH40245
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
stime = time(0, 0)
etime = time(1, 0)
inclusive = "bad_string"
msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
with pytest.raises(ValueError, match=msg):
ts.between_time(stime, etime, inclusive=inclusive)

View File

@ -0,0 +1,199 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameClip:
    """Tests for ``DataFrame.clip`` with scalar, list-like, Series and frame bounds."""

    def test_clip(self, float_frame):
        """Clipping with identical bounds sets every value to that bound."""
        median = float_frame.median().median()
        original = float_frame.copy()

        double = float_frame.clip(upper=median, lower=median)
        assert not (double.values != median).any()

        # Verify that float_frame was not changed inplace
        assert (float_frame.values == original.values).all()

    def test_inplace_clip(self, float_frame):
        """``inplace=True`` returns None and modifies the frame itself."""
        # GH#15388
        median = float_frame.median().median()
        frame_copy = float_frame.copy()

        return_value = frame_copy.clip(upper=median, lower=median, inplace=True)
        assert return_value is None
        assert not (frame_copy.values != median).any()

    def test_dataframe_clip(self):
        """Scalar bounds clip correctly even when passed in swapped order."""
        # GH#2747
        df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))

        for lb, ub in [(-1, 1), (1, -1)]:
            clipped_df = df.clip(lb, ub)

            # normalize so the masks below use the effective bounds
            lb, ub = min(lb, ub), max(ub, lb)
            lb_mask = df.values <= lb
            ub_mask = df.values >= ub
            mask = ~lb_mask & ~ub_mask
            assert (clipped_df.values[lb_mask] == lb).all()
            assert (clipped_df.values[ub_mask] == ub).all()
            assert (clipped_df.values[mask] == df.values[mask]).all()

    def test_clip_mixed_numeric(self):
        """Clipping keeps each column's numeric dtype."""
        # clip on mixed integer or floats
        # GH#24162, clipping now preserves numeric types per column
        df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
        result = df.clip(1, 2)
        expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
        tm.assert_frame_equal(result, expected)

        df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
        expected = df.dtypes
        result = df.clip(upper=3).dtypes
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    def test_clip_against_series(self, inplace):
        """Row-aligned Series bounds are applied to every column."""
        # GH#6966
        df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
        lb = Series(np.random.default_rng(2).standard_normal(1000))
        ub = lb + 1

        original = df.copy()
        clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

        if inplace:
            clipped_df = df

        for i in range(2):
            lb_mask = original.iloc[:, i] <= lb
            ub_mask = original.iloc[:, i] >= ub
            mask = ~lb_mask & ~ub_mask

            result = clipped_df.loc[lb_mask, i]
            tm.assert_series_equal(result, lb[lb_mask], check_names=False)
            assert result.name == i

            result = clipped_df.loc[ub_mask, i]
            tm.assert_series_equal(result, ub[ub_mask], check_names=False)
            assert result.name == i

            # Compare with the pre-clip copy: when inplace=True, ``df`` is the
            # clipped object itself, so comparing against it would always pass.
            tm.assert_series_equal(clipped_df.loc[mask, i], original.loc[mask, i])

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
    @pytest.mark.parametrize(
        "axis,res",
        [
            (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
            (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
        ],
    )
    def test_clip_against_list_like(self, inplace, lower, axis, res):
        """List and ndarray bounds are aligned along the requested axis."""
        # GH#15390
        arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        original = DataFrame(
            arr, columns=["one", "two", "three"], index=["a", "b", "c"]
        )

        result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)

        expected = DataFrame(res, columns=original.columns, index=original.index)
        if inplace:
            result = original
        tm.assert_frame_equal(result, expected, check_exact=True)

    @pytest.mark.parametrize("axis", [0, 1, None])
    def test_clip_against_frame(self, axis):
        """Frame-shaped bounds clip element-wise for every ``axis`` value."""
        df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
        lb = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
        ub = lb + 1

        clipped_df = df.clip(lb, ub, axis=axis)

        lb_mask = df <= lb
        ub_mask = df >= ub
        mask = ~lb_mask & ~ub_mask

        tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
        tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
        tm.assert_frame_equal(clipped_df[mask], df[mask])

    def test_clip_against_unordered_columns(self):
        """Frame bounds are matched by column label, not by position."""
        # GH#20911
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((1000, 4)),
            columns=["A", "B", "C", "D"],
        )
        df2 = DataFrame(
            np.random.default_rng(2).standard_normal((1000, 4)),
            columns=["D", "A", "B", "C"],
        )
        df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])

        result_upper = df1.clip(lower=0, upper=df2)
        expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
        result_lower = df1.clip(lower=df3, upper=3)
        expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
        result_lower_upper = df1.clip(lower=df3, upper=df2)
        expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])

        tm.assert_frame_equal(result_upper, expected_upper)
        tm.assert_frame_equal(result_lower, expected_lower)
        tm.assert_frame_equal(result_lower_upper, expected_lower_upper)

    def test_clip_with_na_args(self, float_frame):
        """Should process np.nan argument as None"""
        # GH#17276
        tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
        tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)

        # GH#19992 and adjusted in GH#40420
        df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
        # TODO: avoid this warning here?  seems like we should never be upcasting
        #  in the first place?
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.clip(lower=[4, 5, np.nan], axis=0)
        expected = DataFrame(
            {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
        )
        tm.assert_frame_equal(result, expected)

        result = df.clip(lower=[4, 5, np.nan], axis=1)
        expected = DataFrame(
            {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
        )
        tm.assert_frame_equal(result, expected)

        # GH#40420
        data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
        df = DataFrame(data)
        t = Series([2, -4, np.nan, 6, 3])
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.clip(lower=t, axis=0)
        expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
        tm.assert_frame_equal(result, expected)

    def test_clip_int_data_with_float_bound(self):
        """A float bound on integer data yields a float result."""
        # GH51472
        df = DataFrame({"a": [1, 2, 3]})
        result = df.clip(lower=1.5)
        expected = DataFrame({"a": [1.5, 2.0, 3.0]})
        tm.assert_frame_equal(result, expected)

    def test_clip_with_list_bound(self):
        """A one-element list bound is applied to all rows."""
        # GH#54817
        df = DataFrame([1, 5])
        expected = DataFrame([3, 5])
        result = df.clip([3])
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([1, 3])
        result = df.clip(upper=[3])
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,47 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class TestCombine:
    """Tests for ``DataFrame.combine`` with user-supplied combiner functions."""

    @pytest.mark.parametrize(
        "data",
        [
            pd.date_range("2000", periods=4),
            pd.date_range("2000", periods=4, tz="US/Central"),
            pd.period_range("2000", periods=4),
            pd.timedelta_range(0, periods=4),
        ],
    )
    def test_combine_datetlike_udf(self, data):
        """A combiner that returns its second argument reproduces ``other``."""
        # GH#23079
        df = pd.DataFrame({"A": data})
        other = df.copy()
        df.iloc[1, 0] = None

        def take_second(left, right):
            return right

        tm.assert_frame_equal(df.combine(other, take_second), other)

    def test_combine_generic(self, float_frame):
        """Combining with a sub-frame adds the overlap and leaves the rest missing."""
        abc = ["A", "B", "C"]
        df1 = float_frame
        df2 = float_frame.loc[float_frame.index[:-5], abc]

        combined = df1.combine(df2, np.add)
        combined2 = df2.combine(df1, np.add)

        # column "D" exists on one side only
        assert combined["D"].isna().all()
        assert combined2["D"].isna().all()

        chunk = combined.loc[combined.index[:-5], abc]
        chunk2 = combined2.loc[combined2.index[:-5], abc]

        doubled = float_frame.loc[float_frame.index[:-5], abc].reindex_like(chunk) * 2
        tm.assert_frame_equal(chunk, doubled)
        tm.assert_frame_equal(chunk2, doubled)

View File

@ -0,0 +1,556 @@
from datetime import datetime
import numpy as np
import pytest
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import is_dtype_equal
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
class TestDataFrameCombineFirst:
    """Tests for ``DataFrame.combine_first`` across dtypes and index layouts."""

    def test_combine_first_mixed(self):
        """Frames with disjoint indexes and mixed dtypes are stacked."""
        f = DataFrame(
            {
                "A": Series(["a", "b"], index=range(2)),
                "B": Series(range(2), index=range(2)),
            }
        )
        g = DataFrame(
            {
                "A": Series(["a", "b"], index=range(5, 7)),
                "B": Series(range(2), index=range(5, 7)),
            }
        )

        exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
        tm.assert_frame_equal(f.combine_first(g), exp)

    def test_combine_first(self, float_frame, using_infer_string):
        """Disjoint, identical, overlapping and empty operands."""
        # disjoint
        head, tail = float_frame[:5], float_frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = float_frame.reindex(combined.index)
        tm.assert_frame_equal(combined, reordered_frame)
        tm.assert_index_equal(combined.columns, float_frame.columns)
        tm.assert_series_equal(combined["A"], reordered_frame["A"])

        # same index
        fcopy = float_frame.copy()
        fcopy["A"] = 1
        del fcopy["C"]

        fcopy2 = float_frame.copy()
        fcopy2["B"] = 0
        del fcopy2["D"]

        combined = fcopy.combine_first(fcopy2)

        assert (combined["A"] == 1).all()
        tm.assert_series_equal(combined["B"], fcopy["B"])
        tm.assert_series_equal(combined["C"], fcopy2["C"])
        tm.assert_series_equal(combined["D"], fcopy["D"])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head["A"] = 1

        combined = head.combine_first(tail)
        assert (combined["A"][:10] == 1).all()

        # reverse overlap
        tail.iloc[:10, tail.columns.get_loc("A")] = 0
        combined = tail.combine_first(head)
        assert (combined["A"][:10] == 0).all()

        # no overlap
        f = float_frame[:10]
        g = float_frame[10:]
        combined = f.combine_first(g)
        tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
        tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])

        # corner cases
        warning = FutureWarning if using_infer_string else None
        with tm.assert_produces_warning(warning, match="empty entries"):
            comb = float_frame.combine_first(DataFrame())
        tm.assert_frame_equal(comb, float_frame)

        comb = DataFrame().combine_first(float_frame)
        tm.assert_frame_equal(comb, float_frame.sort_index())

        comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
        df2 = DataFrame(columns=["b"])
        result = df.combine_first(df2)
        assert "b" in result

    def test_combine_first_mixed_bug(self):
        """The result carries the union of both frames' columns."""
        idx = Index(["a", "b", "c", "e"])
        frame1 = DataFrame(
            {
                "col0": Series([5.0, -9.0, 4.0, 100.0], index=idx),
                "col2": Series(["a", "b", "c", "e"], index=idx),
                "col3": Series([12, 4, 5, 97], index=idx),
            }
        )

        idx = Index(["a", "b", "c", "f"])
        frame2 = DataFrame(
            {
                "col1": Series([5.0, -9.0, 4.0, 100.0], index=idx),
                "col2": Series(["a", "b", "c", "f"], index=idx),
                "col5": Series([12, 4, 5, 97], index=idx),
            }
        )

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

    def test_combine_first_same_as_in_update(self):
        """Only missing cells are filled; bool columns are left untouched."""
        # gh 3016 (same as in update)
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )
        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])

        result = df.combine_first(other)
        tm.assert_frame_equal(result, df)

        df.loc[0, "A"] = np.nan
        result = df.combine_first(other)
        # patch ``df`` afterwards so it doubles as the expected frame
        df.loc[0, "A"] = 45
        tm.assert_frame_equal(result, df)

    def test_combine_first_doc_example(self):
        """The documentation example produces the documented output."""
        # doc example
        df1 = DataFrame(
            {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
        )
        df2 = DataFrame(
            {
                "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
                "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
            }
        )

        result = df1.combine_first(df2)

        expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
        tm.assert_frame_equal(result, expected)

    def test_combine_first_return_obj_type_with_bools(self):
        """A bool column stays bool in both call directions."""
        # GH3552
        df1 = DataFrame(
            [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
        )
        df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])

        expected = Series([True, True, False], name=2, dtype=bool)

        result_12 = df1.combine_first(df2)[2]
        tm.assert_series_equal(result_12, expected)

        result_21 = df2.combine_first(df1)[2]
        tm.assert_series_equal(result_21, expected)

    @pytest.mark.parametrize(
        "data1, data2, data_expected",
        (
            (
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [pd.NaT, pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [pd.NaT, pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [datetime(2000, 1, 2), pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 2), pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
        ),
    )
    def test_combine_first_convert_datatime_correctly(
        self, data1, data2, data_expected
    ):
        """NaT entries are filled from the other frame's datetimes."""
        # GH 3593
        left = DataFrame({"a": data1})
        right = DataFrame({"a": data2})

        result = left.combine_first(right)

        tm.assert_frame_equal(result, DataFrame({"a": data_expected}))

    def test_combine_first_align_nan(self):
        """Pins the dtypes currently produced when alignment adds rows."""
        # GH 7509 (not fixed)
        dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
        dfb = DataFrame([[4], [5]], columns=["b"])
        assert dfa["a"].dtype == "datetime64[ns]"
        assert dfa["b"].dtype == "int64"

        res = dfa.combine_first(dfb)
        exp = DataFrame(
            {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]},
            columns=["a", "b"],
        )
        tm.assert_frame_equal(res, exp)
        assert res["a"].dtype == "datetime64[ns]"
        # TODO: this must be int64
        assert res["b"].dtype == "int64"

        res = dfa.iloc[:0].combine_first(dfb)
        exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
        tm.assert_frame_equal(res, exp)
        # TODO: this must be datetime64
        assert res["a"].dtype == "float64"
        # TODO: this must be int64
        assert res["b"].dtype == "int64"

    def test_combine_first_timezone(self, unit):
        """tz-aware columns keep their dtype after combining."""
        # see gh-7630
        data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit)
        df1 = DataFrame(
            columns=["UTCdatetime", "abc"],
            data=data1,
            index=pd.date_range("20140627", periods=1),
        )
        data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit)
        df2 = DataFrame(
            columns=["UTCdatetime", "xyz"],
            data=data2,
            index=pd.date_range("20140628", periods=1),
        )

        res = df2[["UTCdatetime"]].combine_first(df1)

        tz_dtype = f"datetime64[{unit}, UTC]"
        exp = DataFrame(
            {
                "UTCdatetime": [
                    pd.Timestamp("2010-01-01 01:01", tz="UTC"),
                    pd.Timestamp("2012-12-12 12:12", tz="UTC"),
                ],
                "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
            },
            columns=["UTCdatetime", "abc"],
            index=pd.date_range("20140627", periods=2, freq="D"),
            dtype=tz_dtype,
        )
        assert res["UTCdatetime"].dtype == tz_dtype
        assert res["abc"].dtype == tz_dtype

        tm.assert_frame_equal(res, exp)

    def test_combine_first_timezone2(self, unit):
        """A fully populated tz-aware caller is returned unchanged."""
        # see gh-10567
        dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit)
        df1 = DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit)
        df2 = DataFrame({"DATE": dts2})

        res = df1.combine_first(df2)

        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == f"datetime64[{unit}, UTC]"

    def test_combine_first_timezone3(self, unit):
        """Same-tz frames with interleaved indexes are merged in index order."""
        dts1 = pd.DatetimeIndex(
            ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
        ).as_unit(unit)
        df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
        dts2 = pd.DatetimeIndex(
            ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
        ).as_unit(unit)
        df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)

        exp_dts = pd.DatetimeIndex(
            [
                "2011-01-01",
                "2012-01-01",
                "NaT",
                "2012-01-02",
                "2011-01-03",
                "2011-01-04",
            ],
            tz="US/Eastern",
        ).as_unit(unit)
        exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)

    # FIXME: parametrizing over unit breaks on non-nano
    def test_combine_first_timezone4(self):
        """With differing tz and nothing to fill, the caller's dtype is kept."""
        # different tz
        dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
        df1 = DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-03", "2015-01-05")
        df2 = DataFrame({"DATE": dts2})

        # if df1 doesn't have NaN, keep its dtype
        res = df1.combine_first(df2)

        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"

    def test_combine_first_timezone5(self, unit):
        """Mixing tz-aware and naive values yields an object column."""
        dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit)
        df1 = DataFrame({"DATE": dts1})
        dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit)
        df2 = DataFrame({"DATE": dts2})

        res = df1.combine_first(df2)

        exp_dts = [
            pd.Timestamp("2015-01-01", tz="US/Eastern"),
            pd.Timestamp("2015-01-02", tz="US/Eastern"),
            pd.Timestamp("2015-01-03"),
        ]
        exp = DataFrame({"DATE": exp_dts})
        tm.assert_frame_equal(res, exp)
        assert res["DATE"].dtype == "object"

    def test_combine_first_timedelta(self):
        """timedelta columns keep their dtype."""
        data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
        df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
        data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
        df2 = DataFrame({"TD": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)

        exp_dts = pd.TimedeltaIndex(
            ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
        )
        exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["TD"].dtype == "timedelta64[ns]"

    def test_combine_first_period(self):
        """Period dtype survives a matching freq; differing freqs give object."""
        data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
        df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
        data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
        df2 = DataFrame({"P": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)

        exp_dts = pd.PeriodIndex(
            ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
        )
        exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == data1.dtype

        # different freq
        dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
        df2 = DataFrame({"P": dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)

        exp_dts = [
            pd.Period("2011-01", freq="M"),
            pd.Period("2012-01-01", freq="D"),
            pd.NaT,
            pd.Period("2012-01-02", freq="D"),
            pd.Period("2011-03", freq="M"),
            pd.Period("2011-04", freq="M"),
        ]
        exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == "object"

    def test_combine_first_int(self):
        """Integer frames of different length combine without a float cast."""
        # GH14687 - integer series that do no align exactly
        df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
        df2 = DataFrame({"a": [1, 4]}, dtype="int64")

        result_12 = df1.combine_first(df2)
        tm.assert_frame_equal(result_12, DataFrame({"a": [0, 1, 3, 5]}))

        result_21 = df2.combine_first(df1)
        tm.assert_frame_equal(result_21, DataFrame({"a": [1, 4, 3, 5]}))

    @pytest.mark.parametrize("val", [1, 1.0])
    def test_combine_first_with_asymmetric_other(self, val):
        """Columns present on one side only keep their values."""
        # see gh-20699
        df1 = DataFrame({"isNum": [val]})
        df2 = DataFrame({"isBool": [True]})

        res = df1.combine_first(df2)

        exp = DataFrame({"isBool": [True], "isNum": [val]})
        tm.assert_frame_equal(res, exp)

    def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
        """An all-NA string index level does not break the combination."""
        # GH: 37519
        keys = ["a", "b"]
        df = DataFrame(
            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
        )
        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
        df.set_index(keys, inplace=True)
        df2.set_index(keys, inplace=True)

        result = df.combine_first(df2)

        expected = DataFrame(
            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
        ).set_index(["a", "b"])
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "scalar1, scalar2",
    [
        (datetime(2020, 1, 1), datetime(2020, 1, 2)),
        (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
        (pd.Timedelta("89 days"), pd.Timedelta("60 min")),
        (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
    ],
)
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
    """An all-null frame combined with datetime-like scalars of various types."""
    # GH28481
    na_value = nulls_fixture

    frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
    other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])

    result = frame.combine_first(other)

    frame_dtype = frame.dtypes["b"]
    other_dtype = other.dtypes["b"]
    common_dtype = find_common_type([frame_dtype, other_dtype])

    # "b" is expected to take ``scalar1`` when the common dtype is object or
    # both dtypes already agree; otherwise it stays null.
    fills = is_dtype_equal(common_dtype, "object") or frame_dtype == other_dtype
    val = scalar1 if fills else na_value

    expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
    expected["b"] = expected["b"].astype(common_dtype)

    tm.assert_frame_equal(result, expected)
def test_combine_first_timestamp_bug_NaT():
    """A NaT cell is filled from ``other``; new columns are appended."""
    # GH28481
    first, second = datetime(2020, 1, 1), datetime(2020, 1, 2)

    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame([[first, second]], columns=["b", "c"])

    result = frame.combine_first(other)

    expected = DataFrame([[pd.NaT, first, second]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
def test_combine_first_with_nan_multiindex():
    """A NaN key in a MultiIndex level is kept as its own row."""
    # gh-36562
    level_names = ["a", "b"]

    mi1 = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=level_names
    )
    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)

    mi2 = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=level_names
    )
    other = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=mi2)})

    res = df.combine_first(other)

    mi_expected = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    expected = DataFrame(
        {
            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
        },
        index=mi_expected,
    )
    tm.assert_frame_equal(res, expected)
def test_combine_preserve_dtypes():
    """The shared integer column stays integer when the indexes are disjoint."""
    # GH7509
    df1 = DataFrame(
        {
            "A": Series(["a", "b"], index=range(2)),
            "B": Series(range(2), index=range(2)),
        }
    )
    df2 = DataFrame(
        {
            "B": Series(range(-1, 1), index=range(5, 7)),
            "C": Series(["a", "b"], index=range(5, 7)),
        }
    )

    combined = df1.combine_first(df2)

    expected = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )
    tm.assert_frame_equal(combined, expected)
def test_combine_first_duplicates_rows_for_nan_index_values():
# GH39881
df1 = DataFrame(
{"x": [9, 10, 11]},
index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]),
)
df2 = DataFrame(
{"y": [12, 13, 14]},
index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]),
)
expected = DataFrame(
{
"x": [9.0, 10.0, 11.0, np.nan],
"y": [12.0, 13.0, np.nan, 14.0],
},
index=MultiIndex.from_arrays(
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
),
)
combined = df1.combine_first(df2)
tm.assert_frame_equal(combined, expected)
def test_combine_first_int64_not_cast_to_float64():
    """Integer columns stay integer when ``other`` only adds a new column."""
    # GH 28613
    base = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    wider = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})

    result = base.combine_first(wider)

    expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(result, expected)
def test_midx_losing_dtype():
    """Combining frames whose MultiIndex has an all-NaN level."""
    # GH#49830
    nan_pair = [np.nan, np.nan]
    df1 = DataFrame(
        {"a": [None, 4]}, index=MultiIndex.from_arrays([[0, 0], nan_pair])
    )
    df2 = DataFrame({"a": [3, 3]}, index=MultiIndex.from_arrays([[1, 1], nan_pair]))

    result = df1.combine_first(df2)

    expected_midx = MultiIndex.from_arrays(
        [[0, 0, 1, 1], [np.nan, np.nan, np.nan, np.nan]]
    )
    expected = DataFrame({"a": [np.nan, 4, 3, 3]}, index=expected_midx)
    tm.assert_frame_equal(result, expected)
def test_combine_first_empty_columns():
left = DataFrame(columns=["a", "b"])
right = DataFrame(columns=["a", "c"])
result = left.combine_first(right)
expected = DataFrame(columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,305 @@
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p25
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """``align_axis`` decides whether self/other are stacked as rows or columns."""
    # GH#30429
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    sides = ["self", "other"]
    if align_axis in (1, "columns"):
        # self/other become the inner column level
        expected = pd.DataFrame(
            [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
            index=pd.Index([0, 2]),
            columns=pd.MultiIndex.from_product([["col1", "col3"], sides]),
        )
    else:
        # self/other become the inner row level
        expected = pd.DataFrame(
            [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
            index=pd.MultiIndex.from_product([[0, 2], sides]),
            columns=pd.Index(["col1", "col3"]),
        )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Output layout for the ``keep_shape`` / ``keep_equal`` combinations."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal)

    sides = ["self", "other"]
    if keep_shape:
        # every original row and column is retained
        indices = pd.Index([0, 1, 2])
        columns = pd.MultiIndex.from_product([["col1", "col2", "col3"], sides])
        if keep_equal:
            data = [
                ["a", "c", 1.0, 1.0, 1.0, 1.0],
                ["b", "b", 2.0, 2.0, 2.0, 2.0],
                ["c", "c", np.nan, np.nan, 3.0, 4.0],
            ]
        else:
            data = [
                ["a", "c", np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
            ]
        expected = pd.DataFrame(data, index=indices, columns=columns)
    else:
        # only rows/columns with a difference remain
        indices = pd.Index([0, 2])
        columns = pd.MultiIndex.from_product([["col1", "col3"], sides])
        expected = pd.DataFrame(
            [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns
        )
    tm.assert_frame_equal(result, expected)
def test_compare_with_equal_nulls():
    """Two NaNs in the same cell count as equal and are dropped."""
    # We want to make sure two NaNs are considered the same
    # and dropped where applicable
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"

    result = df.compare(df2)

    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
def test_compare_with_non_equal_nulls():
    """A NaN that differs from a real value is reported, not dropped."""
    # We want to make sure the relevant NaNs do not get dropped
    # even if the entire row or column are NaNs
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = np.nan

    result = df.compare(df2)

    expected = pd.DataFrame(
        [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
        index=pd.Index([0, 2]),
        columns=pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """``compare`` appends the self/other level to existing MultiIndex axes."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]])
    df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    df2 = df.copy()
    df2.iloc[0, 0] = "c"
    df2.iloc[2, 2] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    if align_axis == 0:
        # self/other is appended as the innermost row level
        indices = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]]
        )
        columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]
    else:
        # self/other is appended as the innermost column level
        indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        columns = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                ["self", "other", "self", "other"],
            ]
        )
        data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]

    expected = pd.DataFrame(data=data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_unaligned_objects():
    """``compare`` raises ValueError when labels or shapes of the frames differ."""
    # Both scenarios expect the same error, so the pattern is defined once.
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # The frames are constructed outside ``pytest.raises`` so that only the
    # ``compare`` call itself can satisfy the expectation; a ValueError raised
    # during setup would otherwise be mistaken for the behaviour under test.

    # test DataFrames with different indices
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
def test_compare_result_names():
    """``result_names`` replaces the default self/other labels in the output."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    result = left.compare(right, result_names=("left", "right"))

    expected_cells = {
        ("col1", "left"): {0: "a", 2: np.nan},
        ("col1", "right"): {0: "c", 2: np.nan},
        ("col3", "left"): {0: np.nan, 2: 3.0},
        ("col3", "right"): {0: np.nan, 2: np.nan},
    }
    expected = pd.DataFrame(expected_cells)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Anything other than a tuple for ``result_names`` raises TypeError."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )
    # The message embeds the offending type, so it is built per parameter.
    expected_msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_msg):
        left.compare(right, result_names=result_names)
@pytest.mark.parametrize(
    "val1,val2",
    [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
)
def test_compare_ea_and_np_dtype(val1, val2):
    """Compare a list-built column against an ``Int64`` extension column."""
    # GH 48966
    arr = [4.0, val1]
    ser = pd.Series([1, val2], dtype="Int64")

    left = pd.DataFrame({"a": arr, "b": [1.0, 2]})
    right = pd.DataFrame({"a": ser, "b": [1.0, 2]})

    expected = pd.DataFrame(
        {
            ("a", "self"): arr,
            ("a", "other"): ser,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    first_is_na = val1 is pd.NA
    if first_is_na and val2 is pd.NA:
        # GH#18463 TODO: is this really the desired behavior?
        expected.loc[1, ("a", "self")] = np.nan

    if first_is_na and np_version_gte1p25:
        # can't compare with numpy array if it contains pd.NA
        with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
            left.compare(right, keep_shape=True)
    else:
        result = left.compare(right, keep_shape=True)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """``compare`` with ``keep_shape=True`` on frames holding an ``Int64`` column."""
    # GH 48966
    left = pd.DataFrame(
        {"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}
    )
    right = left.copy()
    right.loc[0, "a"] = df2_val

    self_col = pd.Series([diff_self, pd.NA], dtype="Int64")
    other_col = pd.Series([diff_other, pd.NA], dtype="Int64")
    expected = pd.DataFrame(
        {
            ("a", "self"): self_col,
            ("a", "other"): other_col,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )

    result = left.compare(right, keep_shape=True)
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,202 @@
import datetime
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class TestConvertDtypes:
    """DataFrame-level checks for ``convert_dtypes``."""

    @pytest.mark.parametrize(
        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
    )
    def test_convert_dtypes(
        self, convert_integer, expected, string_storage, using_infer_string
    ):
        # Specific types are tested in tests/series/test_dtypes.py
        # Just check that it works for DataFrame here
        if using_infer_string:
            string_storage = "pyarrow_numpy"
        int_col = pd.Series([1, 2, 3], dtype=np.dtype("int32"))
        obj_col = pd.Series(["x", "y", "z"], dtype=np.dtype("O"))
        df = pd.DataFrame({"a": int_col, "b": obj_col})

        with pd.option_context("string_storage", string_storage):
            result = df.convert_dtypes(True, True, convert_integer, False)

        expected = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=expected),
                "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_empty(self):
        # Empty DataFrame can pass convert_dtypes, see GH#40393
        empty_df = pd.DataFrame()
        converted = empty_df.convert_dtypes()
        tm.assert_frame_equal(empty_df, converted)

    def test_convert_dtypes_retain_column_names(self):
        # GH#41435
        df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        df.columns.name = "cols"

        result = df.convert_dtypes()

        tm.assert_index_equal(result.columns, df.columns)
        assert result.columns.name == "cols"

    def test_pyarrow_dtype_backend(self):
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
                "c": pd.Series([True, False, None], dtype=np.dtype("O")),
                "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
                "e": pd.Series(pd.date_range("2022", periods=3)),
                "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
                "g": pd.Series(pd.timedelta_range("1D", periods=3)),
            }
        )

        result = df.convert_dtypes(dtype_backend="pyarrow")

        arrow_array = pd.arrays.ArrowExtensionArray
        # The same three calendar days back both timestamp columns.
        days = [datetime.datetime(2022, 1, day) for day in (1, 2, 3)]
        deltas = [datetime.timedelta(n) for n in (1, 2, 3)]
        expected = pd.DataFrame(
            {
                "a": arrow_array(pa.array([1, 2, 3], type=pa.int32())),
                "b": arrow_array(pa.array(["x", "y", None])),
                "c": arrow_array(pa.array([True, False, None])),
                "d": arrow_array(pa.array([None, 100.5, 200.0])),
                "e": arrow_array(pa.array(days, type=pa.timestamp(unit="ns"))),
                "f": arrow_array(
                    pa.array(days, type=pa.timestamp(unit="s", tz="UTC"))
                ),
                "g": arrow_array(pa.array(deltas, type=pa.duration("ns"))),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_already_pyarrow(self):
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_from_pandas_nullable(self):
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, None], dtype="Int32"),
                "b": pd.Series(["x", "y", None], dtype="string[python]"),
                "c": pd.Series([True, False, None], dtype="boolean"),
                "d": pd.Series([None, 100.5, 200], dtype="Float64"),
            }
        )

        result = df.convert_dtypes(dtype_backend="pyarrow")

        arrow_array = pd.arrays.ArrowExtensionArray
        expected = pd.DataFrame(
            {
                "a": arrow_array(pa.array([1, 2, None], type=pa.int32())),
                "b": arrow_array(pa.array(["x", "y", None])),
                "c": arrow_array(pa.array([True, False, None])),
                "d": arrow_array(pa.array([None, 100.5, 200.0])),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_empty_object(self):
        # GH 50970
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame(columns=[0])
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_engine_lines_false(self):
        # GH 48893
        df = pd.DataFrame({"a": [1, 2, 3]})
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        with pytest.raises(ValueError, match=msg):
            df.convert_dtypes(dtype_backend="numpy")

    def test_pyarrow_backend_no_conversion(self):
        # GH#52872
        pytest.importorskip("pyarrow")
        df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
        expected = df.copy()
        # Every conversion is switched off, so the frame must come back unchanged.
        conversion_flags = {
            "convert_floating": False,
            "convert_integer": False,
            "convert_boolean": False,
            "convert_string": False,
        }
        result = df.convert_dtypes(dtype_backend="pyarrow", **conversion_flags)
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        # GH 53648
        pytest.importorskip("pyarrow")
        arrow_frame = pd.DataFrame(range(2), dtype="int32[pyarrow]")
        result = arrow_frame.convert_dtypes(dtype_backend="numpy_nullable")
        expected = pd.DataFrame(range(2), dtype="Int32")
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_timestamp(self):
        # GH 54191
        pytest.importorskip("pyarrow")
        minutes = pd.date_range("2020-01-01", "2020-01-02", freq="1min")
        expected = pd.Series(minutes).astype("timestamp[ms][pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_avoid_block_splitting(self):
        # GH#55341
        df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})

        result = df.convert_dtypes(convert_integer=False)

        string_col = pd.Series(["a"] * 3, dtype="string[python]")
        expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": string_col})
        tm.assert_frame_equal(result, expected)
        assert result._mgr.nblocks == 2

    def test_convert_dtypes_from_arrow(self):
        # GH#56581
        df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
        result = df.convert_dtypes()
        expected = df.astype({"a": "string[python]"})
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,64 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
class TestCopy:
    """Tests for ``DataFrame.copy``."""

    @pytest.mark.parametrize("attr", ["index", "columns"])
    def test_copy_index_name_checking(self, float_frame, attr):
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        original_axis = getattr(float_frame, attr)
        original_axis.name = None

        duplicate = float_frame.copy()
        copied_axis = getattr(duplicate, attr)
        copied_axis.name = "foo"

        assert getattr(float_frame, attr).name is None

    @td.skip_copy_on_write_invalid_test
    def test_copy_cache(self):
        # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
        # NOTE: the statement order below is the scenario under test; keep it.
        frame = DataFrame({"a": [1]})

        frame["x"] = [0]
        frame["a"]

        frame.copy()

        frame["a"].values[0] = -1

        tm.assert_frame_equal(frame, DataFrame({"a": [-1], "x": [0]}))

        frame["y"] = [0]

        assert frame["a"].values[0] == -1
        tm.assert_frame_equal(frame, DataFrame({"a": [-1], "x": [0], "y": [0]}))

    def test_copy(self, float_frame, float_string_frame):
        # adding a column to the copy must not leak into the source frame
        duplicate = float_frame.copy()
        duplicate["E"] = duplicate["A"]
        assert "E" not in float_frame

        # copy objects
        mixed_copy = float_string_frame.copy()
        assert mixed_copy._mgr is not float_string_frame._mgr

    @td.skip_array_manager_invalid_test
    def test_copy_consolidates(self):
        # GH#42477
        df = DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 100, size=55),
                "b": np.random.default_rng(2).integers(0, 100, size=55),
            }
        )

        for col_num in range(10):
            df.loc[:, f"n_{col_num}"] = np.random.default_rng(2).integers(
                0, 100, size=55
            )

        assert len(df._mgr.blocks) == 11
        consolidated = df.copy()
        assert len(consolidated._mgr.blocks) == 1

View File

@ -0,0 +1,39 @@
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameCount:
def test_count(self):
# corner case
frame = DataFrame()
ct1 = frame.count(1)
assert isinstance(ct1, Series)
ct2 = frame.count(0)
assert isinstance(ct2, Series)
# GH#423
df = DataFrame(index=range(10))
result = df.count(1)
expected = Series(0, index=df.index)
tm.assert_series_equal(result, expected)
df = DataFrame(columns=range(10))
result = df.count(0)
expected = Series(0, index=df.columns)
tm.assert_series_equal(result, expected)
df = DataFrame()
result = df.count()
expected = Series(dtype="int64")
tm.assert_series_equal(result, expected)
def test_count_objects(self, float_string_frame):
dm = DataFrame(float_string_frame._series)
df = DataFrame(float_string_frame._series)
tm.assert_series_equal(dm.count(), df.count())
tm.assert_series_equal(dm.count(1), df.count(1))

View File

@ -0,0 +1,471 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
date_range,
isna,
)
import pandas._testing as tm
class TestDataFrameCov:
    """Tests for ``DataFrame.cov``."""

    def test_cov(self, float_frame, float_string_frame):
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame.iloc[:5, frame.columns.get_loc("A")] = np.nan
        frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)

        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)

        # fails on non-numeric types
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.cov()
        result = float_string_frame.cov(numeric_only=True)
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
        )
        tm.assert_frame_equal(result, expected)
        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        # GH#34611
        # Draw both columns from ONE generator. Creating a fresh
        # ``default_rng(2)`` per column yields two identical arrays, which
        # makes every entry of the covariance matrix equal and leaves the
        # off-diagonal (true covariance) path effectively untested.
        rng = np.random.default_rng(2)
        np_array1 = rng.random(10)
        np_array2 = rng.random(10)
        df = DataFrame({0: np_array1, 1: np_array2})
        result = df.cov(ddof=test_ddof)
        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
    )
    def test_cov_nullable_integer(self, other_column):
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_cov_numeric_only(self, numeric_only):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(0.5, index=["a"], columns=["a"])
        if numeric_only:
            result = df.cov(numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
        else:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.cov(numeric_only=numeric_only)
class TestDataFrameCorr:
    """Tests for ``DataFrame.corr``."""

    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    def test_corr_scipy_method(self, float_frame, method):
        pytest.importorskip("scipy")
        labels = float_frame.index
        float_frame.loc[labels[:5], "A"] = np.nan
        float_frame.loc[labels[5:10], "B"] = np.nan
        float_frame.loc[labels[:10], "A"] = float_frame["A"][10:20].copy()

        correls = float_frame.corr(method=method)
        expected = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(correls["A"]["C"], expected)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.corr()

        result = float_string_frame.corr(numeric_only=True)
        numeric_part = float_string_frame.loc[:, ["A", "B", "C", "D"]]
        expected = numeric_part.corr()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        # nothing in common
        pytest.importorskip("scipy")
        df = DataFrame(
            {
                "A": [1, 1.5, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            }
        )
        rs = df.corr(meth)
        # cross terms have no overlapping observations
        assert isna(rs.loc["A", "B"])
        assert isna(rs.loc["B", "A"])
        # each populated column is perfectly correlated with itself
        assert rs.loc["A", "A"] == 1
        assert rs.loc["B", "B"] == 1
        assert isna(rs.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        # constant --> all NA
        df = DataFrame(
            {
                "A": [1, 1, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1, 1],
            }
        )
        correlations = df.corr(meth)
        assert isna(correlations.values).all()

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        df = DataFrame({"a": [True, False], "b": [1, 0]})

        result = df.corr(meth)

        expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        # GH#14617
        values = np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4)
        df = DataFrame(values, columns=list("abcd"))
        result = getattr(df, method)()
        # equal labels, but two distinct Index objects
        assert result.index is not result.columns
        assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        # GH#22298
        df = DataFrame(np.random.default_rng(2).normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_corr_int(self):
        # dtypes other than float64 GH#1761
        int_frame = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        # smoke test only: both calls just have to succeed
        int_frame.cov()
        int_frame.corr()

    @pytest.mark.parametrize(
        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
    )
    @pytest.mark.parametrize(
        "other_column",
        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column, method):
        # https://github.com/pandas-dev/pandas/issues/33803
        pytest.importorskip("scipy")
        data = DataFrame({"a": nullable_column, "b": other_column})

        result = data.corr(method=method)

        expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write):
        # Check that corr does not lead to incorrect entries in item_cache

        df = DataFrame({"A": range(10)})
        df["B"] = range(10)[::-1]

        ser = df["A"]  # populate item_cache
        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks

        _ = df.corr(numeric_only=True)

        if not using_copy_on_write:
            # Check that the corr didn't break link between ser and df
            ser.values[0] = 99
            assert df.loc[0, "A"] == 99
            if not warn_copy_on_write:
                assert df["A"] is ser
            assert df.values[0, 0] == 99
        else:
            ser.iloc[0] = 99
            assert df.loc[0, "A"] == 0

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        # GH: 37448
        df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
        result = df.corr()
        all_nan = {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}
        expected = DataFrame(all_nan, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    def test_calc_corr_small_numbers(self):
        # GH: 37452
        tiny = [1.0e-20, 2.0e-20, 3.0e-20]
        df = DataFrame({"A": list(tiny), "B": list(tiny)})
        result = df.corr()
        expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        pytest.importorskip("scipy")
        df = DataFrame({"A": [1, 2], "B": [1, 2]})
        result = df.corr(method=method, min_periods=3)
        all_nan = {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}
        expected = DataFrame(all_nan, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corr_numeric_only(self, meth, numeric_only):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
        if not numeric_only:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.corr(meth, numeric_only=numeric_only)
        else:
            result = df.corr(meth, numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
class TestDataFrameCorrWith:
    """Tests for ``DataFrame.corrwith``."""

    @pytest.mark.parametrize(
        "dtype",
        [
            "float64",
            "Float64",
            pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
        ],
    )
    def test_corrwith(self, datetime_frame, dtype):
        datetime_frame = datetime_frame.astype(dtype)

        a = datetime_frame
        noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index)

        b = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
        del b["B"]

        by_column = a.corrwith(b, axis=0)
        tm.assert_almost_equal(by_column["A"], a["A"].corr(b["A"]))

        by_row = a.corrwith(b, axis=1)
        tm.assert_series_equal(by_row, a.T.corrwith(b.T, axis=0))

        dropped = a.corrwith(b, axis=0, drop=True)
        tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
        assert "B" not in dropped

        dropped = a.corrwith(b, axis=1, drop=True)
        assert a.index[-1] not in dropped.index

        # non time-series data
        index = ["a", "b", "c", "d", "e"]
        columns = ["one", "two", "three", "four"]
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((5, 4)),
            index=index,
            columns=columns,
        )
        df2 = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=index[:4],
            columns=columns,
        )
        correls = df1.corrwith(df2, axis=1)
        for label in index[:4]:
            row_corr = df1.loc[label].corr(df2.loc[label])
            tm.assert_almost_equal(correls[label], row_corr)

    def test_corrwith_with_objects(self, using_infer_string):
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=date_range("2000-01-01", periods=10, freq="B"),
        )
        df2 = df1.copy()
        cols = ["A", "B", "C", "D"]

        df1["obj"] = "foo"
        df2["obj"] = "bar"

        if not using_infer_string:
            with pytest.raises(TypeError, match="Could not convert"):
                df1.corrwith(df2)
        else:
            import pyarrow as pa

            with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
                df1.corrwith(df2)

        result = df1.corrwith(df2, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
        tm.assert_series_equal(result, expected)

        with pytest.raises(TypeError, match="unsupported operand type"):
            df1.corrwith(df2, axis=1)

        result = df1.corrwith(df2, axis=1, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
        tm.assert_series_equal(result, expected)

    def test_corrwith_series(self, datetime_frame):
        column_a = datetime_frame["A"]
        result = datetime_frame.corrwith(column_a)
        expected = datetime_frame.apply(datetime_frame["A"].corr)

        tm.assert_series_equal(result, expected)

    def test_corrwith_matches_corrcoef(self):
        df1 = DataFrame(np.arange(10000), columns=["a"])
        df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
        from_pandas = df1.corrwith(df2)["a"]
        from_numpy = np.corrcoef(df1["a"], df2["a"])[0][1]

        tm.assert_almost_equal(from_pandas, from_numpy)
        assert from_pandas < 1

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corrwith_mixed_dtypes(self, numeric_only):
        # GH#18570
        df = DataFrame(
            {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
        )
        s = Series([0, 6, 7, 3])
        if not numeric_only:
            with pytest.raises(
                ValueError,
                match="could not convert string to float",
            ):
                df.corrwith(s, numeric_only=numeric_only)
        else:
            result = df.corrwith(s, numeric_only=numeric_only)
            corrs = [df["a"].corr(s), df["b"].corr(s)]
            expected = Series(data=corrs, index=["a", "b"])
            tm.assert_series_equal(result, expected)

    def test_corrwith_index_intersection(self):
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        # drop=True keeps only the labels present in both frames
        result = df1.corrwith(df2, drop=True).index.sort_values()
        expected = df1.columns.intersection(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_index_union(self):
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        # drop=False keeps every label from either frame
        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_dup_cols(self):
        # GH#21925
        df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)
        expected = Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)

    def test_corr_numerical_instabilities(self):
        # GH#45640
        df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
        result = df.corr()
        expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
        tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)

    def test_corrwith_spearman(self):
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        squared = df**2
        result = df.corrwith(squared, method="spearman")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_kendall(self):
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        squared = df**2
        result = df.corrwith(squared, method="kendall")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_spearman_with_tied_data(self):
        # GH#48826
        pytest.importorskip("scipy")
        df1 = DataFrame(
            {
                "A": [1, np.nan, 7, 8],
                "B": [False, True, True, False],
                "C": [10, 4, 9, 3],
            }
        )
        df2 = df1[["B", "C"]]
        shifted = df1 + 1
        result = shifted.corrwith(df2.B, method="spearman")
        expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

        df_bool = DataFrame(
            {"A": [True, True, False, False], "B": [True, False, False, True]}
        )
        ser_bool = Series([True, True, False, True])
        result = df_bool.corrwith(ser_bool)
        expected = Series([0.57735, 0.57735], index=["A", "B"])
        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,417 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestDataFrameDescribe:
def test_describe_bool_in_mixed_frame(self):
df = DataFrame(
{
"string_data": ["a", "b", "c", "d", "e"],
"bool_data": [True, True, False, False, False],
"int_data": [10, 20, 30, 40, 50],
}
)
# Integer data are included in .describe() output,
# Boolean and string data are not.
result = df.describe()
expected = DataFrame(
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
# Top value is a boolean value that is False
result = df.describe(include=["bool"])
expected = DataFrame(
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
)
tm.assert_frame_equal(result, expected)
def test_describe_empty_object(self):
# GH#27183
df = DataFrame({"A": [None, None]}, dtype=object)
result = df.describe()
expected = DataFrame(
{"A": [0, 0, np.nan, np.nan]},
dtype=object,
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
result = df.iloc[:0].describe()
tm.assert_frame_equal(result, expected)
def test_describe_bool_frame(self):
# GH#13891
df = DataFrame(
{
"bool_data_1": [False, False, True, True],
"bool_data_2": [False, True, True, True],
}
)
result = df.describe()
expected = DataFrame(
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
df = DataFrame(
{
"bool_data": [False, False, True, True, False],
"int_data": [0, 1, 2, 3, 4],
}
)
result = df.describe()
expected = DataFrame(
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
df = DataFrame(
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
)
result = df.describe()
expected = DataFrame(
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
def test_describe_categorical(self):
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)
df = df.sort_values(by=["value"], ascending=True)
df["value_group"] = pd.cut(
df.value, range(0, 10500, 500), right=False, labels=cat_labels
)
cat = df
# Categoricals should not show up together with numerical columns
result = cat.describe()
assert len(result.columns) == 1
# In a frame, describe() for the cat should be the same as for string
# arrays (count, unique, top, freq)
cat = Categorical(
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
)
s = Series(cat)
result = s.describe()
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
tm.assert_series_equal(result, expected)
cat = Series(Categorical(["a", "b", "c", "c"]))
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
result = df3.describe()
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
def test_describe_empty_categorical_column(self):
# GH#26397
# Ensure the index of an empty categorical DataFrame column
# also contains (count, unique, top, freq)
df = DataFrame({"empty_col": Categorical([])})
result = df.describe()
expected = DataFrame(
{"empty_col": [0, 0, np.nan, np.nan]},
index=["count", "unique", "top", "freq"],
dtype="object",
)
tm.assert_frame_equal(result, expected)
# ensure NaN, not None
assert np.isnan(result.iloc[2, 0])
assert np.isnan(result.iloc[3, 0])
def test_describe_categorical_columns(self):
# GH#11558
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
df = DataFrame(
{
"int1": [10, 20, 30, 40, 50],
"int2": [10, 20, 30, 40, 50],
"obj": ["A", 0, None, "X", 1],
},
columns=columns,
)
result = df.describe()
exp_columns = pd.CategoricalIndex(
["int1", "int2"],
categories=["int1", "int2", "obj"],
ordered=True,
name="XXX",
)
expected = DataFrame(
{
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
columns=exp_columns,
)
tm.assert_frame_equal(result, expected)
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
def test_describe_datetime_columns(self):
columns = pd.DatetimeIndex(
["2011-01-01", "2011-02-01", "2011-03-01"],
freq="MS",
tz="US/Eastern",
name="XXX",
)
df = DataFrame(
{
0: [10, 20, 30, 40, 50],
1: [10, 20, 30, 40, 50],
2: ["A", 0, None, "X", 1],
}
)
df.columns = columns
result = df.describe()
exp_columns = pd.DatetimeIndex(
["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
)
expected = DataFrame(
{
0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
expected.columns = exp_columns
tm.assert_frame_equal(result, expected)
assert result.columns.freq == "MS"
assert result.columns.tz == expected.columns.tz
def test_describe_timedelta_values(self):
# GH#6145
t1 = pd.timedelta_range("1 days", freq="D", periods=5)
t2 = pd.timedelta_range("1 hours", freq="h", periods=5)
df = DataFrame({"t1": t1, "t2": t2})
expected = DataFrame(
{
"t1": [
5,
pd.Timedelta("3 days"),
df.iloc[:, 0].std(),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
pd.Timedelta("4 days"),
pd.Timedelta("5 days"),
],
"t2": [
5,
pd.Timedelta("3 hours"),
df.iloc[:, 1].std(),
pd.Timedelta("1 hours"),
pd.Timedelta("2 hours"),
pd.Timedelta("3 hours"),
pd.Timedelta("4 hours"),
pd.Timedelta("5 hours"),
],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
result = df.describe()
tm.assert_frame_equal(result, expected)
exp_repr = (
" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00"
)
assert repr(result) == exp_repr
def test_describe_tz_values(self, tz_naive_fixture):
# GH#21332
tz = tz_naive_fixture
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = DataFrame({"s1": s1, "s2": s2})
expected = DataFrame(
{
"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
"s2": [
5,
Timestamp(2018, 1, 3).tz_localize(tz),
start.tz_localize(tz),
s2[1],
s2[2],
s2[3],
end.tz_localize(tz),
np.nan,
],
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)
def test_datetime_is_numeric_includes_datetime(self):
df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
result = df.describe()
expected = DataFrame(
{
"a": [
3,
Timestamp("2012-01-02"),
Timestamp("2012-01-01"),
Timestamp("2012-01-01T12:00:00"),
Timestamp("2012-01-02"),
Timestamp("2012-01-02T12:00:00"),
Timestamp("2012-01-03"),
np.nan,
],
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
tm.assert_frame_equal(result, expected)
def test_describe_tz_values2(self):
tz = "CET"
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = DataFrame({"s1": s1, "s2": s2})
s1_ = s1.describe()
s2_ = s2.describe()
idx = [
"count",
"mean",
"min",
"25%",
"50%",
"75%",
"max",
"std",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
idx, copy=False
)
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)
def test_describe_percentiles_integer_idx(self):
# GH#26660
df = DataFrame({"x": [1]})
pct = np.linspace(0, 1, 10 + 1)
result = df.describe(percentiles=pct)
expected = DataFrame(
{"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]},
index=[
"count",
"mean",
"std",
"min",
"0%",
"10%",
"20%",
"30%",
"40%",
"50%",
"60%",
"70%",
"80%",
"90%",
"100%",
"max",
],
)
tm.assert_frame_equal(result, expected)
def test_describe_does_not_raise_error_for_dictlike_elements(self):
# GH#32409
df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
expected = DataFrame(
{"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
)
result = df.describe()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
def test_describe_when_include_all_exclude_not_allowed(self, exclude):
    """
    Passing any ``exclude`` together with ``include="all"`` raises ValueError.
    """
    frame = DataFrame({"x": [1], "y": [2], "z": [3]})
    expected_message = "exclude must be None when include is 'all'"
    with pytest.raises(ValueError, match=expected_message):
        frame.describe(include="all", exclude=exclude)
def test_describe_with_duplicate_columns(self):
df = DataFrame(
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=["bar", "a", "a"],
dtype="float64",
)
result = df.describe()
ser = df.iloc[:, 0].describe()
expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
tm.assert_frame_equal(result, expected)
def test_ea_with_na(self, any_numeric_ea_dtype):
# GH#48778
df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
result = df.describe()
expected = DataFrame(
{"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
dtype="Float64",
)
tm.assert_frame_equal(result, expected)
def test_describe_exclude_pa_dtype(self):
    """``include``/``exclude`` accept pyarrow-backed dtypes (GH#52570).

    Only the int8 column "a" is expected in the result; the test is skipped
    when pyarrow cannot be imported.
    """
    pa = pytest.importorskip("pyarrow")
    int8_dtype = pd.ArrowDtype(pa.int8())
    int16_dtype = pd.ArrowDtype(pa.int16())
    int32_dtype = pd.ArrowDtype(pa.int32())

    frame = DataFrame(
        {
            "a": Series([1, 2, 3], dtype=int8_dtype),
            "b": Series([1, 2, 3], dtype=int16_dtype),
            "c": Series([1, 2, 3], dtype=int32_dtype),
        }
    )
    summary = frame.describe(include=int8_dtype, exclude=int32_dtype)

    expected = DataFrame(
        {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype=pd.ArrowDtype(pa.float64()),
    )
    tm.assert_frame_equal(summary, expected)

View File

@ -0,0 +1,308 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestDataFrameDiff:
    """Tests for ``DataFrame.diff``."""

    def test_diff_requires_integer(self):
        """A fractional ``periods`` value is rejected with ValueError."""
        frame = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
        with pytest.raises(ValueError, match="periods must be an integer"):
            frame.diff(1.5)

    # GH#44572 np.int64 is accepted
    @pytest.mark.parametrize("num", [1, np.int64(1)])
    def test_diff(self, datetime_frame, num):
        """``diff(num)`` on column "A" equals the column minus its shifted self."""
        frame = datetime_frame
        differenced = frame.diff(num)
        by_hand = frame["A"] - frame["A"].shift(num)
        tm.assert_series_equal(differenced["A"], by_hand)

    def test_diff_int_dtype(self):
        """Two large integers that differ by one produce a difference of 1."""
        low = 10_000_000_000_000_000
        high = low + 1
        differenced = DataFrame({"s": Series([low, high])}).diff()
        assert differenced["s"][1] == 1

    def test_diff_mixed_numeric(self, datetime_frame):
        """``diff`` on a float32 frame matches subtracting the shifted column."""
        float32_frame = datetime_frame.astype("float32")
        differenced = float32_frame.diff(1)
        column = float32_frame["A"]
        tm.assert_series_equal(differenced["A"], column - column.shift(1))

    def test_diff_axis1_nonconsolidated(self):
        """``diff(axis=1)`` after inserting a column into an existing frame (GH#10907)."""
        frame = DataFrame({"y": Series([2]), "z": Series([3])})
        frame.insert(0, "x", 1)

        result = frame.diff(axis=1)

        expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
        tm.assert_frame_equal(result, expected)

    def test_diff_timedelta64_with_nat(self):
        """``diff`` on timedelta64 data whose first column is all NaT (GH#32441)."""
        values = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
        values[:, 0] = np.timedelta64("NaT", "ns")
        frame = DataFrame(values)

        # one period along the rows
        row_diff = frame.diff(1, axis=0)
        expected = DataFrame(
            {0: frame[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]}
        )
        tm.assert_equal(row_diff, expected)

        # zero periods: same as subtracting the frame from itself
        zero_diff = frame.diff(0)
        expected = frame - frame
        assert expected[0].isna().all()
        tm.assert_equal(zero_diff, expected)

        # negative period along the columns
        column_diff = frame.diff(-1, axis=1)
        expected = frame * np.nan
        tm.assert_equal(column_diff, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis0_with_nat(self, tz, unit):
        """Row-wise ``diff`` of a datetime column that starts with NaT (GH#32441)."""
        stamps = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz)
        frame = Series(stamps.as_unit(unit)).to_frame()

        result = frame.diff()

        deltas = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)])
        expected = Series(deltas.as_unit(unit)).to_frame()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_with_nat_zero_periods(self, tz):
        """``diff(0)`` on NaT values should give NaT, not timedelta64(0)."""
        stamps = Series(date_range("2016-01-01", periods=4, tz=tz))

        frame = stamps.to_frame().copy()
        frame[1] = stamps.copy()
        frame.iloc[:, 0] = pd.NaT

        expected = frame - frame
        assert expected[0].isna().all()

        # the same expectation holds along both axes
        for axis in (0, 1):
            result = frame.diff(0, axis=axis)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis0(self, tz):
        """Row-wise ``diff`` of two identical daily datetime columns (GH#18578)."""
        frame = DataFrame(
            {
                column: date_range("2010", freq="D", periods=2, tz=tz)
                for column in (0, 1)
            }
        )

        result = frame.diff(axis=0)

        expected = DataFrame(
            {column: pd.TimedeltaIndex(["NaT", "1 days"]) for column in (0, 1)}
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis1(self, tz):
        """Column-wise ``diff`` of two identical daily datetime columns (GH#18578)."""
        frame = DataFrame(
            {
                column: date_range("2010", freq="D", periods=2, tz=tz)
                for column in (0, 1)
            }
        )

        result = frame.diff(axis=1)

        first_column = pd.TimedeltaIndex(["NaT", "NaT"])
        second_column = pd.TimedeltaIndex(["0 days", "0 days"])
        expected = DataFrame({0: first_column, 1: second_column})
        tm.assert_frame_equal(result, expected)

    def test_diff_timedelta(self, unit):
        """``diff`` of a frame mixing a datetime and a float column (GH#4533)."""
        frame = DataFrame(
            {
                "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
                "value": [1.0, 2.0],
            }
        )
        frame["time"] = frame["time"].dt.as_unit(unit)

        result = frame.diff()

        expected = DataFrame(
            [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]],
            columns=["time", "value"],
        )
        expected["time"] = expected["time"].dt.as_unit(unit)
        tm.assert_frame_equal(result, expected)

    def test_diff_mixed_dtype(self):
        """A float column keeps float64 dtype when an object column is present."""
        frame = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
        frame["A"] = np.array([1, 2, 3, 4, 5], dtype=object)

        differenced = frame.diff()
        assert differenced[0].dtype == np.float64

    def test_diff_neg_n(self, datetime_frame):
        """``diff(-1)`` equals the frame minus itself shifted by -1."""
        result = datetime_frame.diff(-1)
        expected = datetime_frame - datetime_frame.shift(-1)
        tm.assert_frame_equal(result, expected)

    def test_diff_float_n(self, datetime_frame):
        """The float ``1.0`` is treated the same as the integer ``1``."""
        result = datetime_frame.diff(1.0)
        expected = datetime_frame.diff(1)
        tm.assert_frame_equal(result, expected)

    def test_diff_axis(self):
        """``axis`` selects whether differences run along columns or rows (GH#9727)."""
        frame = DataFrame([[1.0, 2.0], [3.0, 4.0]])
        along_columns = DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
        along_rows = DataFrame([[np.nan, np.nan], [2.0, 2.0]])

        tm.assert_frame_equal(frame.diff(axis=1), along_columns)
        tm.assert_frame_equal(frame.diff(axis=0), along_rows)

    def test_diff_period(self):
        """``diff(axis=1)`` on a single Period column (GH#32995)."""
        # GH#32995 Don't pass an incorrect axis
        periods = date_range("2016-01-01", periods=3).to_period("D")
        frame = DataFrame({"A": periods})

        result = frame.diff(1, axis=1)

        minus_nat = frame - pd.NaT
        tm.assert_frame_equal(result, minus_nat.astype(object))

    def test_diff_axis1_mixed_dtypes(self):
        """Column-wise ``diff`` across columns of different dtypes."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": frame["B"] / 2})
        tm.assert_frame_equal(frame.diff(axis=1), expected)

        # GH#21437 mixed-float-dtypes
        frame = DataFrame(
            {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
        )
        result = frame.diff(axis=1)
        expected = DataFrame({"a": frame["a"] * np.nan, "b": frame["b"] * 0})
        tm.assert_frame_equal(result, expected)

    def test_diff_axis1_mixed_dtypes_large_periods(self):
        """``periods`` larger than the column count yields an all-NaN frame."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        all_nan = frame * np.nan

        result = frame.diff(axis=1, periods=3)
        tm.assert_frame_equal(result, all_nan)

    def test_diff_axis1_mixed_dtypes_negative_periods(self):
        """Negative ``periods`` with ``axis=1`` on mixed dtypes."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        expected = DataFrame({"A": -1.0 * frame["A"], "B": frame["B"] * np.nan})

        result = frame.diff(axis=1, periods=-1)
        tm.assert_frame_equal(result, expected)

    def test_diff_sparse(self):
        """``diff`` works on a sparse frame (GH#28813)."""
        # GH#28813 .diff() should work for sparse dataframes as well
        sparse_frame = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")

        result = sparse_frame.diff()

        expected = DataFrame(
            [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0)
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "axis,expected",
        [
            (
                0,
                DataFrame(
                    {
                        "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
                        "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
                        "c": np.repeat(np.nan, 8),
                        "d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
                    },
                    dtype="Int64",
                ),
            ),
            (
                1,
                DataFrame(
                    {
                        "a": np.repeat(np.nan, 8),
                        "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
                        "c": np.repeat(np.nan, 8),
                        "d": np.repeat(np.nan, 8),
                    },
                    dtype="Int64",
                ),
            ),
        ],
    )
    def test_diff_integer_na(self, axis, expected):
        """``diff`` on nullable ``Int64`` columns along either axis (GH#24171)."""
        # GH#24171 IntegerNA Support for DataFrame.diff()
        base = [0, 1, np.nan, 2]
        frame = DataFrame(
            {
                "a": np.repeat(base, 2),
                "b": np.tile(base, 2),
                "c": np.repeat(np.nan, 8),
                "d": np.arange(1, 9) ** 2,
            },
            dtype="Int64",
        )

        # Test case for default behaviour of diff
        tm.assert_frame_equal(frame.diff(axis=axis), expected)

    def test_diff_readonly(self):
        """``diff`` on a frame built from a read-only array."""
        # https://github.com/pandas-dev/pandas/issues/35559
        values = np.random.default_rng(2).standard_normal((5, 2))
        values.flags.writeable = False
        frame = DataFrame(values)

        result = frame.diff()

        # np.array(frame) yields a fresh array to build the comparison frame
        expected = DataFrame(np.array(frame)).diff()
        tm.assert_frame_equal(result, expected)

    def test_diff_all_int_dtype(self, any_int_numpy_dtype):
        """Result dtype is float32 for int8/int16 input, float64 otherwise (GH 14773)."""
        frame = DataFrame(range(5)).astype(any_int_numpy_dtype)

        result = frame.diff()

        if any_int_numpy_dtype in ("int8", "int16"):
            expected_dtype = "float32"
        else:
            expected_dtype = "float64"
        expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype)
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,155 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class DotSharedTests:
    """Shared ``.dot`` tests; subclasses override the fixtures and the assertion helper."""

    @pytest.fixture
    def obj(self):
        """The object whose ``.dot`` method is under test (overridden by subclasses)."""
        raise NotImplementedError

    @pytest.fixture
    def other(self) -> DataFrame:
        """
        A DataFrame indexed so that ``obj.dot(other)`` is valid.
        """
        raise NotImplementedError

    @pytest.fixture
    def expected(self, obj, other) -> DataFrame:
        """
        The result that ``obj.dot(other)`` is expected to produce.
        """
        raise NotImplementedError

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion about results with one fewer dimension than ``obj``.
        """
        raise NotImplementedError

    def test_dot_equiv_values_dot(self, obj, other, expected):
        """``obj.dot(other)`` equals the ``expected`` fixture."""
        # `expected` is constructed from obj.values.dot(other.values)
        tm.assert_equal(obj.dot(other), expected)

    def test_dot_2d_ndarray(self, obj, other, expected):
        """A 2-D ndarray argument gives matching values."""
        # Only values are compared here; index/columns may not match
        product = obj.dot(other.values)
        assert np.all(product == expected.values)

    def test_dot_1d_ndarray(self, obj, expected):
        """A correct-length 1-D ndarray gives the same result as the Series it came from."""
        if obj.ndim == 2:
            vector = obj.iloc[0]
        else:
            vector = obj

        from_array = obj.dot(vector.values)
        from_series = obj.dot(vector)
        self.reduced_dim_assert(from_array, from_series)

    def test_dot_series(self, obj, other, expected):
        """A Series argument yields the matching column of ``expected``."""
        product = obj.dot(other["1"])
        self.reduced_dim_assert(product, expected["1"])

    def test_dot_series_alignment(self, obj, other, expected):
        """A row-reversed Series argument yields the same result."""
        reversed_column = other.iloc[::-1]["1"]
        product = obj.dot(reversed_column)
        self.reduced_dim_assert(product, expected["1"])

    def test_dot_aligns(self, obj, other, expected):
        """A row-reversed ``other`` yields the same result (index alignment)."""
        reversed_other = other.iloc[::-1]
        tm.assert_equal(obj.dot(reversed_other), expected)

    def test_dot_shape_mismatch(self, obj):
        """An ndarray built from the first three entries of ``obj.values`` is rejected."""
        expected_message = "Dot product shape mismatch"
        # exception raised is of type Exception
        with pytest.raises(Exception, match=expected_message):
            obj.dot(obj.values[:3])

    def test_dot_misaligned(self, obj, other):
        """The transposed ``other`` does not align and raises ValueError."""
        expected_message = "matrices are not aligned"
        with pytest.raises(ValueError, match=expected_message):
            obj.dot(other.T)
class TestSeriesDot(DotSharedTests):
    """Runs the shared ``.dot`` tests with a Series as ``obj``."""

    @pytest.fixture
    def obj(self):
        """A random length-4 Series indexed by "p", "q", "r", "s"."""
        values = np.random.default_rng(2).standard_normal(4)
        return Series(values, index=["p", "q", "r", "s"])

    @pytest.fixture
    def other(self):
        """A random frame with rows "p".."s" and columns "1".."3" (built transposed)."""
        wide = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=["1", "2", "3"],
            columns=["p", "q", "r", "s"],
        )
        return wide.T

    @pytest.fixture
    def expected(self, obj, other):
        """The numpy dot product of the raw values, labelled by ``other``'s columns."""
        product = np.dot(obj.values, other.values)
        return Series(product, index=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion about results with one fewer dimension than ``obj``.
        """
        tm.assert_almost_equal(result, expected)
class TestDataFrameDot(DotSharedTests):
    """Runs the shared ``.dot`` tests with a DataFrame as ``obj``."""

    @pytest.fixture
    def obj(self):
        """A random 3x4 frame with rows "a".."c" and columns "p".."s"."""
        values = np.random.default_rng(2).standard_normal((3, 4))
        return DataFrame(values, index=["a", "b", "c"], columns=["p", "q", "r", "s"])

    @pytest.fixture
    def other(self):
        """A random 4x2 frame with rows "p".."s" and columns "1", "2"."""
        values = np.random.default_rng(2).standard_normal((4, 2))
        return DataFrame(values, index=["p", "q", "r", "s"], columns=["1", "2"])

    @pytest.fixture
    def expected(self, obj, other):
        """The numpy dot product, labelled with ``obj``'s rows and ``other``'s columns."""
        product = np.dot(obj.values, other.values)
        return DataFrame(product, index=obj.index, columns=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion about results with one fewer dimension than ``obj``.
        """
        # names are not compared; the result itself must be unnamed
        tm.assert_series_equal(result, expected, check_names=False)
        assert result.name is None
@pytest.mark.parametrize(
    "dtype,exp_dtype",
    [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")],
)
def test_arrow_dtype(dtype, exp_dtype):
    """Dot of an int32 frame with an identity-like frame of ``dtype`` yields ``exp_dtype``."""
    pytest.importorskip("pyarrow")

    labels = ["a", "b"]
    rows = [[1, 2], [3, 4], [5, 6]]
    left = DataFrame(rows, columns=labels, dtype="int32")
    right = DataFrame([[1, 0], [0, 1]], index=labels, dtype=dtype)

    product = left.dot(right)

    expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype)
    tm.assert_frame_equal(product, expected)

View File

@ -0,0 +1,546 @@
import re
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "msg,labels,level",
    [
        (r"labels \[4\] not found in level", 4, "a"),
        (r"labels \[7\] not found in level", 7, "b"),
    ],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    """Dropping a label absent from the named level raises KeyError (GH 8594)."""
    mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    values = [10, 20, 30]

    # the same failure is expected from a Series and from a DataFrame
    for container in (Series(values, index=mi), DataFrame(values, index=mi)):
        with pytest.raises(KeyError, match=msg):
            container.drop(labels, level=level)
@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
def test_drop_errors_ignore(labels, level):
    """With ``errors="ignore"``, dropping an absent label returns the data unchanged (GH 8594)."""
    mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    values = [10, 20, 30]

    ser = Series(values, index=mi)
    frame = DataFrame(values, index=mi)

    ser_after = ser.drop(labels, level=level, errors="ignore")
    tm.assert_series_equal(ser, ser_after)

    frame_after = frame.drop(labels, level=level, errors="ignore")
    tm.assert_frame_equal(frame, frame_after)
def test_drop_with_non_unique_datetime_index_and_invalid_keys():
    """Dropping absent labels from a non-unique datetime index raises KeyError (GH 30399)."""
    # start from a frame with a unique hourly datetime index
    unique_frame = DataFrame(
        np.random.default_rng(2).standard_normal((5, 3)),
        columns=["a", "b", "c"],
        index=pd.date_range("2012", freq="h", periods=5),
    )

    # repeating position 2 makes the index non-unique
    frame = unique_frame.iloc[[0, 2, 2, 3]].copy()

    with pytest.raises(KeyError, match="not found in axis"):
        # "a" and "b" are column names, not labels present in the index
        frame.drop(["a", "b"])
class TestDataFrameDrop:
def test_drop_names(self):
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
df.index.name, df.columns.name = "first", "second"
df_dropped_b = df.drop("b")
df_dropped_e = df.drop("e", axis=1)
df_inplace_b, df_inplace_e = df.copy(), df.copy()
return_value = df_inplace_b.drop("b", inplace=True)
assert return_value is None
return_value = df_inplace_e.drop("e", axis=1, inplace=True)
assert return_value is None
for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
assert obj.index.name == "first"
assert obj.columns.name == "second"
assert list(df.columns) == ["d", "e", "f"]
msg = r"\['g'\] not found in axis"
with pytest.raises(KeyError, match=msg):
df.drop(["g"])
with pytest.raises(KeyError, match=msg):
df.drop(["g"], axis=1)
# errors = 'ignore'
dropped = df.drop(["g"], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["b", "g"], errors="ignore")
expected = Index(["a", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["g"], axis=1, errors="ignore")
expected = Index(["d", "e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
dropped = df.drop(["d", "g"], axis=1, errors="ignore")
expected = Index(["e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
# GH 16398
dropped = df.drop([], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
def test_drop(self):
simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop(5)
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop("C", axis=1)
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop([1, 5])
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop(["A", "C"], axis=1)
# GH 42881
with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"):
simple.drop(["C", "D", "F"], axis=1)
# errors = 'ignore'
tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
)
tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
)
# non-unique - wheee!
nu_df = DataFrame(
list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
)
tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
nu_df.columns = list("abc")
tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
# inplace cache issue
# GH#5628
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
)
expected = df[~(df.b > 0)]
return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
assert return_value is None
tm.assert_frame_equal(df, expected)
def test_drop_multiindex_not_lexsorted(self):
# GH#11640
# define the lexsorted version
lexsorted_mi = MultiIndex.from_tuples(
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
)
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
assert lexsorted_df.columns._is_lexsorted()
# define the non-lexsorted version
not_lexsorted_df = DataFrame(
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
)
not_lexsorted_df = not_lexsorted_df.pivot_table(
index="a", columns=["b", "c"], values="d"
)
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns._is_lexsorted()
expected = lexsorted_df.drop("a", axis=1).astype(float)
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.drop("a", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_api_equivalence(self):
# equivalence of the labels/axis and index/columns API's (GH#12392)
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
res1 = df.drop("a")
res2 = df.drop(index="a")
tm.assert_frame_equal(res1, res2)
res1 = df.drop("d", axis=1)
res2 = df.drop(columns="d")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(labels="e", axis=1)
res2 = df.drop(columns="e")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0)
res2 = df.drop(index=["a"])
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
res2 = df.drop(index=["a"], columns=["d"])
tm.assert_frame_equal(res1, res2)
msg = "Cannot specify both 'labels' and 'index'/'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", index="b")
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", columns="b")
msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(axis=1)
data = [[1, 2, 3], [1, 2, 3]]
@pytest.mark.parametrize(
"actual",
[
DataFrame(data=data, index=["a", "a"]),
DataFrame(data=data, index=["a", "b"]),
DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
],
)
def test_raise_on_drop_duplicate_index(self, actual):
# GH#19186
level = 0 if isinstance(actual.index, MultiIndex) else None
msg = re.escape("\"['c'] not found in axis\"")
with pytest.raises(KeyError, match=msg):
actual.drop("c", level=level, axis=0)
with pytest.raises(KeyError, match=msg):
actual.T.drop("c", level=level, axis=1)
expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err, actual)
expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err.T, actual)
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
@pytest.mark.parametrize("drop_labels", [[], [1], [2]])
def test_drop_empty_list(self, index, drop_labels):
# GH#21494
expected_index = [i for i in index if i not in drop_labels]
frame = DataFrame(index=index).drop(drop_labels)
tm.assert_frame_equal(frame, DataFrame(index=expected_index))
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
@pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
def test_drop_non_empty_list(self, index, drop_labels):
# GH# 21494
with pytest.raises(KeyError, match="not found in axis"):
DataFrame(index=index).drop(drop_labels)
@pytest.mark.parametrize(
"empty_listlike",
[
[],
{},
np.array([]),
Series([], dtype="datetime64[ns]"),
Index([]),
DatetimeIndex([]),
],
)
def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike):
# GH#27994
data = {"column_a": [5, 10], "column_b": ["one", "two"]}
index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")]
df = DataFrame(data, index=index)
# Passing empty list-like should return the same DataFrame.
expected = df.copy()
result = df.drop(empty_listlike)
tm.assert_frame_equal(result, expected)
def test_mixed_depth_drop(self):
arrays = [
["a", "top", "top", "routine1", "routine1", "routine2"],
["", "OD", "OD", "result1", "result2", "result1"],
["", "wx", "wy", "", "", ""],
]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
result = df.drop("a", axis=1)
expected = df.drop([("a", "", "")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(["top"], axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
expected = expected.drop([("top", "OD", "wy")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(("top", "OD", "wx"), axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
tm.assert_frame_equal(expected, result)
expected = df.drop([("top", "OD", "wy")], axis=1)
expected = df.drop("top", axis=1)
result = df.drop("result1", level=1, axis=1)
expected = df.drop(
[("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
)
tm.assert_frame_equal(expected, result)
def test_drop_multiindex_other_level_nan(self):
# GH#12754
df = (
DataFrame(
{
"A": ["one", "one", "two", "two"],
"B": [np.nan, 0.0, 1.0, 2.0],
"C": ["a", "b", "c", "c"],
"D": [1, 2, 3, 4],
}
)
.set_index(["A", "B", "C"])
.sort_index()
)
result = df.drop("c", level="C")
expected = DataFrame(
[2, 1],
columns=["D"],
index=MultiIndex.from_tuples(
[("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
),
)
tm.assert_frame_equal(result, expected)
def test_drop_nonunique(self):
df = DataFrame(
[
["x-a", "x", "a", 1.5],
["x-a", "x", "a", 1.2],
["z-c", "z", "c", 3.1],
["x-a", "x", "a", 4.1],
["x-b", "x", "b", 5.1],
["x-b", "x", "b", 4.1],
["x-b", "x", "b", 2.2],
["y-a", "y", "a", 1.2],
["z-b", "z", "b", 2.1],
],
columns=["var1", "var2", "var3", "var4"],
)
grp_size = df.groupby("var1").size()
drop_idx = grp_size.loc[grp_size == 1]
idf = df.set_index(["var1", "var2", "var3"])
# it works! GH#2101
result = idf.drop(drop_idx.index, level=0).reset_index()
expected = df[-df.var1.isin(drop_idx.index)]
result.index = expected.index
tm.assert_frame_equal(result, expected)
def test_drop_level(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
result = frame.drop(["bar", "qux"], level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]]
tm.assert_frame_equal(result, expected)
result = frame.drop(["two"], level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["bar", "qux"], axis=1, level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]].T
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["two"], axis=1, level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
tm.assert_frame_equal(result, expected)
def test_drop_level_nonunique_datetime(self):
# GH#12701
idx = Index([2, 3, 4, 4, 5], name="id")
idxdt = pd.to_datetime(
[
"2016-03-23 14:00",
"2016-03-23 15:00",
"2016-03-23 16:00",
"2016-03-23 16:00",
"2016-03-23 17:00",
]
)
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
df["tstamp"] = idxdt
df = df.set_index("tstamp", append=True)
ts = Timestamp("201603231600")
assert df.index.is_unique is False
result = df.drop(ts, level="tstamp")
expected = df.loc[idx != 4]
tm.assert_frame_equal(result, expected)
def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series):
# GH#21761
start = Timestamp("2017-10-29", tz="Europe/Berlin")
end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
index = pd.date_range(start, end, freq="15min")
data = frame_or_series(data=[1] * len(index), index=index)
result = data.drop(start)
expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
expected_idx = pd.date_range(expected_start, end, freq="15min")
expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx)
tm.assert_equal(result, expected)
def test_drop_preserve_names(self):
index = MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
)
df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index)
result = df.drop([(0, 2)])
assert result.index.names == ("one", "two")
@pytest.mark.parametrize(
"operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
)
@pytest.mark.parametrize("inplace", [False, True])
def test_inplace_drop_and_operation(self, operation, inplace):
# GH#30484
df = DataFrame({"x": range(5)})
expected = df.copy()
df["y"] = range(5)
y = df["y"]
with tm.assert_produces_warning(None):
if inplace:
df.drop("y", axis=1, inplace=inplace)
else:
df = df.drop("y", axis=1, inplace=inplace)
# Perform operation and check result
getattr(y, operation)(1)
tm.assert_frame_equal(df, expected)
def test_drop_with_non_unique_multiindex(self):
# GH#36293
mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
df = DataFrame([1, 2, 3], index=mi)
result = df.drop(index="x")
expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]])
def test_drop_tuple_with_non_unique_multiindex(self, indexer):
# GH#42771
idx = MultiIndex.from_product([["a", "b"], ["a", "a"]])
df = DataFrame({"x": range(len(idx))}, index=idx)
result = df.drop(index=[("a", "a")])
expected = DataFrame(
{"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")])
)
tm.assert_frame_equal(result, expected)
def test_drop_with_duplicate_columns(self):
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
result = df.drop(["a"], axis=1)
expected = DataFrame([[1], [1], [1]], columns=["bar"])
tm.assert_frame_equal(result, expected)
result = df.drop("a", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_with_duplicate_columns2(self):
# drop buggy GH#6240
df = DataFrame(
{
"A": np.random.default_rng(2).standard_normal(5),
"B": np.random.default_rng(2).standard_normal(5),
"C": np.random.default_rng(2).standard_normal(5),
"D": ["a", "b", "c", "d", "e"],
}
)
expected = df.take([0, 1, 1], axis=1)
df2 = df.take([2, 0, 1, 2, 1], axis=1)
result = df2.drop("C", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_inplace_no_leftover_column_reference(self):
# GH 13934
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
a = df.a
df.drop(["a"], axis=1, inplace=True)
tm.assert_index_equal(df.columns, Index([], dtype="object"))
a -= a.mean()
tm.assert_index_equal(df.columns, Index([], dtype="object"))
def test_drop_level_missing_label_multiindex(self):
    """Dropping a label absent from the requested level raises ``KeyError``."""
    # GH 18561
    product_index = MultiIndex.from_product([range(3), range(3)])
    frame = DataFrame(index=product_index)
    with pytest.raises(KeyError, match="labels \\[5\\] not found in level"):
        frame.drop(5, level=0)
@pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)])
def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level):
    """Drop labels (including ``pd.NA``) from an index backed by an EA dtype."""
    # GH#45860

    def indexed_frame(a_values):
        # Build a frame of the parametrized dtype and index it by ``idx``.
        frame = DataFrame({"a": a_values, "b": 100}, dtype=any_numeric_ea_dtype)
        return frame.set_index(idx)

    result = indexed_frame([1, 2, 2, pd.NA]).drop(Index([2, pd.NA]), level=level)
    tm.assert_frame_equal(result, indexed_frame([1]))
def test_drop_parse_strings_datetime_index(self):
# GH #5355
df = DataFrame(
{"a": [1, 2], "b": [1, 2]},
index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")],
)
result = df.drop("2000-01-03", axis=0)
expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,473 @@
from datetime import datetime
import re
import numpy as np
import pytest
from pandas import (
DataFrame,
NaT,
concat,
)
import pandas._testing as tm
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_drop_duplicates_with_misspelled_column_name(subset):
    """A subset naming a non-existent column ("a" vs "A") raises ``KeyError``."""
    # GH 19730
    frame = DataFrame({name: [0, 0, 1] for name in ("A", "B", "C")})
    expected_message = re.escape("Index(['a'], dtype=")

    with pytest.raises(KeyError, match=expected_message):
        frame.drop_duplicates(subset)
def test_drop_duplicates():
    """Exercise ``drop_duplicates`` over subsets, ``keep`` modes and int dtypes."""
    frame = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(frame.drop_duplicates("AAA"), frame[:2])
    tm.assert_frame_equal(
        frame.drop_duplicates("AAA", keep="last"), frame.loc[[6, 7]]
    )
    nothing_left = frame.drop_duplicates("AAA", keep=False)
    tm.assert_frame_equal(nothing_left, frame.loc[[]])
    assert len(nothing_left) == 0

    # multi column: the subset may be an ndarray, a list or a tuple
    first_of_each_pair = frame.loc[[0, 1, 2, 3]]
    for pair_subset in (np.array(["AAA", "B"]), ["AAA", "B"]):
        tm.assert_frame_equal(frame.drop_duplicates(pair_subset), first_of_each_pair)
    tm.assert_frame_equal(
        frame.drop_duplicates(("AAA", "B"), keep="last"), frame.loc[[0, 5, 6, 7]]
    )
    tm.assert_frame_equal(
        frame.drop_duplicates(("AAA", "B"), keep=False), frame.loc[[0]]
    )

    # consider everything
    # in this case only: all three columns give the same answer as AAA + B
    trimmed = frame.loc[:, ["AAA", "B", "C"]]
    for keep_mode in ("first", "last", False):
        tm.assert_frame_equal(
            trimmed.drop_duplicates(keep=keep_mode),
            trimmed.drop_duplicates(["AAA", "B"], keep=keep_mode),
        )

    # integers
    tm.assert_frame_equal(frame.drop_duplicates("C"), frame.iloc[[0, 2]])
    tm.assert_frame_equal(
        frame.drop_duplicates("C", keep="last"), frame.iloc[[-2, -1]]
    )

    frame["E"] = frame["C"].astype("int8")
    tm.assert_frame_equal(frame.drop_duplicates("E"), frame.iloc[[0, 2]])
    tm.assert_frame_equal(
        frame.drop_duplicates("E", keep="last"), frame.iloc[[-2, -1]]
    )

    # GH 11376
    points = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    tm.assert_frame_equal(points.drop_duplicates(), points.loc[points.index != 3])

    # Frames without duplicate rows must come back unchanged.
    big = np.iinfo(np.int64).max / 3 * 2
    duplicate_free_rows = (
        [[1, 0], [0, 2]],
        [[-2, 0], [0, -4]],
        [[-big, big], [0, big + 4]],
        [[-big, big], [big, big + 4]],
    )
    for rows in duplicate_free_rows:
        unique_frame = DataFrame(rows)
        tm.assert_frame_equal(unique_frame.drop_duplicates(), unique_frame)

    # GH 11864
    wide = DataFrame([i] * 9 for i in range(16))
    wide = concat([wide, DataFrame([[1] + [0] * 8])], ignore_index=True)
    for keep_mode in ["first", "last", False]:
        assert wide.duplicated(keep=keep_mode).sum() == 0
def test_drop_duplicates_with_duplicate_column_names():
    """``drop_duplicates`` works when a column label is itself duplicated."""
    # GH17836
    frame = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])

    # Across all columns every row is distinct.
    tm.assert_frame_equal(frame.drop_duplicates(), frame)

    # Restricted to the two "a" columns, the last row repeats the second.
    tm.assert_frame_equal(frame.drop_duplicates("a"), frame[:2])
def test_drop_duplicates_for_take_all():
    """Check which row positions survive for each subset / ``keep`` combination."""
    frame = DataFrame(
        {
            "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # (subset, keep, positions expected to remain)
    cases = [
        # single column
        ("AAA", "first", [0, 1, 2, 6]),
        ("AAA", "last", [2, 5, 6, 7]),
        ("AAA", False, [2, 6]),
        # multiple columns
        (["AAA", "B"], "first", [0, 1, 2, 3, 4, 6]),
        (["AAA", "B"], "last", [0, 1, 2, 5, 6, 7]),
        (["AAA", "B"], False, [0, 1, 2, 6]),
    ]
    for subset, keep_mode, positions in cases:
        tm.assert_frame_equal(
            frame.drop_duplicates(subset, keep=keep_mode),
            frame.iloc[positions],
        )
def test_drop_duplicates_tuple():
    """A tuple that is itself a column label is treated as a single column."""
    tuple_label = ("AA", "AB")
    frame = DataFrame(
        {
            tuple_label: ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(frame.drop_duplicates(tuple_label), frame[:2])
    tm.assert_frame_equal(
        frame.drop_duplicates(tuple_label, keep="last"), frame.loc[[6, 7]]
    )

    nothing_left = frame.drop_duplicates(tuple_label, keep=False)
    empty_expected = frame.loc[[]]  # empty df
    assert len(nothing_left) == 0
    tm.assert_frame_equal(nothing_left, empty_expected)

    # multi column
    tm.assert_frame_equal(
        frame.drop_duplicates((tuple_label, "B")),
        frame.loc[[0, 1, 2, 3]],
    )
@pytest.mark.parametrize(
    "df",
    [
        DataFrame(),
        DataFrame(columns=[]),
        DataFrame(columns=["A", "B", "C"]),
        DataFrame(index=[]),
        DataFrame(index=["A", "B", "C"]),
    ],
)
def test_drop_duplicates_empty(df):
    """Frames lacking rows and/or columns pass through ``drop_duplicates`` unchanged."""
    # GH 20516
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # Same expectation for the in-place variant, run on a copy.
    in_place = df.copy()
    in_place.drop_duplicates(inplace=True)
    tm.assert_frame_equal(in_place, df)
def test_drop_duplicates_NA():
    """``drop_duplicates`` treats repeated ``None`` / ``NaN`` values as duplicates."""
    b_labels = ["one", "one", "two", "two", "two", "two", "one", "two"]
    c_values = [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0]

    # none
    with_none = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": b_labels,
            "C": c_values,
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(with_none.drop_duplicates("A"), with_none.loc[[0, 2, 3]])
    tm.assert_frame_equal(
        with_none.drop_duplicates("A", keep="last"), with_none.loc[[1, 6, 7]]
    )
    nothing_left = with_none.drop_duplicates("A", keep=False)
    tm.assert_frame_equal(nothing_left, with_none.loc[[]])  # empty df
    assert len(nothing_left) == 0

    # multi column
    tm.assert_frame_equal(
        with_none.drop_duplicates(["A", "B"]), with_none.loc[[0, 2, 3, 6]]
    )
    tm.assert_frame_equal(
        with_none.drop_duplicates(["A", "B"], keep="last"),
        with_none.loc[[1, 5, 6, 7]],
    )
    tm.assert_frame_equal(
        with_none.drop_duplicates(["A", "B"], keep=False), with_none.loc[[6]]
    )

    # nan
    with_nan = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": b_labels,
            "C": c_values,
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(with_nan.drop_duplicates("C"), with_nan[:2])
    tm.assert_frame_equal(
        with_nan.drop_duplicates("C", keep="last"), with_nan.loc[[3, 7]]
    )
    nothing_left = with_nan.drop_duplicates("C", keep=False)
    tm.assert_frame_equal(nothing_left, with_nan.loc[[]])  # empty df
    assert len(nothing_left) == 0

    # multi column
    tm.assert_frame_equal(
        with_nan.drop_duplicates(["C", "B"]), with_nan.loc[[0, 1, 2, 4]]
    )
    tm.assert_frame_equal(
        with_nan.drop_duplicates(["C", "B"], keep="last"),
        with_nan.loc[[1, 3, 6, 7]],
    )
    tm.assert_frame_equal(
        with_nan.drop_duplicates(["C", "B"], keep=False), with_nan.loc[[1]]
    )
def test_drop_duplicates_NA_for_take_all():
    """Check surviving row positions when the subset column holds None / NaN."""
    frame = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
        }
    )

    # (single-column subset, keep, positions expected to remain)
    cases = [
        # none
        ("A", "first", [0, 2, 3, 5, 7]),
        ("A", "last", [1, 4, 5, 6, 7]),
        ("A", False, [5, 7]),
        # nan
        ("C", "first", [0, 1, 5, 6]),
        ("C", "last", [3, 5, 6, 7]),
        ("C", False, [5, 6]),
    ]
    for column, keep_mode, positions in cases:
        tm.assert_frame_equal(
            frame.drop_duplicates(column, keep=keep_mode),
            frame.iloc[positions],
        )
def test_drop_duplicates_inplace():
    """``inplace=True`` mutates the frame, returns ``None`` and matches the copy API."""
    orig = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    def dedupe_copy_inplace(source, *args, **kwargs):
        # De-duplicate a fresh copy in place; hand back the copy and the
        # method's return value so the caller can check both.
        target = source.copy()
        returned = target.drop_duplicates(*args, inplace=True, **kwargs)
        return target, returned

    # single column
    mutated, returned = dedupe_copy_inplace(orig, "A")
    tm.assert_frame_equal(mutated, orig[:2])
    assert returned is None

    mutated, returned = dedupe_copy_inplace(orig, "A", keep="last")
    tm.assert_frame_equal(mutated, orig.loc[[6, 7]])
    assert returned is None

    mutated, returned = dedupe_copy_inplace(orig, "A", keep=False)
    tm.assert_frame_equal(mutated, orig.loc[[]])
    assert len(mutated) == 0
    assert returned is None

    # multi column
    mutated, returned = dedupe_copy_inplace(orig, ["A", "B"])
    tm.assert_frame_equal(mutated, orig.loc[[0, 1, 2, 3]])
    assert returned is None

    mutated, returned = dedupe_copy_inplace(orig, ["A", "B"], keep="last")
    tm.assert_frame_equal(mutated, orig.loc[[0, 5, 6, 7]])
    assert returned is None

    mutated, returned = dedupe_copy_inplace(orig, ["A", "B"], keep=False)
    tm.assert_frame_equal(mutated, orig.loc[[0]])
    assert returned is None

    # consider everything
    # in this case only: all three columns give the same answer as A + B
    orig2 = orig.loc[:, ["A", "B", "C"]].copy()
    for keep_mode in ("first", "last", False):
        mutated, returned = dedupe_copy_inplace(orig2, keep=keep_mode)
        tm.assert_frame_equal(
            mutated, orig2.drop_duplicates(["A", "B"], keep=keep_mode)
        )
        assert returned is None
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
    "origin_dict, output_dict, ignore_index, output_index",
    [
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
    ],
)
def test_drop_duplicates_ignore_index(
    inplace, origin_dict, output_dict, ignore_index, output_index
):
    """``ignore_index`` controls whether surviving rows are relabelled from 0."""
    # GH 30114
    source = DataFrame(origin_dict)

    # In-place runs operate on a copy so ``source`` can be checked afterwards.
    target = source.copy() if inplace else source
    returned = target.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
    deduped = target if inplace else returned

    tm.assert_frame_equal(deduped, DataFrame(output_dict, index=output_index))
    tm.assert_frame_equal(source, DataFrame(origin_dict))
def test_drop_duplicates_null_in_object_column(nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/32992
df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object)
result = df.drop_duplicates()
tm.assert_frame_equal(result, df)
def test_drop_duplicates_series_vs_dataframe(keep):
# GH#14192
df = DataFrame(
{
"a": [1, 1, 1, "one", "one"],
"b": [2, 2, np.nan, np.nan, np.nan],
"c": [3, 3, np.nan, np.nan, "three"],
"d": [1, 2, 3, 4, 4],
"e": [
datetime(2015, 1, 1),
datetime(2015, 1, 1),
datetime(2015, 2, 1),
NaT,
NaT,
],
}
)
for column in df.columns:
dropped_frame = df[[column]].drop_duplicates(keep=keep)
dropped_series = df[column].drop_duplicates(keep=keep)
tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
def test_drop_duplicates_non_boolean_ignore_index(arg):
    """Non-bool ``ignore_index`` values are rejected with ``ValueError``."""
    # GH#38274
    frame = DataFrame({"a": [1, 2, 1, 3]})
    expected_message = (
        '^For argument "ignore_index" expected type bool, received type .*.$'
    )
    with pytest.raises(ValueError, match=expected_message):
        frame.drop_duplicates(ignore_index=arg)

View File

@ -0,0 +1,36 @@
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
class TestDropLevel:
def test_droplevel(self, frame_or_series):
# GH#20342
cols = MultiIndex.from_tuples(
[("c", "e"), ("d", "f")], names=["level_1", "level_2"]
)
mi = MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"])
df = DataFrame([[3, 4], [7, 8], [11, 12]], index=mi, columns=cols)
if frame_or_series is not DataFrame:
df = df.iloc[:, 0]
# test that dropping of a level in index works
expected = df.reset_index("a", drop=True)
result = df.droplevel("a", axis="index")
tm.assert_equal(result, expected)
if frame_or_series is DataFrame:
# test that dropping of a level in columns works
expected = df.copy()
expected.columns = Index(["c", "d"], name="level_1")
result = df.droplevel("level_2", axis="columns")
tm.assert_equal(result, expected)
else:
# test that droplevel raises ValueError on axis != 0
with pytest.raises(ValueError, match="No axis named columns"):
df.droplevel(1, axis="columns")

View File

@ -0,0 +1,285 @@
import datetime
import dateutil
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameMissingData:
    """Tests for ``DataFrame.dropna`` and related drop/dropna behaviour."""

    def test_dropEmptyRows(self, float_frame):
        """``how="all"`` removes the NaN rows, with and without ``subset``."""
        N = len(float_frame.index)
        mat = np.random.default_rng(2).standard_normal(N)
        mat[:5] = np.nan

        frame = DataFrame({"foo": mat}, index=float_frame.index)
        original = Series(mat, index=float_frame.index, name="foo")
        expected = original.dropna()
        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna(how="all")
        # check that original was preserved
        tm.assert_series_equal(frame["foo"], original)
        return_value = inplace_frame1.dropna(how="all", inplace=True)
        tm.assert_series_equal(smaller_frame["foo"], expected)
        tm.assert_series_equal(inplace_frame1["foo"], expected)
        assert return_value is None

        smaller_frame = frame.dropna(how="all", subset=["foo"])
        return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
        tm.assert_series_equal(smaller_frame["foo"], expected)
        tm.assert_series_equal(inplace_frame2["foo"], expected)
        assert return_value is None

    def test_dropIncompleteRows(self, float_frame):
        """Default ``dropna`` removes rows with any NaN; ``subset`` limits the check."""
        N = len(float_frame.index)
        mat = np.random.default_rng(2).standard_normal(N)
        mat[:5] = np.nan

        frame = DataFrame({"foo": mat}, index=float_frame.index)
        frame["bar"] = 5
        original = Series(mat, index=float_frame.index, name="foo")
        inp_frame1, inp_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna()
        tm.assert_series_equal(frame["foo"], original)
        return_value = inp_frame1.dropna(inplace=True)

        exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
        tm.assert_series_equal(smaller_frame["foo"], exp)
        tm.assert_series_equal(inp_frame1["foo"], exp)
        assert return_value is None

        # "bar" holds no NaN, so restricting the check to it drops nothing.
        samesize_frame = frame.dropna(subset=["bar"])
        tm.assert_series_equal(frame["foo"], original)
        assert (frame["bar"] == 5).all()
        return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
        tm.assert_index_equal(samesize_frame.index, float_frame.index)
        tm.assert_index_equal(inp_frame2.index, float_frame.index)
        assert return_value is None

    def test_dropna(self):
        """Cover ``axis``, ``thresh``, ``subset``, ``how="all"`` and a bad axis."""
        df = DataFrame(np.random.default_rng(2).standard_normal((6, 4)))
        df.iloc[:2, 2] = np.nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        return_value = inp.dropna(axis=1, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=0)
        expected = df.loc[list(range(2, 6))]
        inp = df.copy()
        return_value = inp.dropna(axis=0, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        return_value = inp.dropna(axis=1, thresh=5, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[range(2, 6)]
        inp = df.copy()
        return_value = inp.dropna(axis=0, thresh=4, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=1, thresh=4)
        tm.assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        tm.assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        tm.assert_frame_equal(dropped, df)
        tm.assert_frame_equal(inp, df)
        assert return_value is None

        # all
        dropped = df.dropna(axis=1, how="all")
        tm.assert_frame_equal(dropped, df)

        df[2] = np.nan
        dropped = df.dropna(axis=1, how="all")
        expected = df.loc[:, [0, 1, 3]]
        tm.assert_frame_equal(dropped, expected)

        # bad input
        msg = "No axis named 3 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            df.dropna(axis=3)

    def test_drop_and_dropna_caching(self):
        """In-place ``dropna``/``drop`` on a fetched column leave the frame intact."""
        # test that cacher updates
        original = Series([1, 2, np.nan], name="A")
        expected = Series([1, 2], dtype=original.dtype, name="A")
        df = DataFrame({"A": original.values.copy()})
        df2 = df.copy()
        df["A"].dropna()
        tm.assert_series_equal(df["A"], original)

        ser = df["A"]
        return_value = ser.dropna(inplace=True)
        tm.assert_series_equal(ser, expected)
        tm.assert_series_equal(df["A"], original)
        assert return_value is None

        df2["A"].drop([1])
        tm.assert_series_equal(df2["A"], original)

        ser = df2["A"]
        return_value = ser.drop([1], inplace=True)
        tm.assert_series_equal(ser, original.drop([1]))
        tm.assert_series_equal(df2["A"], original)
        assert return_value is None

    def test_dropna_corner(self, float_frame):
        """An invalid ``how`` and an unknown subset column raise."""
        # bad input
        msg = "invalid how option: foo"
        with pytest.raises(ValueError, match=msg):
            float_frame.dropna(how="foo")
        # non-existent column - 8303
        with pytest.raises(KeyError, match=r"^\['X'\]$"):
            float_frame.dropna(subset=["A", "X"])

    def test_dropna_multiple_axes(self):
        """A list or tuple for ``axis`` raises ``TypeError``."""
        df = DataFrame(
            [
                [1, np.nan, 2, 3],
                [4, np.nan, 5, 6],
                [np.nan, np.nan, np.nan, np.nan],
                [7, np.nan, 8, 9],
            ]
        )

        # GH20987
        with pytest.raises(TypeError, match="supplying multiple axes"):
            df.dropna(how="all", axis=[0, 1])
        with pytest.raises(TypeError, match="supplying multiple axes"):
            df.dropna(how="all", axis=(0, 1))

        inp = df.copy()
        with pytest.raises(TypeError, match="supplying multiple axes"):
            inp.dropna(how="all", axis=(0, 1), inplace=True)

    def test_dropna_tz_aware_datetime(self):
        """``dropna`` keeps tz-aware datetimes and drops ``None`` / NaN entries."""
        # GH13407
        df = DataFrame()
        dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
        dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
        df["Time"] = [dt1]
        result = df.dropna(axis=0)
        expected = DataFrame({"Time": [dt1]})
        tm.assert_frame_equal(result, expected)

        # Ex2
        df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
        result = df.dropna(axis=0)
        expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
        tm.assert_frame_equal(result, expected)

    def test_dropna_categorical_interval_index(self):
        """``dropna`` works on a frame indexed by a CategoricalIndex of intervals."""
        # GH 25087
        ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
        ci = pd.CategoricalIndex(ii)
        df = DataFrame({"A": list("abc")}, index=ci)

        expected = df
        result = df.dropna()
        tm.assert_frame_equal(result, expected)

    def test_dropna_with_duplicate_columns(self):
        """``subset`` may name a column label that is duplicated in the frame."""
        df = DataFrame(
            {
                "A": np.random.default_rng(2).standard_normal(5),
                "B": np.random.default_rng(2).standard_normal(5),
                "C": np.random.default_rng(2).standard_normal(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        # Compute the expectation under unique labels first, then relabel
        # both frames so that "A" appears twice.
        expected = df.dropna(subset=["A", "B", "C"], how="all")
        expected.columns = ["A", "A", "B", "C"]

        df.columns = ["A", "A", "B", "C"]

        result = df.dropna(subset=["A", "C"], how="all")
        tm.assert_frame_equal(result, expected)

    def test_set_single_column_subset(self):
        """``subset`` accepts a single column label instead of a list."""
        # GH 41021
        df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]})
        expected = DataFrame(
            {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2]
        )
        result = df.dropna(subset="C")
        tm.assert_frame_equal(result, expected)

    def test_single_column_not_present_in_axis(self):
        """A scalar ``subset`` naming a missing column raises ``KeyError``."""
        # GH 41021
        df = DataFrame({"A": [1, 2, 3]})

        # Column not present
        # The pattern is escaped and anchored: an unescaped "['D']" is a regex
        # character class that matches any single quote or "D".
        with pytest.raises(KeyError, match=r"^\['D'\]$"):
            df.dropna(subset="D", axis=0)

    def test_subset_is_nparray(self):
        """``subset`` accepts a NumPy array of column labels."""
        # GH 41021
        df = DataFrame({"A": [1, 2, np.nan], "B": list("abc"), "C": [4, np.nan, 5]})
        expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]})
        result = df.dropna(subset=np.array(["A", "C"]))
        tm.assert_frame_equal(result, expected)

    def test_no_nans_in_frame(self, axis):
        """A frame without NaNs is returned unchanged, index type included."""
        # GH#41965
        df = DataFrame([[1, 2], [3, 4]], columns=pd.RangeIndex(0, 2))
        expected = df.copy()
        result = df.dropna(axis=axis)
        tm.assert_frame_equal(result, expected, check_index_type=True)

    def test_how_thresh_param_incompatible(self):
        """Passing both ``how`` and ``thresh`` raises ``TypeError``."""
        # GH46575
        df = DataFrame([1, 2, pd.NA])
        msg = "You cannot set both the how and thresh arguments at the same time"
        with pytest.raises(TypeError, match=msg):
            df.dropna(how="all", thresh=2)

        with pytest.raises(TypeError, match=msg):
            df.dropna(how="any", thresh=2)

        with pytest.raises(TypeError, match=msg):
            df.dropna(how=None, thresh=None)

    @pytest.mark.parametrize("val", [1, 1.5])
    def test_dropna_ignore_index(self, val):
        """``ignore_index=True`` relabels the result with a default index."""
        # GH#31725
        df = DataFrame({"a": [1, 2, val]}, index=[3, 2, 1])
        result = df.dropna(ignore_index=True)
        expected = DataFrame({"a": [1, 2, val]})
        tm.assert_frame_equal(result, expected)

        df.dropna(ignore_index=True, inplace=True)
        tm.assert_frame_equal(df, expected)

View File

@ -0,0 +1,153 @@
from datetime import timedelta
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import (
DataFrame,
Series,
date_range,
option_context,
)
import pandas._testing as tm
class TestDataFrameDataTypes:
    """Tests for the ``DataFrame.dtypes`` accessor."""

    def test_empty_frame_dtypes(self):
        """``dtypes`` of frames lacking rows and/or columns."""
        empty_df = DataFrame()
        tm.assert_series_equal(empty_df.dtypes, Series(dtype=object))

        nocols_df = DataFrame(index=[1, 2, 3])
        tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object))

        norows_df = DataFrame(columns=list("abc"))
        tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc")))

        norows_int_df = DataFrame(columns=list("abc")).astype(np.int32)
        tm.assert_series_equal(
            norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc"))
        )

        df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3])
        ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64})
        tm.assert_series_equal(df.dtypes, ex_dtypes)

        # same but for empty slice of df
        tm.assert_series_equal(df[:0].dtypes, ex_dtypes)

    def test_datetime_with_tz_dtypes(self):
        """Naive and tz-aware datetime columns keep their dtypes after NaT is set."""
        tzframe = DataFrame(
            {
                "A": date_range("20130101", periods=3),
                "B": date_range("20130101", periods=3, tz="US/Eastern"),
                "C": date_range("20130101", periods=3, tz="CET"),
            }
        )
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ],
            ["A", "B", "C"],
        )

        tm.assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """Taking a column slice must not change the parent frame's ``dtypes``."""
        # GH6525
        df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64)
        tm.assert_series_equal(
            df.dtypes,
            Series({"a": np.float64, "b": np.float64, "c": np.float64}),
        )
        tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64}))
        # Re-check the parent after the slice was taken.
        tm.assert_series_equal(
            df.dtypes,
            Series({"a": np.float64, "b": np.float64, "c": np.float64}),
        )

    @pytest.mark.parametrize(
        "data",
        [pd.NA, True],
    )
    def test_dtypes_are_correct_after_groupby_last(self, data):
        """``groupby(...).last()`` keeps the nullable boolean dtype."""
        # GH46409
        df = DataFrame(
            {"id": [1, 2, 3, 4], "test": [True, pd.NA, data, False]}
        ).convert_dtypes()
        result = df.groupby("id").last().test
        expected = df.set_index("id").test
        assert result.dtype == pd.BooleanDtype()
        # (result, expected) order matches the other assertions in this class,
        # so failure output labels the two sides correctly.
        tm.assert_series_equal(result, expected)

    def test_dtypes_gh8722(self, float_string_frame):
        """``dtypes`` matches per-column dtypes, also under ``use_inf_as_na``."""
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        expected = Series(
            {k: v.dtype for k, v in float_string_frame.items()}, index=result.index
        )
        tm.assert_series_equal(result, expected)

        # compat, GH 8722
        msg = "use_inf_as_na option is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            with option_context("use_inf_as_na", True):
                df = DataFrame([[1]])
                result = df.dtypes
                tm.assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_dtypes_timedeltas(self):
        """``dtypes`` as datetime, timedelta, derived and integer columns are added."""
        df = DataFrame(
            {
                "A": Series(date_range("2012-1-1", periods=3, freq="D")),
                "B": Series([timedelta(days=i) for i in range(3)]),
            }
        )
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
        )
        tm.assert_series_equal(result, expected)

        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        tm.assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        tm.assert_series_equal(result, expected)

    def test_frame_apply_np_array_return_type(self, using_infer_string):
        """``apply`` with a function returning a 0-d string array."""
        # GH 35517
        df = DataFrame([["foo"]])
        result = df.apply(lambda col: np.array("bar"))
        # The expected element type depends on the ``using_infer_string`` fixture.
        if using_infer_string:
            expected = Series([np.array(["bar"])])
        else:
            expected = Series(["bar"])
        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,117 @@
import re
import sys
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
    """A subset naming a non-existent column ("a" vs "A") raises ``KeyError``."""
    # GH 19730
    frame = DataFrame({name: [0, 0, 1] for name in ("A", "B", "C")})
    expected_message = re.escape("Index(['a'], dtype=")

    with pytest.raises(KeyError, match=expected_message):
        frame.duplicated(subset)
def test_duplicated_implemented_no_recursion():
    """``duplicated`` must succeed on a wide frame under a low recursion limit."""
    # gh-21524
    # Ensure duplicated isn't implemented using recursion that
    # can fail on wide frames
    wide = DataFrame(np.random.default_rng(2).integers(0, 1000, (10, 1000)))
    saved_limit = sys.getrecursionlimit()
    try:
        sys.setrecursionlimit(100)
        flags = wide.duplicated()
    finally:
        # Always restore the interpreter-wide limit.
        sys.setrecursionlimit(saved_limit)

    # The call must yield a bool Series without failing during the
    # calculation. The actual values do not matter here, though usually
    # they are all False in this case.
    assert isinstance(flags, Series)
    assert flags.dtype == np.bool_
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """``keep`` selects which occurrence of a repeated row is left unflagged."""
    frame = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """Expected-to-fail check that NaN and None are kept distinct in object data."""
    frame = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
    """``duplicated(subset=...)`` equals ``duplicated`` on the column-selected frame."""
    frame = DataFrame(
        {
            "A": [0, 1, 1, 2, 0],
            "B": ["a", "b", "b", "c", "a"],
            "C": [np.nan, 3, 3, None, np.nan],
        }
    )

    if subset is None:
        columns = list(frame.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        columns = [subset]
    else:
        columns = subset

    reference = frame[columns].duplicated(keep=keep)
    tm.assert_series_equal(frame.duplicated(keep=keep, subset=columns), reference)
def test_duplicated_on_empty_frame():
    """The mask from ``duplicated`` on an empty frame can index that frame."""
    # GH 25184
    empty = DataFrame(columns=["a", "b"])
    mask = empty.duplicated("a")
    tm.assert_frame_equal(empty[mask], empty.copy())
def test_frame_datetime64_duplicated():
    """Distinct dates are never flagged as duplicates, in a frame or a Series."""
    dates = date_range("2010-07-01", end="2010-08-05")

    tst = DataFrame({"symbol": "AAA", "date": dates})
    result = tst.duplicated(["date", "symbol"])
    # ``~`` is the logical negation of a boolean mask; unary minus on
    # booleans only works through pandas special-casing.
    assert (~result).all()

    tst = DataFrame({"date": dates})
    result = tst.date.duplicated()
    assert (~result).all()

View File

@ -0,0 +1,85 @@
import numpy as np
from pandas import (
DataFrame,
date_range,
)
import pandas._testing as tm
class TestEquals:
def test_dataframe_not_equal(self):
# see GH#28839
df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]})
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
assert df1.equals(df2) is False
def test_equals_different_blocks(self, using_array_manager, using_infer_string):
# GH#9330
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
if not using_array_manager and not using_infer_string:
# this assert verifies that the above operations have
# induced a block rearrangement
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
# do the real tests
tm.assert_frame_equal(df0, df1)
assert df0.equals(df1)
assert df1.equals(df0)
def test_equals(self):
# Add object dtype column with nans
index = np.random.default_rng(2).random(10)
df1 = DataFrame(
np.random.default_rng(2).random(10), index=index, columns=["floats"]
)
df1["text"] = "the sky is so blue. we could use more chocolate.".split()
df1["start"] = date_range("2000-1-1", periods=10, freq="min")
df1["end"] = date_range("2000-1-1", periods=10, freq="D")
df1["diff"] = df1["end"] - df1["start"]
# Explicitly cast to object, to avoid implicit cast when setting np.nan
df1["bool"] = (np.arange(10) % 3 == 0).astype(object)
df1.loc[::2] = np.nan
df2 = df1.copy()
assert df1["text"].equals(df2["text"])
assert df1["start"].equals(df2["start"])
assert df1["end"].equals(df2["end"])
assert df1["diff"].equals(df2["diff"])
assert df1["bool"].equals(df2["bool"])
assert df1.equals(df2)
assert not df1.equals(object)
# different dtype
different = df1.copy()
different["floats"] = different["floats"].astype("float32")
assert not df1.equals(different)
# different index
different_index = -index
different = df2.set_index(different_index)
assert not df1.equals(different)
# different columns
different = df2.copy()
different.columns = df2.columns[::-1]
assert not df1.equals(different)
# DatetimeIndex
index = date_range("2000-1-1", periods=10, freq="min")
df1 = df1.set_index(index)
df2 = df1.copy()
assert df1.equals(df2)
# MultiIndex
df3 = df1.set_index(["text"], append=True)
df2 = df1.set_index(["text"], append=True)
assert df3.equals(df2)
df2 = df1.set_index(["floats"], append=True)
assert not df3.equals(df2)
# NaN in index
df3 = df1.set_index(["floats"], append=True)
df2 = df1.set_index(["floats"], append=True)
assert df3.equals(df2)

View File

@ -0,0 +1,303 @@
import re
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_error():
    """Check the ``ValueError`` messages ``explode`` raises for bad column arguments."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": nested, "B": 1})

    # a list nested inside the column list is rejected
    bad_type_msg = "column must be a scalar, tuple, or list thereof"
    with pytest.raises(ValueError, match=bad_type_msg):
        frame.explode([list("AA")])

    # the same column may not be requested twice
    with pytest.raises(ValueError, match="column must be unique"):
        frame.explode(list("AA"))

    # duplicated labels in the frame itself are rejected as well
    frame.columns = list("AA")
    dup_msg = re.escape("DataFrame columns must be unique. Duplicate columns: ['A']")
    with pytest.raises(ValueError, match=dup_msg):
        frame.explode("A")
@pytest.mark.parametrize(
    "input_subset, error_message",
    [
        (
            list("AC"),
            "columns must have matching element counts",
        ),
        (
            [],
            "column must be nonempty",
        ),
    ],
)
def test_error_multi_columns(input_subset, error_message):
    """Check that multi-column ``explode`` rejects invalid column subsets.

    Parameters
    ----------
    input_subset : list
        Column labels handed to ``DataFrame.explode``.
    error_message : str
        Pattern the raised ``ValueError`` message has to match.

    The parametrization previously listed the ``list("AC")`` case twice
    verbatim; the redundant copy was dropped so each scenario runs once.
    """
    # GH 39240
    df = pd.DataFrame(
        {
            # row "d" holds 2 elements in "A" but 3 in "C"
            "A": [[0, 1, 2], np.nan, [], (3, 4)],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
        },
        index=list("abcd"),
    )
    with pytest.raises(ValueError, match=error_message):
        df.explode(input_subset)
@pytest.mark.parametrize(
    "scalar",
    ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")],
)
def test_basic(scalar):
    """Explode one list-like column whose label is an arbitrary scalar."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )

    result = pd.DataFrame({scalar: nested, "B": 1}).explode(scalar)

    expected = pd.DataFrame({scalar: flattened, "B": 1})
    tm.assert_frame_equal(result, expected)
def test_multi_index_rows():
    """Explode a column of a frame whose rows are labelled by a MultiIndex."""
    row_keys = [("a", 1), ("a", 2), ("b", 1), ("b", 2)]
    cells = np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object)
    df = pd.DataFrame(
        {"A": cells, "B": 1}, index=pd.MultiIndex.from_tuples(row_keys)
    )

    result = df.explode("A")

    # each row key is repeated once per exploded element (3, 1, 1 and 2 times)
    repeated_keys = [("a", 1)] * 3 + [("a", 2), ("b", 1)] + [("b", 2)] * 2
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.MultiIndex.from_tuples(repeated_keys),
        dtype=object,
    )
    expected = pd.DataFrame({"A": exploded, "B": 1})
    tm.assert_frame_equal(result, expected)
def test_multi_index_columns():
    """Explode a column that is addressed by a tuple (MultiIndex column) label."""
    target, other = ("A", 1), ("A", 2)
    cells = np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object)
    df = pd.DataFrame({target: cells, other: 1})

    result = df.explode(target)

    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
        dtype=object,
    )
    expected = pd.DataFrame({target: exploded, other: 1})
    tm.assert_frame_equal(result, expected)
def test_usecase():
    """Explode two realistic frames: a column of ranges and a column of split strings."""
    # explode a single column
    # gh-10511
    ranged = pd.DataFrame(
        [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
    ).set_index("C")
    expected = pd.DataFrame(
        {
            "A": [11] * 5 + [22] * 3,
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10] * 5 + [20] * 3,
        },
        columns=list("ABC"),
    ).set_index("C")
    tm.assert_frame_equal(ranged.explode("B"), expected)

    # gh-8517
    header = ["dt", "name", "text"]
    people = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=header,
    )
    words = people.text.str.split(" ")
    result = people.assign(text=words).explode("text")
    expected = pd.DataFrame(
        [
            ["2014-01-01", "Alice", "A"],
            ["2014-01-01", "Alice", "B"],
            ["2014-01-02", "Bob", "C"],
            ["2014-01-02", "Bob", "D"],
        ],
        columns=header,
        index=[0, 0, 1, 1],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "input_dict, input_index, expected_dict, expected_index",
    [
        # plain list of duplicated labels
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            [0, 0],
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            [0, 0, 0, 0],
        ),
        # named Index
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.Index([0, 0], name="my_index"),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.Index([0, 0, 0, 0], name="my_index"),
        ),
        # MultiIndex with every level named
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
            ),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]],
                names=["my_first_index", "my_second_index"],
            ),
        ),
        # MultiIndex with one unnamed level
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
            ),
        ),
    ],
)
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
    """Explode ``col1`` of a frame whose row labels are duplicated."""
    # GH 28005
    frame = pd.DataFrame(input_dict, index=input_index, dtype=object)
    exploded = frame.explode("col1")

    wanted = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
    tm.assert_frame_equal(exploded, wanted)
def test_ignore_index():
    """Explode with ``ignore_index=True`` and expect a fresh 0..n-1 index."""
    # GH 34932
    source = pd.DataFrame(
        {"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}
    )
    expected = pd.DataFrame(
        {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]
    )
    tm.assert_frame_equal(source.explode("values", ignore_index=True), expected)
def test_explode_sets():
    """Explode a column holding a ``set``; the result is sorted before comparing."""
    # https://github.com/pandas-dev/pandas/issues/35614
    frame = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])
    exploded = frame.explode(column="a")
    result = exploded.sort_values(by="a")

    expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "input_subset, expected_dict, expected_index",
    [
        # both list-like columns exploded in lockstep
        (
            list("AC"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
            },
            list("aaabcdde"),
        ),
        # only "A" exploded; "C" cells are repeated untouched
        (
            list("A"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": [
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    "foo",
                    [],
                    ["d", "e"],
                    ["d", "e"],
                    np.nan,
                ],
            },
            list("aaabcdde"),
        ),
    ],
)
def test_multi_columns(input_subset, expected_dict, expected_index):
    """Explode one or several columns of the same frame at once."""
    # GH 39240
    cells = {
        "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
        "B": 1,
        "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
    }
    frame = pd.DataFrame(cells, index=list("abcde"))

    exploded = frame.explode(input_subset)

    wanted = pd.DataFrame(expected_dict, index=expected_index)
    tm.assert_frame_equal(exploded, wanted)
def test_multi_columns_nan_empty():
    """Explode two columns together where cells are NaN or empty lists."""
    # GH 46084
    frame = pd.DataFrame(
        {
            "A": [[0, 1], [5], [], [2, 3]],
            "B": [9, 8, 7, 6],
            "C": [[1, 2], np.nan, [], [3, 4]],
        }
    )

    exploded = frame.explode(["A", "C"])

    wanted_a = np.array([0, 1, 5, np.nan, 2, 3], dtype=object)
    wanted_c = np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object)
    wanted = pd.DataFrame(
        {"A": wanted_a, "B": [9, 9, 8, 7, 6, 6], "C": wanted_c},
        index=[0, 0, 1, 2, 3, 3],
    )
    tm.assert_frame_equal(exploded, wanted)

View File

@ -0,0 +1,932 @@
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
import pandas.util._test_decorators as td
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
NaT,
PeriodIndex,
Series,
TimedeltaIndex,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm
from pandas.tests.frame.common import _check_mixed_float
class TestFillNA:
def test_fillna_dict_inplace_nonunique_columns(
self, using_copy_on_write, warn_copy_on_write
):
df = DataFrame(
{"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]}
)
df.columns = ["A", "A", "A"]
orig = df[:]
# TODO(CoW-warn) better warning message
with tm.assert_cow_warning(warn_copy_on_write):
df.fillna({"A": 2}, inplace=True)
# The first and third columns can be set inplace, while the second cannot.
expected = DataFrame(
{"A": [2.0] * 3, "B": [2, Timestamp(1), 2], "C": [2, "foo", 2]}
)
expected.columns = ["A", "A", "A"]
tm.assert_frame_equal(df, expected)
# TODO: what's the expected/desired behavior with CoW?
if not using_copy_on_write:
assert tm.shares_memory(df.iloc[:, 0], orig.iloc[:, 0])
assert not tm.shares_memory(df.iloc[:, 1], orig.iloc[:, 1])
if not using_copy_on_write:
assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2])
@td.skip_array_manager_not_yet_implemented
def test_fillna_on_column_view(self, using_copy_on_write):
# GH#46149 avoid unnecessary copies
arr = np.full((40, 50), np.nan)
df = DataFrame(arr, copy=False)
if using_copy_on_write:
with tm.raises_chained_assignment_error():
df[0].fillna(-1, inplace=True)
assert np.isnan(arr[:, 0]).all()
else:
with tm.assert_produces_warning(FutureWarning, match="inplace method"):
df[0].fillna(-1, inplace=True)
assert (arr[:, 0] == -1).all()
# i.e. we didn't create a new 49-column block
assert len(df._mgr.arrays) == 1
assert np.shares_memory(df.values, arr)
def test_fillna_datetime(self, datetime_frame):
tf = datetime_frame
tf.loc[tf.index[:5], "A"] = np.nan
tf.loc[tf.index[-5:], "A"] = np.nan
zero_filled = datetime_frame.fillna(0)
assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
padded = datetime_frame.fillna(method="pad")
assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
assert (
padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
).all()
msg = "Must specify a fill 'value' or 'method'"
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna()
msg = "Cannot specify both 'value' and 'method'"
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna(5, method="ffill")
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan
# TODO: make stronger assertion here, GH 25640
mf.fillna(value=0)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
mf.fillna(method="pad")
def test_fillna_mixed_float(self, mixed_float_frame):
# mixed numeric (but no float16)
mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
mf.loc[mf.index[-10:], "A"] = np.nan
result = mf.fillna(value=0)
_check_mixed_float(result, dtype={"C": None})
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = mf.fillna(method="pad")
_check_mixed_float(result, dtype={"C": None})
def test_fillna_empty(self, using_copy_on_write):
if using_copy_on_write:
pytest.skip("condition is unnecessary complex and is deprecated anyway")
# empty frame (GH#2778)
df = DataFrame(columns=["x"])
for m in ["pad", "backfill"]:
msg = "Series.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)
def test_fillna_different_dtype(self, using_infer_string):
# with different dtype (GH#3386)
df = DataFrame(
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
)
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna({2: "foo"})
else:
result = df.fillna({2: "foo"})
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
tm.assert_frame_equal(result, expected)
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
return_value = df.fillna({2: "foo"}, inplace=True)
else:
return_value = df.fillna({2: "foo"}, inplace=True)
tm.assert_frame_equal(df, expected)
assert return_value is None
def test_fillna_limit_and_value(self):
# limit and value
df = DataFrame(np.random.default_rng(2).standard_normal((10, 3)))
df.iloc[2:7, 0] = np.nan
df.iloc[3:5, 2] = np.nan
expected = df.copy()
expected.iloc[2, 0] = 999
expected.iloc[3, 2] = 999
result = df.fillna(999, limit=1)
tm.assert_frame_equal(result, expected)
def test_fillna_datelike(self):
# with datelike
# GH#6344
df = DataFrame(
{
"Date": [NaT, Timestamp("2014-1-1")],
"Date2": [Timestamp("2013-1-1"), NaT],
}
)
expected = df.copy()
expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
result = df.fillna(value={"Date": df["Date2"]})
tm.assert_frame_equal(result, expected)
def test_fillna_tzaware(self):
# with timezone
# GH#15855
df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]})
exp = DataFrame(
{
"A": [
Timestamp("2012-11-11 00:00:00+01:00"),
Timestamp("2012-11-11 00:00:00+01:00"),
]
}
)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = df.fillna(method="pad")
tm.assert_frame_equal(res, exp)
df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]})
exp = DataFrame(
{
"A": [
Timestamp("2012-11-11 00:00:00+01:00"),
Timestamp("2012-11-11 00:00:00+01:00"),
]
}
)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = df.fillna(method="bfill")
tm.assert_frame_equal(res, exp)
def test_fillna_tzaware_different_column(self):
# with timezone in another column
# GH#15522
df = DataFrame(
{
"A": date_range("20130101", periods=4, tz="US/Eastern"),
"B": [1, 2, np.nan, np.nan],
}
)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna(method="pad")
expected = DataFrame(
{
"A": date_range("20130101", periods=4, tz="US/Eastern"),
"B": [1.0, 2.0, 2.0, 2.0],
}
)
tm.assert_frame_equal(result, expected)
def test_na_actions_categorical(self):
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
df.fillna(value={"cats": 4, "vals": "c"})
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = df.fillna(method="pad")
tm.assert_frame_equal(res, df_exp_fill)
# dropna
res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)
res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)
# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = DataFrame({"cats": c, "vals": [1, 2, 3]})
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)
def test_fillna_categorical_nan(self):
# GH#14021
# np.nan should always be a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
# GH#32950 df.median() is poorly behaved because there is no
# Categorical.median
median = Series({"cats": 2.0, "vals": np.nan})
res = df.fillna(median)
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
tm.assert_frame_equal(res, df_exp)
result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)
idx = DatetimeIndex(
["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT]
)
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)
idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M")
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)
idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT])
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)
def test_fillna_downcast(self):
# GH#15277
# infer int64 from float64
df = DataFrame({"a": [1.0, np.nan]})
msg = "The 'downcast' keyword in fillna is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna(0, downcast="infer")
expected = DataFrame({"a": [1, 0]})
tm.assert_frame_equal(result, expected)
# infer int64 from float64 when fillna value is a dict
df = DataFrame({"a": [1.0, np.nan]})
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna({"a": 0}, downcast="infer")
expected = DataFrame({"a": [1, 0]})
tm.assert_frame_equal(result, expected)
def test_fillna_downcast_false(self, frame_or_series):
# GH#45603 preserve object dtype with downcast=False
obj = frame_or_series([1, 2, 3], dtype="object")
msg = "The 'downcast' keyword in fillna"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = obj.fillna("", downcast=False)
tm.assert_equal(result, obj)
def test_fillna_downcast_noop(self, frame_or_series):
# GH#45423
# Two relevant paths:
# 1) not _can_hold_na (e.g. integer)
# 2) _can_hold_na + noop + not can_hold_element
obj = frame_or_series([1, 2, 3], dtype=np.int64)
msg = "The 'downcast' keyword in fillna"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#40988
res = obj.fillna("foo", downcast=np.dtype(np.int32))
expected = obj.astype(np.int32)
tm.assert_equal(res, expected)
obj2 = obj.astype(np.float64)
with tm.assert_produces_warning(FutureWarning, match=msg):
res2 = obj2.fillna("foo", downcast="infer")
expected2 = obj # get back int64
tm.assert_equal(res2, expected2)
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#40988
res3 = obj2.fillna("foo", downcast=np.dtype(np.int32))
tm.assert_equal(res3, expected)
@pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]])
def test_fillna_dictlike_value_duplicate_colnames(self, columns):
# GH#43476
df = DataFrame(np.nan, index=[0, 1], columns=columns)
with tm.assert_produces_warning(None):
result = df.fillna({"A": 0})
expected = df.copy()
expected["A"] = 0.0
tm.assert_frame_equal(result, expected)
def test_fillna_dtype_conversion(self, using_infer_string):
# make sure that fillna on an empty frame works
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = df.dtypes
expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
tm.assert_series_equal(result, expected)
msg = "Downcasting object dtype arrays"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna(1)
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
tm.assert_frame_equal(result, expected)
# empty block
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna("nan")
else:
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
def test_fillna_dtype_conversion_equiv_replace(self, val):
df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
expected = df.replace(np.nan, val)
result = df.fillna(val)
tm.assert_frame_equal(result, expected)
def test_fillna_datetime_columns(self):
# GH#7095
df = DataFrame(
{
"A": [-1, -2, np.nan],
"B": date_range("20130101", periods=3),
"C": ["foo", "bar", None],
"D": ["foo2", "bar2", None],
},
index=date_range("20130110", periods=3),
)
result = df.fillna("?")
expected = DataFrame(
{
"A": [-1, -2, "?"],
"B": date_range("20130101", periods=3),
"C": ["foo", "bar", "?"],
"D": ["foo2", "bar2", "?"],
},
index=date_range("20130110", periods=3),
)
tm.assert_frame_equal(result, expected)
df = DataFrame(
{
"A": [-1, -2, np.nan],
"B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT],
"C": ["foo", "bar", None],
"D": ["foo2", "bar2", None],
},
index=date_range("20130110", periods=3),
)
result = df.fillna("?")
expected = DataFrame(
{
"A": [-1, -2, "?"],
"B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"],
"C": ["foo", "bar", "?"],
"D": ["foo2", "bar2", "?"],
},
index=date_range("20130110", periods=3),
)
tm.assert_frame_equal(result, expected)
def test_ffill(self, datetime_frame):
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
alt = datetime_frame.fillna(method="ffill")
tm.assert_frame_equal(datetime_frame.ffill(), alt)
def test_bfill(self, datetime_frame):
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
alt = datetime_frame.fillna(method="bfill")
tm.assert_frame_equal(datetime_frame.bfill(), alt)
def test_frame_pad_backfill_limit(self):
index = np.arange(10)
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)
result = df[:2].reindex(index, method="pad", limit=5)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df[:2].reindex(index).fillna(method="pad")
expected.iloc[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index, method="backfill", limit=5)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df[-2:].reindex(index).fillna(method="backfill")
expected.iloc[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_frame_fillna_limit(self):
index = np.arange(10)
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)
result = df[:2].reindex(index)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = result.fillna(method="pad", limit=5)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df[:2].reindex(index).fillna(method="pad")
expected.iloc[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = result.fillna(method="backfill", limit=5)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df[-2:].reindex(index).fillna(method="backfill")
expected.iloc[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_fillna_skip_certain_blocks(self):
# don't try to fill boolean, int blocks
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)).astype(int))
# it works!
df.fillna(np.nan)
@pytest.mark.parametrize("type", [int, float])
def test_fillna_positive_limit(self, type):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(type)
msg = "Limit must be greater than 0"
with pytest.raises(ValueError, match=msg):
df.fillna(0, limit=-5)
@pytest.mark.parametrize("type", [int, float])
def test_fillna_integer_limit(self, type):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(type)
msg = "Limit must be an integer"
with pytest.raises(ValueError, match=msg):
df.fillna(0, limit=0.5)
def test_fillna_inplace(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
df.loc[:4, 1] = np.nan
df.loc[-4:, 3] = np.nan
expected = df.fillna(value=0)
assert expected is not df
df.fillna(value=0, inplace=True)
tm.assert_frame_equal(df, expected)
expected = df.fillna(value={0: 0}, inplace=True)
assert expected is None
df.loc[:4, 1] = np.nan
df.loc[-4:, 3] = np.nan
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df.fillna(method="ffill")
assert expected is not df
with tm.assert_produces_warning(FutureWarning, match=msg):
df.fillna(method="ffill", inplace=True)
tm.assert_frame_equal(df, expected)
def test_fillna_dict_series(self):
df = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, np.nan],
"b": [1, 2, 3, np.nan, np.nan],
"c": [np.nan, 1, 2, 3, 4],
}
)
result = df.fillna({"a": 0, "b": 5})
expected = df.copy()
expected["a"] = expected["a"].fillna(0)
expected["b"] = expected["b"].fillna(5)
tm.assert_frame_equal(result, expected)
# it works
result = df.fillna({"a": 0, "b": 5, "d": 7})
# Series treated same as dict
result = df.fillna(df.max())
expected = df.fillna(df.max().to_dict())
tm.assert_frame_equal(result, expected)
# disable this for now
with pytest.raises(NotImplementedError, match="column by column"):
df.fillna(df.max(1), axis=1)
def test_fillna_dataframe(self):
# GH#8377
df = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, np.nan],
"b": [1, 2, 3, np.nan, np.nan],
"c": [np.nan, 1, 2, 3, 4],
},
index=list("VWXYZ"),
)
# df2 may have different index and columns
df2 = DataFrame(
{
"a": [np.nan, 10, 20, 30, 40],
"b": [50, 60, 70, 80, 90],
"foo": ["bar"] * 5,
},
index=list("VWXuZ"),
)
result = df.fillna(df2)
# only those columns and indices which are shared get filled
expected = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, 40],
"b": [1, 2, 3, np.nan, 90],
"c": [np.nan, 1, 2, 3, 4],
},
index=list("VWXYZ"),
)
tm.assert_frame_equal(result, expected)
def test_fillna_columns(self):
arr = np.random.default_rng(2).standard_normal((10, 10))
arr[:, ::2] = np.nan
df = DataFrame(arr)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna(method="ffill", axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df.T.fillna(method="pad").T
tm.assert_frame_equal(result, expected)
df.insert(6, "foo", 5)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna(method="ffill", axis=1)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df.astype(float).fillna(method="ffill", axis=1)
tm.assert_frame_equal(result, expected)
def test_fillna_invalid_method(self, float_frame):
with pytest.raises(ValueError, match="ffil"):
float_frame.fillna(method="ffil")
def test_fillna_invalid_value(self, float_frame):
# list
msg = '"value" parameter must be a scalar or dict, but you passed a "{}"'
with pytest.raises(TypeError, match=msg.format("list")):
float_frame.fillna([1, 2])
# tuple
with pytest.raises(TypeError, match=msg.format("tuple")):
float_frame.fillna((1, 2))
# frame with series
msg = (
'"value" parameter must be a scalar, dict or Series, but you '
'passed a "DataFrame"'
)
with pytest.raises(TypeError, match=msg):
float_frame.iloc[:, 0].fillna(float_frame)
def test_fillna_col_reordering(self):
cols = ["COL." + str(i) for i in range(5, 0, -1)]
data = np.random.default_rng(2).random((20, 5))
df = DataFrame(index=range(20), columns=cols, data=data)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
filled = df.fillna(method="ffill")
assert df.columns.tolist() == filled.columns.tolist()
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan
filled = float_string_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
del float_string_frame["foo"]
float_frame.reindex(columns=[]).fillna(value=0)
def test_fillna_downcast_dict(self):
# GH#40809
df = DataFrame({"col1": [1, np.nan]})
msg = "The 'downcast' keyword in fillna"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.fillna({"col1": 2}, downcast={"col1": "int64"})
expected = DataFrame({"col1": [1, 2]})
tm.assert_frame_equal(result, expected)
def test_fillna_with_columns_and_limit(self):
# GH40989
df = DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5],
[np.nan, 3, np.nan, 4],
],
columns=list("ABCD"),
)
result = df.fillna(axis=1, value=100, limit=1)
result2 = df.fillna(axis=1, value=100, limit=2)
expected = DataFrame(
{
"A": Series([100, 3, 100, 100], dtype="float64"),
"B": [2, 4, np.nan, 3],
"C": [np.nan, 100, np.nan, np.nan],
"D": Series([0, 1, 5, 4], dtype="float64"),
},
index=[0, 1, 2, 3],
)
expected2 = DataFrame(
{
"A": Series([100, 3, 100, 100], dtype="float64"),
"B": Series([2, 4, 100, 3], dtype="float64"),
"C": [100, 100, np.nan, 100],
"D": Series([0, 1, 5, 4], dtype="float64"),
},
index=[0, 1, 2, 3],
)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result2, expected2)
def test_fillna_datetime_inplace(self):
# GH#48863
df = DataFrame(
{
"date1": to_datetime(["2018-05-30", None]),
"date2": to_datetime(["2018-09-30", None]),
}
)
expected = df.copy()
df.fillna(np.nan, inplace=True)
tm.assert_frame_equal(df, expected)
def test_fillna_inplace_with_columns_limit_and_value(self):
# GH40989
df = DataFrame(
[
[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5],
[np.nan, 3, np.nan, 4],
],
columns=list("ABCD"),
)
expected = df.fillna(axis=1, value=100, limit=1)
assert expected is not df
df.fillna(axis=1, value=100, limit=1, inplace=True)
tm.assert_frame_equal(df, expected)
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}])
def test_inplace_dict_update_view(
self, val, using_copy_on_write, warn_copy_on_write
):
# GH#47188
df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]})
df_orig = df.copy()
result_view = df[:]
with tm.assert_cow_warning(warn_copy_on_write):
df.fillna(val, inplace=True)
expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]})
tm.assert_frame_equal(df, expected)
if using_copy_on_write:
tm.assert_frame_equal(result_view, df_orig)
else:
tm.assert_frame_equal(result_view, expected)
def test_single_block_df_with_horizontal_axis(self):
# GH 47713
df = DataFrame(
{
"col1": [5, 0, np.nan, 10, np.nan],
"col2": [7, np.nan, np.nan, 5, 3],
"col3": [12, np.nan, 1, 2, 0],
"col4": [np.nan, 1, 1, np.nan, 18],
}
)
result = df.fillna(50, limit=1, axis=1)
expected = DataFrame(
[
[5.0, 7.0, 12.0, 50.0],
[0.0, 50.0, np.nan, 1.0],
[50.0, np.nan, 1.0, 1.0],
[10.0, 5.0, 2.0, 50.0],
[50.0, 3.0, 0.0, 18.0],
],
columns=["col1", "col2", "col3", "col4"],
)
tm.assert_frame_equal(result, expected)
def test_fillna_with_multi_index_frame(self):
# GH 47649
pdf = DataFrame(
{
("x", "a"): [np.nan, 2.0, 3.0],
("x", "b"): [1.0, 2.0, np.nan],
("y", "c"): [1.0, 2.0, np.nan],
}
)
expected = DataFrame(
{
("x", "a"): [-1.0, 2.0, 3.0],
("x", "b"): [1.0, 2.0, -1.0],
("y", "c"): [1.0, 2.0, np.nan],
}
)
tm.assert_frame_equal(pdf.fillna({"x": -1}), expected)
tm.assert_frame_equal(pdf.fillna({"x": -1, ("x", "b"): -2}), expected)
expected = DataFrame(
{
("x", "a"): [-1.0, 2.0, 3.0],
("x", "b"): [1.0, 2.0, -2.0],
("y", "c"): [1.0, 2.0, np.nan],
}
)
tm.assert_frame_equal(pdf.fillna({("x", "b"): -2, "x": -1}), expected)
def test_fillna_nonconsolidated_frame():
    """Filling a pivoted frame must leave no missing values behind."""
    # https://github.com/pandas-dev/pandas/issues/36495
    rows = [[k, k, k, float(k)] for k in (1, 2, 3)]
    df = DataFrame(rows, columns=["i1", "i2", "i3", "f1"])

    pivoted = df.pivot(index="i1", columns="i2")
    filled = pivoted.fillna(0)

    remaining = filled.isna().sum().sum()
    assert remaining == 0
def test_fillna_nones_inplace():
# GH 48480
df = DataFrame(
[[None, None], [None, None]],
columns=["A", "B"],
)
msg = "Downcasting object dtype arrays"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.fillna(value={"A": 1, "B": 2}, inplace=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"])
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("func", ["pad", "backfill"])
def test_pad_backfill_deprecated(func):
    """Calling ``DataFrame.pad`` / ``DataFrame.backfill`` emits a ``FutureWarning``."""
    # GH#33396
    frame = DataFrame({"a": [1, 2, 3]})
    deprecated_method = getattr(frame, func)
    with tm.assert_produces_warning(FutureWarning):
        deprecated_method()
@pytest.mark.parametrize(
"data, expected_data, method, kwargs",
(
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
"ffill",
{"limit_area": "outside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
range(5),
range(5),
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside", "limit": 1},
),
),
)
def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
# GH#56492
df = DataFrame(data)
expected = DataFrame(expected_data)
result = getattr(df, method)(**kwargs)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,153 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
class TestDataFrameFilter:
def test_filter(self, float_frame, float_string_frame):
# Items
filtered = float_frame.filter(["A", "B", "E"])
assert len(filtered.columns) == 2
assert "E" not in filtered
filtered = float_frame.filter(["A", "B", "E"], axis="columns")
assert len(filtered.columns) == 2
assert "E" not in filtered
# Other axis
idx = float_frame.index[0:4]
filtered = float_frame.filter(idx, axis="index")
expected = float_frame.reindex(index=idx)
tm.assert_frame_equal(filtered, expected)
# like
fcopy = float_frame.copy()
fcopy["AA"] = 1
filtered = fcopy.filter(like="A")
assert len(filtered.columns) == 2
assert "AA" in filtered
# like with ints in column names
df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"])
filtered = df.filter(like="_")
assert len(filtered.columns) == 2
# regex with ints in column names
# from PR #10384
df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"])
expected = DataFrame(
0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object)
)
filtered = df.filter(regex="^[0-9]+$")
tm.assert_frame_equal(filtered, expected)
expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"])
# shouldn't remove anything
filtered = expected.filter(regex="^[0-9]+$")
tm.assert_frame_equal(filtered, expected)
# pass in None
with pytest.raises(TypeError, match="Must pass"):
float_frame.filter()
with pytest.raises(TypeError, match="Must pass"):
float_frame.filter(items=None)
with pytest.raises(TypeError, match="Must pass"):
float_frame.filter(axis=1)
# test mutually exclusive arguments
with pytest.raises(TypeError, match="mutually exclusive"):
float_frame.filter(items=["one", "three"], regex="e$", like="bbi")
with pytest.raises(TypeError, match="mutually exclusive"):
float_frame.filter(items=["one", "three"], regex="e$", axis=1)
with pytest.raises(TypeError, match="mutually exclusive"):
float_frame.filter(items=["one", "three"], regex="e$")
with pytest.raises(TypeError, match="mutually exclusive"):
float_frame.filter(items=["one", "three"], like="bbi", axis=0)
with pytest.raises(TypeError, match="mutually exclusive"):
float_frame.filter(items=["one", "three"], like="bbi")
# objects
filtered = float_string_frame.filter(like="foo")
assert "foo" in filtered
# unicode columns, won't ascii-encode
df = float_frame.rename(columns={"B": "\u2202"})
filtered = df.filter(like="C")
assert "C" in filtered
def test_filter_regex_search(self, float_frame):
fcopy = float_frame.copy()
fcopy["AA"] = 1
# regex
filtered = fcopy.filter(regex="[A]+")
assert len(filtered.columns) == 2
assert "AA" in filtered
# doesn't have to be at beginning
df = DataFrame(
{"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]}
)
result = df.filter(regex="BB")
exp = df[[x for x in df.columns if "BB" in x]]
tm.assert_frame_equal(result, exp)
@pytest.mark.parametrize(
"name,expected",
[
("a", DataFrame({"a": [1, 2]})),
("a", DataFrame({"a": [1, 2]})),
("", DataFrame({"": [3, 4]})),
],
)
def test_filter_unicode(self, name, expected):
# GH13101
df = DataFrame({"a": [1, 2], "": [3, 4]})
tm.assert_frame_equal(df.filter(like=name), expected)
tm.assert_frame_equal(df.filter(regex=name), expected)
@pytest.mark.parametrize("name", ["a", "a"])
def test_filter_bytestring(self, name):
# GH13101
df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
expected = DataFrame({b"a": [1, 2]})
tm.assert_frame_equal(df.filter(like=name), expected)
tm.assert_frame_equal(df.filter(regex=name), expected)
def test_filter_corner(self):
empty = DataFrame()
result = empty.filter([])
tm.assert_frame_equal(result, empty)
result = empty.filter(like="foo")
tm.assert_frame_equal(result, empty)
def test_filter_regex_non_string(self):
# GH#5798 trying to filter on non-string columns should drop,
# not raise
df = DataFrame(np.random.default_rng(2).random((3, 2)), columns=["STRING", 123])
result = df.filter(regex="STRING")
expected = df[["STRING"]]
tm.assert_frame_equal(result, expected)
def test_filter_keep_order(self):
# GH#54980
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
result = df.filter(items=["B", "A"])
expected = df[["B", "A"]]
tm.assert_frame_equal(result, expected)
def test_filter_different_dtype(self):
# GH#54980
df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]})
result = df.filter(items=["B", "A"])
expected = df[[]]
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,143 @@
"""
Note: includes tests for `last`
"""
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
bdate_range,
date_range,
)
import pandas._testing as tm
deprecated_msg = "first is deprecated"
last_deprecated_msg = "last is deprecated"
class TestFirst:
def test_first_subset(self, frame_or_series):
ts = DataFrame(
np.random.default_rng(2).standard_normal((100, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100, freq="12h"),
)
ts = tm.get_obj(ts, frame_or_series)
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = ts.first("10d")
assert len(result) == 20
ts = DataFrame(
np.random.default_rng(2).standard_normal((100, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100, freq="D"),
)
ts = tm.get_obj(ts, frame_or_series)
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = ts.first("10d")
assert len(result) == 10
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = ts.first("3ME")
expected = ts[:"3/31/2000"]
tm.assert_equal(result, expected)
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = ts.first("21D")
expected = ts[:21]
tm.assert_equal(result, expected)
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = ts[:0].first("3ME")
tm.assert_equal(result, ts[:0])
def test_first_last_raises(self, frame_or_series):
# GH#20725
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
obj = tm.get_obj(obj, frame_or_series)
msg = "'first' only supports a DatetimeIndex index"
with tm.assert_produces_warning(
FutureWarning, match=deprecated_msg
), pytest.raises(
TypeError, match=msg
): # index is not a DatetimeIndex
obj.first("1D")
msg = "'last' only supports a DatetimeIndex index"
with tm.assert_produces_warning(
FutureWarning, match=last_deprecated_msg
), pytest.raises(
TypeError, match=msg
): # index is not a DatetimeIndex
obj.last("1D")
def test_last_subset(self, frame_or_series):
ts = DataFrame(
np.random.default_rng(2).standard_normal((100, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=100, freq="12h"),
)
ts = tm.get_obj(ts, frame_or_series)
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = ts.last("10d")
assert len(result) == 20
ts = DataFrame(
np.random.default_rng(2).standard_normal((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=30, freq="D"),
)
ts = tm.get_obj(ts, frame_or_series)
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = ts.last("10d")
assert len(result) == 10
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = ts.last("21D")
expected = ts["2000-01-10":]
tm.assert_equal(result, expected)
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = ts.last("21D")
expected = ts[-21:]
tm.assert_equal(result, expected)
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = ts[:0].last("3ME")
tm.assert_equal(result, ts[:0])
@pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)])
def test_first_with_first_day_last_of_month(self, frame_or_series, start, periods):
# GH#29623
x = frame_or_series([1] * 100, index=bdate_range(start, periods=100))
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = x.first("1ME")
expected = frame_or_series(
[1] * periods, index=bdate_range(start, periods=periods)
)
tm.assert_equal(result, expected)
def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series):
# GH#29623
x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100))
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = x.first("2ME")
expected = frame_or_series(
[1] * 23, index=bdate_range("2010-03-31", "2010-04-30")
)
tm.assert_equal(result, expected)
def test_empty_not_input(self):
# GH#51032
df = DataFrame(index=pd.DatetimeIndex([]))
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
result = df.last(offset=1)
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
result = df.first(offset=1)
tm.assert_frame_equal(df, result)
assert df is not result

View File

@ -0,0 +1,78 @@
"""
Includes tests for `last_valid_index`.
"""
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
date_range,
)
class TestFirstValidIndex:
def test_first_valid_index_single_nan(self, frame_or_series):
# GH#9752 Series/DataFrame should both return None, not raise
obj = frame_or_series([np.nan])
assert obj.first_valid_index() is None
assert obj.iloc[:0].first_valid_index() is None
@pytest.mark.parametrize(
"empty", [DataFrame(), Series(dtype=object), Series([], index=[], dtype=object)]
)
def test_first_valid_index_empty(self, empty):
# GH#12800
assert empty.last_valid_index() is None
assert empty.first_valid_index() is None
@pytest.mark.parametrize(
"data,idx,expected_first,expected_last",
[
({"A": [1, 2, 3]}, [1, 1, 2], 1, 2),
({"A": [1, 2, 3]}, [1, 2, 2], 1, 2),
({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"),
({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2),
({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2),
],
)
def test_first_last_valid_frame(self, data, idx, expected_first, expected_last):
# GH#21441
df = DataFrame(data, index=idx)
assert expected_first == df.first_valid_index()
assert expected_last == df.last_valid_index()
@pytest.mark.parametrize(
"index",
[Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)],
)
def test_first_last_valid(self, index):
mat = np.random.default_rng(2).standard_normal(len(index))
mat[:5] = np.nan
mat[-5:] = np.nan
frame = DataFrame({"foo": mat}, index=index)
assert frame.first_valid_index() == frame.index[5]
assert frame.last_valid_index() == frame.index[-6]
ser = frame["foo"]
assert ser.first_valid_index() == frame.index[5]
assert ser.last_valid_index() == frame.index[-6]
@pytest.mark.parametrize(
"index",
[Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)],
)
def test_first_last_valid_all_nan(self, index):
# GH#17400: no valid entries
frame = DataFrame(np.nan, columns=["foo"], index=index)
assert frame.last_valid_index() is None
assert frame.first_valid_index() is None
ser = frame["foo"]
assert ser.first_valid_index() is None
assert ser.last_valid_index() is None

View File

@ -0,0 +1,102 @@
import numpy as np
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
class TestGetNumericData:
def test_get_numeric_data_preserve_dtype(self):
# get the numeric data
obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object"))
result = obj._get_numeric_data()
expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[])
tm.assert_frame_equal(result, expected)
def test_get_numeric_data(self, using_infer_string):
datetime64name = np.dtype("M8[s]").name
objectname = np.dtype(np.object_).name
df = DataFrame(
{"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")},
index=np.arange(10),
)
result = df.dtypes
expected = Series(
[
np.dtype("float64"),
np.dtype("int64"),
np.dtype(objectname) if not using_infer_string else "string",
np.dtype(datetime64name),
],
index=["a", "b", "c", "f"],
)
tm.assert_series_equal(result, expected)
df = DataFrame(
{
"a": 1.0,
"b": 2,
"c": "foo",
"d": np.array([1.0] * 10, dtype="float32"),
"e": np.array([1] * 10, dtype="int32"),
"f": np.array([1] * 10, dtype="int16"),
"g": Timestamp("20010102"),
},
index=np.arange(10),
)
result = df._get_numeric_data()
expected = df.loc[:, ["a", "b", "d", "e", "f"]]
tm.assert_frame_equal(result, expected)
only_obj = df.loc[:, ["c", "g"]]
result = only_obj._get_numeric_data()
expected = df.loc[:, []]
tm.assert_frame_equal(result, expected)
df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]})
result = df._get_numeric_data()
expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]})
tm.assert_frame_equal(result, expected)
df = result.copy()
result = df._get_numeric_data()
expected = df
tm.assert_frame_equal(result, expected)
def test_get_numeric_data_mixed_dtype(self):
# numeric and object columns
df = DataFrame(
{
"a": [1, 2, 3],
"b": [True, False, True],
"c": ["foo", "bar", "baz"],
"d": [None, None, None],
"e": [3.14, 0.577, 2.773],
}
)
result = df._get_numeric_data()
tm.assert_index_equal(result.columns, Index(["a", "b", "e"]))
def test_get_numeric_data_extension_dtype(self):
# GH#22290
df = DataFrame(
{
"A": pd.array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
"B": Categorical(list("abcabc")),
"C": pd.array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
"D": IntervalArray.from_breaks(range(7)),
}
)
result = df._get_numeric_data()
expected = df.loc[:, ["A", "C"]]
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,57 @@
import numpy as np
from pandas import DataFrame
import pandas._testing as tm
def test_head_tail_generic(index, frame_or_series):
# GH#5370
ndim = 2 if frame_or_series is DataFrame else 1
shape = (len(index),) * ndim
vals = np.random.default_rng(2).standard_normal(shape)
obj = frame_or_series(vals, index=index)
tm.assert_equal(obj.head(), obj.iloc[:5])
tm.assert_equal(obj.tail(), obj.iloc[-5:])
# 0-len
tm.assert_equal(obj.head(0), obj.iloc[0:0])
tm.assert_equal(obj.tail(0), obj.iloc[0:0])
# bounded
tm.assert_equal(obj.head(len(obj) + 1), obj)
tm.assert_equal(obj.tail(len(obj) + 1), obj)
# neg index
tm.assert_equal(obj.head(-3), obj.head(len(index) - 3))
tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3))
def test_head_tail(float_frame):
tm.assert_frame_equal(float_frame.head(), float_frame[:5])
tm.assert_frame_equal(float_frame.tail(), float_frame[-5:])
tm.assert_frame_equal(float_frame.head(0), float_frame[0:0])
tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0])
tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1])
tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:])
tm.assert_frame_equal(float_frame.head(1), float_frame[:1])
tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:])
# with a float index
df = float_frame.copy()
df.index = np.arange(len(float_frame)) + 0.1
tm.assert_frame_equal(df.head(), df.iloc[:5])
tm.assert_frame_equal(df.tail(), df.iloc[-5:])
tm.assert_frame_equal(df.head(0), df[0:0])
tm.assert_frame_equal(df.tail(0), df[0:0])
tm.assert_frame_equal(df.head(-1), df.iloc[:-1])
tm.assert_frame_equal(df.tail(-1), df.iloc[1:])
def test_head_tail_empty():
    """``head`` and ``tail`` of an empty frame equal that empty frame."""
    # test empty dataframe
    nothing = DataFrame()
    for taken in (nothing.tail(), nothing.head()):
        tm.assert_frame_equal(taken, nothing)

View File

@ -0,0 +1,42 @@
from datetime import datetime
from pandas import DataFrame
import pandas._testing as tm
class TestInferObjects:
    """Tests for ``DataFrame.infer_objects``."""

    def test_infer_objects(self):
        """Object columns are re-typed once their stray string row is sliced off (GH#11221).

        Columns "a", "b" and "c" carry a string only in row 0, so after
        ``iloc[1:]`` they are expected to become int64, float64 and M8[ns].
        Column "d" keeps a string in its last row and is expected to stay
        object.
        """
        stamps = [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)]
        column_order = ["a", "b", "c", "d"]
        raw = DataFrame(
            {
                "a": ["a", 1, 2, 3],
                "b": ["b", 2.0, 3.0, 4.1],
                "c": ["c", *stamps],
                "d": [1, 2, 3, "d"],
            },
            columns=column_order,
        )

        inferred = raw.iloc[1:].infer_objects()

        assert inferred["a"].dtype == "int64"
        assert inferred["b"].dtype == "float64"
        assert inferred["c"].dtype == "M8[ns]"
        assert inferred["d"].dtype == "object"

        expected = DataFrame(
            {
                "a": [1, 2, 3],
                "b": [2.0, 3.0, 4.1],
                "c": stamps,
                "d": [2, 3, "d"],
            },
            columns=column_order,
        )
        # reconstruct frame to verify inference is same
        tm.assert_frame_equal(inferred.reset_index(drop=True), expected)

View File

@ -0,0 +1,565 @@
from io import StringIO
import re
from string import ascii_uppercase
import sys
import textwrap
import numpy as np
import pytest
from pandas.compat import (
IS64,
PYPY,
)
from pandas import (
CategoricalIndex,
DataFrame,
MultiIndex,
Series,
date_range,
option_context,
)
import pandas._testing as tm
@pytest.fixture
def duplicate_columns_frame():
    """Dataframe with duplicate column names."""
    values = np.random.default_rng(2).standard_normal((1500, 4))
    labels = ["a", "a", "b", "b"]
    return DataFrame(values, columns=labels)
def test_info_empty():
    """``info`` on an empty frame prints class, index summary and a placeholder (GH #45494)."""
    out = StringIO()
    DataFrame().info(buf=out)

    expected_lines = [
        "<class 'pandas.core.frame.DataFrame'>",
        "RangeIndex: 0 entries",
        "Empty DataFrame",
    ]
    # Every line, including the last, is newline-terminated.
    expected = "".join(line + "\n" for line in expected_lines)
    assert out.getvalue() == expected
def test_info_categorical_column_smoke_test():
    """Smoke test: ``info`` runs on a frame with a categorical column and on a filtered copy."""
    row_count = 2500
    frame = DataFrame(
        {"int64": np.random.default_rng(2).integers(100, size=row_count, dtype=int)}
    )
    letters = np.array(list("abcdefghij"))
    picks = np.random.default_rng(2).integers(0, 10, size=row_count, dtype=int)
    frame["category"] = Series(letters.take(picks)).astype("category")
    frame.isna()

    frame.info(buf=StringIO())

    only_d = frame[frame["category"] == "d"]
    only_d.info(buf=StringIO())
@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
        "float_string_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    """Smoke test: ``info`` runs in default and non-verbose mode on each named fixture frame."""
    frame = request.getfixturevalue(fixture_func_name)

    default_out = StringIO()
    frame.info(buf=default_out)
    default_lines = default_out.getvalue().splitlines()
    assert len(default_lines) > 10

    # Non-verbose output only has to be produced without raising.
    frame.info(buf=StringIO(), verbose=False)
def test_info_smoke_test2(float_frame):
# pretty useless test, used to be mixed into the repr tests
buf = StringIO()
float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)
# no columns or index
DataFrame().info(buf=buf)
@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    """The default ``info`` output equals the output for the given explicit ``verbose``.

    Both calls run under the same ``display.max_info_columns`` setting.
    """
    frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns)))

    def render(**info_kwargs):
        out = StringIO()
        frame.info(buf=out, **info_kwargs)
        return out.getvalue()

    with option_context("display.max_info_columns", max_info_columns):
        implicit = render()
        explicit = render(verbose=verbose)
        assert implicit == explicit
def test_info_verbose_check_header_separator_body():
    """Check header, separator and row numbering of verbose ``info`` on a wide frame.

    The frame has 1001 columns, so the "#" column is padded to the width of
    " 1000".  The expected header and separator therefore carry that padding
    and must be matched exactly, spaces included.
    """
    buf = StringIO()
    size = 1001
    start = 5  # table rows begin after 3 summary lines, the header and the separator

    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    # The second report is appended to the same buffer; only the rows of the
    # first report fall inside the window checked below.
    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if start <= i < start + size:
            line_nr = f" {i - start} "
            assert line.startswith(line_nr)
# The expected strings are column-aligned: the "#" column widens with the
# number of digits in the largest row number, and every cell is padded to its
# column width.  The exact runs of spaces are part of the expectation.
@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode."""
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    with StringIO() as buf:
        frame.info(verbose=True, show_counts=True, buf=buf)
        all_lines = buf.getvalue().splitlines()
    # Here table would contain only header, separator and table lines
    # dframe repr, index summary, memory usage and dtypes are excluded
    table = all_lines[3:-2]
    header, separator, first_line, *_, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp
def test_info_memory():
    """Check the complete ``info`` report of a one-column int64 frame.

    See https://github.com/pandas-dev/pandas/issues/21056.  The reported
    memory figure is the float form of ``df.memory_usage().sum()``.
    """
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()

    total_bytes = float(df.memory_usage().sum())
    # The table lines are column-aligned; their spacing must match exactly.
    expected = (
        "<class 'pandas.core.frame.DataFrame'>\n"
        "RangeIndex: 2 entries, 0 to 1\n"
        "Data columns (total 1 columns):\n"
        " #   Column  Non-Null Count  Dtype\n"
        "---  ------  --------------  -----\n"
        " 0   a       2 non-null      int64\n"
        "dtypes: int64(1)\n"
        f"memory usage: {total_bytes} bytes\n"
    )
    assert result == expected
def test_info_wide():
    """``max_cols=101`` and ``display.max_info_columns=101`` give the same report for 101 columns."""
    frame = DataFrame(np.random.default_rng(2).standard_normal((5, 101)))

    # Default call: only has to run without raising.
    frame.info(buf=StringIO())

    via_argument = StringIO()
    frame.info(buf=via_argument, max_cols=101)
    expected = via_argument.getvalue()
    assert len(expected.splitlines()) > 100

    with option_context("display.max_info_columns", 101):
        via_option = StringIO()
        frame.info(buf=via_option)
        assert via_option.getvalue() == expected
def test_info_duplicate_columns_shows_correct_dtypes():
    """Two columns with the same label each report their own dtype (GH11761).

    The expected rows are column-aligned; "int64" is padded to the width of
    "float64", hence its two trailing spaces.
    """
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    # keepends=True so the trailing padding and newline are compared as well
    lines = io.getvalue().splitlines(True)
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]
def test_info_shows_column_dtypes():
    """Every column's dtype appears on its row of the ``info`` table.

    The Dtype column is as wide as the longest dtype name used here
    ("timedelta64[ns]"), so the header and separator carry ten trailing
    spaces after "Dtype" / "-----".
    """
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    data = {
        i: np.random.default_rng(2).integers(2, size=n).astype(dtype)
        for i, dtype in enumerate(dtypes)
    }
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        # Column-aligned row: number, label, non-null count, dtype.
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res
def test_info_max_cols():
    """Check how ``verbose``, ``max_cols`` and ``max_info_columns`` pick the report size.

    For the five-column frame used here the report is expected to have either
    5 lines or 12 lines, depending on the settings in force.
    """
    df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))

    def report_line_count(option_value, **info_kwargs):
        with option_context("max_info_columns", option_value):
            buf = StringIO()
            df.info(buf=buf, **info_kwargs)
            res = buf.getvalue()
        return len(res.strip().split("\n"))

    # option below the column count: the explicit verbose setting decides
    for len_, verbose in [(5, None), (5, False), (12, True)]:
        assert report_line_count(4, verbose=verbose) == len_

    # max_cols not exceeded
    for len_, verbose in [(12, None), (5, False), (12, True)]:
        assert report_line_count(5, verbose=verbose) == len_

    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        assert report_line_count(4, max_cols=max_cols) == len_
        # setting wouldn't truncate
        assert report_line_count(5, max_cols=max_cols) == len_
def test_info_memory_usage():
    """Check the "memory usage" line of ``info`` and the sizes from ``memory_usage``.

    All ``info`` calls write to one shared buffer, so each check looks only
    at the last line written so far.
    """
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    data = {
        i: np.random.default_rng(2).integers(2, size=n).astype(dtype)
        for i, dtype in enumerate(dtypes)
    }
    df = DataFrame(data)
    buf = StringIO()

    def last_info_line(frame, memory_usage):
        frame.info(buf=buf, memory_usage=memory_usage)
        return buf.getvalue().splitlines()[-1]

    # display memory usage case
    assert "memory usage: " in last_info_line(df, True)

    # do not display memory usage case
    assert "memory usage: " not in last_info_line(df, False)

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", last_info_line(df, True))

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", last_info_line(df.iloc[:, :5], True))

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    n = 100
    data = {
        i: np.random.default_rng(2).integers(2, size=n).astype(dtype)
        for i, dtype in enumerate(dtypes)
    }
    df = DataFrame(data)
    df.columns = dtypes

    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert re.match(
        r"memory usage: [^+]+\+", last_info_line(df_with_object_index, True)
    )
    assert re.match(
        r"memory usage: [^+]+$", last_info_line(df_with_object_index, "deep")
    )

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """Off PyPy, ``deep=True`` reports more memory than the shallow figure for object data."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total > shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total > shallow_total
@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """On PyPy, ``deep=True`` reports the same memory as the shallow figure."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total == shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total == shallow_total
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    """``sys.getsizeof`` of a frame stays within 100 bytes of its deep memory usage."""
    index = MultiIndex.from_product([["a"], range(1000)])
    df = DataFrame(data=1, index=index, columns=["A"])
    deep_bytes = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    gap = deep_bytes - sys.getsizeof(df)
    assert abs(gap) < 100
def test_info_memory_usage_qualified():
    """The "+" qualifier appears in ``info`` output only for the string-bearing indexes."""
    # Each case pairs an index with whether a "+" is expected in the report.
    cases = [
        ([1, 2, 3], False),
        (list("ABC"), True),
        (MultiIndex.from_product([range(3), range(3)]), False),
        (MultiIndex.from_product([range(3), ["foo", "bar"]]), True),
    ]
    for index, expect_plus in cases:
        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=index)
        df.info(buf=buf)
        if expect_plus:
            assert "+" in buf.getvalue()
        else:
            assert "+" not in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex():
    """Compare deep memory usage of a MultiIndex frame and its unstacked form (GH 14308)."""
    # GH 14308
    # memory usage introspection should not materialize .values

    def deep_bytes(obj):
        return obj.memory_usage(deep=True).sum()

    N = 100
    M = len(ascii_uppercase)
    index = MultiIndex.from_product(
        [list(ascii_uppercase), date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    values = np.random.default_rng(2).standard_normal(N * M)
    stacked = DataFrame({"value": values}, index=index)
    unstacked = stacked.unstack("id")

    assert stacked.values.nbytes == unstacked.values.nbytes
    assert deep_bytes(stacked) > deep_bytes(unstacked)

    # high upper bound
    assert deep_bytes(unstacked) - deep_bytes(stacked) < 2000
def test_info_categorical():
    """Smoke test: ``info`` runs when index and columns are both a CategoricalIndex (GH14298)."""
    labels = CategoricalIndex(["a", "b"])
    frame = DataFrame(np.zeros((2, 2)), index=labels, columns=labels)

    frame.info(buf=StringIO())
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    """Check the complete ``info`` report of a frame with integer column labels (GH#37245)."""
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    # The table lines are column-aligned; their spacing must match exactly.
    expected = (
        "<class 'pandas.core.frame.DataFrame'>\n"
        "Index: 2 entries, A to B\n"
        "Data columns (total 2 columns):\n"
        " #   Column  Non-Null Count  Dtype\n"
        "---  ------  --------------  -----\n"
        " 0   1       2 non-null      int64\n"
        " 1   2       2 non-null      int64\n"
        "dtypes: int64(2)\n"
        "memory usage: 48.0+ bytes\n"
    )
    assert result == expected
def test_memory_usage_empty_no_warning():
    """``memory_usage`` on a column-less frame emits no warning (GH#50066).

    The result holds a single "Index" entry: 16 on 64-bit builds, 8 otherwise.
    """
    frame = DataFrame(index=["a", "b"])
    with tm.assert_produces_warning(None):
        usage = frame.memory_usage()

    index_bytes = 16 if IS64 else 8
    tm.assert_series_equal(usage, Series(index_bytes, index=["Index"]))
@pytest.mark.single_cpu
def test_info_compute_numba():
    """``info`` output is the same with and without ``compute.use_numba`` (GH#51922)."""
    pytest.importorskip("numba")
    frame = DataFrame([[1, 2], [3, 4]])

    with option_context("compute.use_numba", True):
        numba_out = StringIO()
        frame.info(buf=numba_out)
        with_numba = numba_out.getvalue()

    plain_out = StringIO()
    frame.info(buf=plain_out)
    without_numba = plain_out.getvalue()

    assert with_numba == without_numba
@pytest.mark.parametrize(
    "row, columns, show_counts, result",
    [
        [20, 20, None, True],
        [20, 20, True, True],
        [20, 20, False, False],
        [5, 5, None, False],
        [5, 5, True, False],
        [5, 5, False, False],
    ],
)
def test_info_show_counts(row, columns, show_counts, result):
    """Check whether "non-null" counts appear for the given limits and ``show_counts``.

    ``row`` and ``columns`` are applied as ``display.max_info_rows`` and
    ``display.max_info_columns``; the frame itself is always 10 x 10.
    """
    # Explicit cast to float to avoid implicit cast when setting nan
    frame = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"})
    frame.iloc[1, 1] = np.nan

    limits = option_context(
        "display.max_info_rows", row, "display.max_info_columns", columns
    )
    with limits, StringIO() as buf:
        frame.info(buf=buf, show_counts=show_counts)
        counts_shown = "non-null" in buf.getvalue()
        assert counts_shown is result

View File

@ -0,0 +1,548 @@
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas.errors import ChainedAssignmentError
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
NaT,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameInterpolate:
def test_interpolate_complex(self):
# GH#53635
ser = Series([complex("1+1j"), float("nan"), complex("2+2j")])
assert ser.dtype.kind == "c"
res = ser.interpolate()
expected = Series([ser[0], ser[0] * 1.5, ser[2]])
tm.assert_series_equal(res, expected)
df = ser.to_frame()
res = df.interpolate()
expected = expected.to_frame()
tm.assert_frame_equal(res, expected)
def test_interpolate_datetimelike_values(self, frame_or_series):
# GH#11312, GH#51005
orig = Series(date_range("2012-01-01", periods=5))
ser = orig.copy()
ser[2] = NaT
res = frame_or_series(ser).interpolate()
expected = frame_or_series(orig)
tm.assert_equal(res, expected)
# datetime64tz cast
ser_tz = ser.dt.tz_localize("US/Pacific")
res_tz = frame_or_series(ser_tz).interpolate()
expected_tz = frame_or_series(orig.dt.tz_localize("US/Pacific"))
tm.assert_equal(res_tz, expected_tz)
# timedelta64 cast
ser_td = ser - ser[0]
res_td = frame_or_series(ser_td).interpolate()
expected_td = frame_or_series(orig - orig[0])
tm.assert_equal(res_td, expected_td)
def test_interpolate_inplace(self, frame_or_series, using_array_manager, request):
# GH#44749
if using_array_manager and frame_or_series is DataFrame:
mark = pytest.mark.xfail(reason=".values-based in-place check is invalid")
request.applymarker(mark)
obj = frame_or_series([1, np.nan, 2])
orig = obj.values
obj.interpolate(inplace=True)
expected = frame_or_series([1, 1.5, 2])
tm.assert_equal(obj, expected)
# check we operated *actually* inplace
assert np.shares_memory(orig, obj.values)
assert orig.squeeze()[1] == 1.5
@pytest.mark.xfail(
using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
)
def test_interp_basic(self, using_copy_on_write):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
expected = DataFrame(
{
"A": [1.0, 2.0, 3.0, 4.0],
"B": [1.0, 4.0, 9.0, 9.0],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
msg = "DataFrame.interpolate with object dtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.interpolate()
tm.assert_frame_equal(result, expected)
# check we didn't operate inplace GH#45791
cvalues = df["C"]._values
dvalues = df["D"].values
if using_copy_on_write:
assert np.shares_memory(cvalues, result["C"]._values)
assert np.shares_memory(dvalues, result["D"]._values)
else:
assert not np.shares_memory(cvalues, result["C"]._values)
assert not np.shares_memory(dvalues, result["D"]._values)
with tm.assert_produces_warning(FutureWarning, match=msg):
res = df.interpolate(inplace=True)
assert res is None
tm.assert_frame_equal(df, expected)
# check we DID operate inplace
assert np.shares_memory(df["C"]._values, cvalues)
assert np.shares_memory(df["D"]._values, dvalues)
@pytest.mark.xfail(
using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
)
def test_interp_basic_with_non_range_index(self, using_infer_string):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
msg = "DataFrame.interpolate with object dtype"
warning = FutureWarning if not using_infer_string else None
with tm.assert_produces_warning(warning, match=msg):
result = df.set_index("C").interpolate()
expected = df.set_index("C")
expected.loc[3, "A"] = 3
expected.loc[5, "B"] = 9
tm.assert_frame_equal(result, expected)
def test_interp_empty(self):
# https://github.com/pandas-dev/pandas/issues/35598
df = DataFrame()
result = df.interpolate()
assert result is not df
expected = df
tm.assert_frame_equal(result, expected)
def test_interp_bad_method(self):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
}
)
msg = (
r"method must be one of \['linear', 'time', 'index', 'values', "
r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', "
r"'barycentric', 'krogh', 'spline', 'polynomial', "
r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', "
r"'cubicspline'\]. Got 'not_a_method' instead."
)
with pytest.raises(ValueError, match=msg):
df.interpolate(method="not_a_method")
def test_interp_combo(self):
df = DataFrame(
{
"A": [1.0, 2.0, np.nan, 4.0],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
result = df["A"].interpolate()
expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
tm.assert_series_equal(result, expected)
msg = "The 'downcast' keyword in Series.interpolate is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df["A"].interpolate(downcast="infer")
expected = Series([1, 2, 3, 4], name="A")
tm.assert_series_equal(result, expected)
def test_inerpolate_invalid_downcast(self):
# GH#53103
df = DataFrame(
{
"A": [1.0, 2.0, np.nan, 4.0],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
msg = "downcast must be either None or 'infer'"
msg2 = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
msg3 = "The 'downcast' keyword in Series.interpolate is deprecated"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
df.interpolate(downcast="int64")
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg3):
df["A"].interpolate(downcast="int64")
def test_interp_nan_idx(self):
df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]})
df = df.set_index("A")
msg = (
"Interpolation with NaNs in the index has not been implemented. "
"Try filling those NaNs before interpolating."
)
with pytest.raises(NotImplementedError, match=msg):
df.interpolate(method="values")
def test_interp_various(self):
pytest.importorskip("scipy")
df = DataFrame(
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
)
df = df.set_index("C")
expected = df.copy()
result = df.interpolate(method="polynomial", order=1)
expected.loc[3, "A"] = 2.66666667
expected.loc[13, "A"] = 5.76923076
tm.assert_frame_equal(result, expected)
result = df.interpolate(method="cubic")
# GH #15662.
expected.loc[3, "A"] = 2.81547781
expected.loc[13, "A"] = 5.52964175
tm.assert_frame_equal(result, expected)
result = df.interpolate(method="nearest")
expected.loc[3, "A"] = 2
expected.loc[13, "A"] = 5
tm.assert_frame_equal(result, expected, check_dtype=False)
result = df.interpolate(method="quadratic")
expected.loc[3, "A"] = 2.82150771
expected.loc[13, "A"] = 6.12648668
tm.assert_frame_equal(result, expected)
result = df.interpolate(method="slinear")
expected.loc[3, "A"] = 2.66666667
expected.loc[13, "A"] = 5.76923077
tm.assert_frame_equal(result, expected)
result = df.interpolate(method="zero")
expected.loc[3, "A"] = 2.0
expected.loc[13, "A"] = 5
tm.assert_frame_equal(result, expected, check_dtype=False)
def test_interp_alt_scipy(self):
pytest.importorskip("scipy")
df = DataFrame(
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
)
result = df.interpolate(method="barycentric")
expected = df.copy()
expected.loc[2, "A"] = 3
expected.loc[5, "A"] = 6
tm.assert_frame_equal(result, expected)
msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.interpolate(method="barycentric", downcast="infer")
tm.assert_frame_equal(result, expected.astype(np.int64))
result = df.interpolate(method="krogh")
expectedk = df.copy()
expectedk["A"] = expected["A"]
tm.assert_frame_equal(result, expectedk)
result = df.interpolate(method="pchip")
expected.loc[2, "A"] = 3
expected.loc[5, "A"] = 6.0
tm.assert_frame_equal(result, expected)
def test_interp_rowwise(self):
df = DataFrame(
{
0: [1, 2, np.nan, 4],
1: [2, 3, 4, np.nan],
2: [np.nan, 4, 5, 6],
3: [4, np.nan, 6, 7],
4: [1, 2, 3, 4],
}
)
result = df.interpolate(axis=1)
expected = df.copy()
expected.loc[3, 1] = 5
expected.loc[0, 2] = 3
expected.loc[1, 3] = 3
expected[4] = expected[4].astype(np.float64)
tm.assert_frame_equal(result, expected)
result = df.interpolate(axis=1, method="values")
tm.assert_frame_equal(result, expected)
result = df.interpolate(axis=0)
expected = df.interpolate()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"axis_name, axis_number",
[
pytest.param("rows", 0, id="rows_0"),
pytest.param("index", 0, id="index_0"),
pytest.param("columns", 1, id="columns_1"),
],
)
def test_interp_axis_names(self, axis_name, axis_number):
# GH 29132: test axis names
data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]}
df = DataFrame(data, dtype=np.float64)
result = df.interpolate(axis=axis_name, method="linear")
expected = df.interpolate(axis=axis_number, method="linear")
tm.assert_frame_equal(result, expected)
def test_rowwise_alt(self):
df = DataFrame(
{
0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
}
)
df.interpolate(axis=0)
# TODO: assert something?
@pytest.mark.parametrize(
"check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))]
)
def test_interp_leading_nans(self, check_scipy):
df = DataFrame(
{"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
)
result = df.interpolate()
expected = df.copy()
expected.loc[3, "B"] = -3.75
tm.assert_frame_equal(result, expected)
if check_scipy:
result = df.interpolate(method="polynomial", order=1)
tm.assert_frame_equal(result, expected)
def test_interp_raise_on_only_mixed(self, axis):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": ["a", "b", "c", "d"],
"C": [np.nan, 2, 5, 7],
"D": [np.nan, np.nan, 9, 9],
"E": [1, 2, 3, 4],
}
)
msg = (
"Cannot interpolate with all object-dtype columns "
"in the DataFrame. Try setting at least one "
"column to a numeric dtype."
)
with pytest.raises(TypeError, match=msg):
df.astype("object").interpolate(axis=axis)
def test_interp_raise_on_all_object_dtype(self):
# GH 22985
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
msg = (
"Cannot interpolate with all object-dtype columns "
"in the DataFrame. Try setting at least one "
"column to a numeric dtype."
)
with pytest.raises(TypeError, match=msg):
df.interpolate()
def test_interp_inplace(self, using_copy_on_write):
df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
expected_cow = df.copy()
result = df.copy()
if using_copy_on_write:
with tm.raises_chained_assignment_error():
return_value = result["a"].interpolate(inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected_cow)
else:
with tm.assert_produces_warning(FutureWarning, match="inplace method"):
return_value = result["a"].interpolate(inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
msg = "The 'downcast' keyword in Series.interpolate is deprecated"
if using_copy_on_write:
with tm.assert_produces_warning(
(FutureWarning, ChainedAssignmentError), match=msg
):
return_value = result["a"].interpolate(inplace=True, downcast="infer")
assert return_value is None
tm.assert_frame_equal(result, expected_cow)
else:
with tm.assert_produces_warning(FutureWarning, match=msg):
return_value = result["a"].interpolate(inplace=True, downcast="infer")
assert return_value is None
tm.assert_frame_equal(result, expected.astype("int64"))
def test_interp_inplace_row(self):
# GH 10395
result = DataFrame(
{"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
)
expected = result.interpolate(method="linear", axis=1, inplace=False)
return_value = result.interpolate(method="linear", axis=1, inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
def test_interp_ignore_all_good(self):
# GH
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 2, 3, 4],
"C": [1.0, 2.0, np.nan, 4.0],
"D": [1.0, 2.0, 3.0, 4.0],
}
)
expected = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="float64"),
"B": np.array([1, 2, 3, 4], dtype="int64"),
"C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
"D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
}
)
msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.interpolate(downcast=None)
tm.assert_frame_equal(result, expected)
# all good
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df[["B", "D"]].interpolate(downcast=None)
tm.assert_frame_equal(result, df[["B", "D"]])
def test_interp_time_inplace_axis(self):
# GH 9687
periods = 5
idx = date_range(start="2014-01-01", periods=periods)
data = np.random.default_rng(2).random((periods, periods))
data[data < 0.5] = np.nan
expected = DataFrame(index=idx, columns=idx, data=data)
result = expected.interpolate(axis=0, method="time")
return_value = expected.interpolate(axis=0, method="time", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)])
def test_interp_string_axis(self, axis_name, axis_number):
# https://github.com/pandas-dev/pandas/issues/25190
x = np.linspace(0, 100, 1000)
y = np.sin(x)
df = DataFrame(
data=np.tile(y, (10, 1)), index=np.arange(10), columns=x
).reindex(columns=x * 1.005)
result = df.interpolate(method="linear", axis=axis_name)
expected = df.interpolate(method="linear", axis=axis_number)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("multiblock", [True, False])
@pytest.mark.parametrize("method", ["ffill", "bfill", "pad"])
def test_interp_fillna_methods(
self, request, axis, multiblock, method, using_array_manager
):
# GH 12918
if using_array_manager and axis in (1, "columns"):
# TODO(ArrayManager) support axis=1
td.mark_array_manager_not_yet_implemented(request)
df = DataFrame(
{
"A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0],
"B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0],
"C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0],
}
)
if multiblock:
df["D"] = np.nan
df["E"] = 1.0
method2 = method if method != "pad" else "ffill"
expected = getattr(df, method2)(axis=axis)
msg = f"DataFrame.interpolate with method={method} is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.interpolate(method=method, axis=axis)
tm.assert_frame_equal(result, expected)
def test_interpolate_empty_df(self):
# GH#53199
df = DataFrame()
expected = df.copy()
result = df.interpolate(inplace=True)
assert result is None
tm.assert_frame_equal(df, expected)
def test_interpolate_ea(self, any_int_ea_dtype):
# GH#55347
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype)
orig = df.copy()
result = df.interpolate(limit=2)
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64")
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(df, orig)
@pytest.mark.parametrize(
"dtype",
[
"Float64",
"Float32",
pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_interpolate_ea_float(self, dtype):
# GH#55347
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype)
orig = df.copy()
result = df.interpolate(limit=2)
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(df, orig)
@pytest.mark.parametrize(
"dtype",
["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"],
)
def test_interpolate_arrow(self, dtype):
# GH#55347
pytest.importorskip("pyarrow")
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]")
result = df.interpolate(limit=2)
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]")
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,58 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
Categorical,
DataFrame,
)
# _is_homogeneous_type always returns True for ArrayManager
pytestmark = td.skip_array_manager_invalid_test
@pytest.mark.parametrize(
    "data, expected",
    [
        # no columns at all
        pytest.param(DataFrame(), True),
        # several columns sharing one numeric dtype
        pytest.param(DataFrame({"A": [1, 2], "B": [1, 2]}), True),
        # several object columns
        pytest.param(
            DataFrame(
                {
                    "A": np.array([1, 2], dtype=object),
                    "B": np.array(["a", "b"], dtype=object),
                },
                dtype="object",
            ),
            True,
        ),
        # several extension columns with the same categories
        pytest.param(
            DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["a", "b"])}),
            True,
        ),
        # int next to float
        pytest.param(DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
        # same kind, different item size
        pytest.param(
            DataFrame(
                {
                    "A": np.array([1, 2], dtype=np.int32),
                    "B": np.array([1, 2], dtype=np.int64),
                }
            ),
            False,
        ),
        # extension columns with different categories
        pytest.param(
            DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["b", "c"])}),
            False,
        ),
    ],
)
def test_is_homogeneous_type(data, expected):
    """``DataFrame._is_homogeneous_type`` is exactly ``expected`` for ``data``."""
    homogeneous = data._is_homogeneous_type
    assert homogeneous is expected

View File

@ -0,0 +1,50 @@
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameSetItem:
def test_isetitem_ea_df(self):
# GH#49922
df = DataFrame([[1, 2, 3], [4, 5, 6]])
rhs = DataFrame([[11, 12], [13, 14]], dtype="Int64")
df.isetitem([0, 1], rhs)
expected = DataFrame(
{
0: Series([11, 13], dtype="Int64"),
1: Series([12, 14], dtype="Int64"),
2: [3, 6],
}
)
tm.assert_frame_equal(df, expected)
def test_isetitem_ea_df_scalar_indexer(self):
# GH#49922
df = DataFrame([[1, 2, 3], [4, 5, 6]])
rhs = DataFrame([[11], [13]], dtype="Int64")
df.isetitem(2, rhs)
expected = DataFrame(
{
0: [1, 4],
1: [2, 5],
2: Series([11, 13], dtype="Int64"),
}
)
tm.assert_frame_equal(df, expected)
def test_isetitem_dimension_mismatch(self):
# GH#51701
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
value = df.copy()
with pytest.raises(ValueError, match="Got 2 positions but value has 3 columns"):
df.isetitem([1, 2], value)
value = df.copy()
with pytest.raises(ValueError, match="Got 2 positions but value has 1 columns"):
df.isetitem([1, 2], value[["a"]])

View File

@ -0,0 +1,227 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
MultiIndex,
Series,
)
import pandas._testing as tm
class TestDataFrameIsIn:
def test_isin(self):
# GH#4211
df = DataFrame(
{
"vals": [1, 2, 3, 4],
"ids": ["a", "b", "f", "n"],
"ids2": ["a", "n", "c", "n"],
},
index=["foo", "bar", "baz", "qux"],
)
other = ["a", "b", "c"]
result = df.isin(other)
expected = DataFrame([df.loc[s].isin(other) for s in df.index])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
def test_isin_empty(self, empty):
# GH#16991
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
expected = DataFrame(False, df.index, df.columns)
result = df.isin(empty)
tm.assert_frame_equal(result, expected)
def test_isin_dict(self):
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
d = {"A": ["a"]}
expected = DataFrame(False, df.index, df.columns)
expected.loc[0, "A"] = True
result = df.isin(d)
tm.assert_frame_equal(result, expected)
# non unique columns
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
df.columns = ["A", "A"]
expected = DataFrame(False, df.index, df.columns)
expected.loc[0, "A"] = True
result = df.isin(d)
tm.assert_frame_equal(result, expected)
def test_isin_with_string_scalar(self):
# GH#4763
df = DataFrame(
{
"vals": [1, 2, 3, 4],
"ids": ["a", "b", "f", "n"],
"ids2": ["a", "n", "c", "n"],
},
index=["foo", "bar", "baz", "qux"],
)
msg = (
r"only list-like or dict-like objects are allowed "
r"to be passed to DataFrame.isin\(\), you passed a 'str'"
)
with pytest.raises(TypeError, match=msg):
df.isin("a")
with pytest.raises(TypeError, match=msg):
df.isin("aaa")
def test_isin_df(self):
df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
expected = DataFrame(False, df1.index, df1.columns)
result = df1.isin(df2)
expected.loc[[1, 3], "A"] = True
expected.loc[[0, 2], "B"] = True
tm.assert_frame_equal(result, expected)
# partial overlapping columns
df2.columns = ["A", "C"]
result = df1.isin(df2)
expected["B"] = False
tm.assert_frame_equal(result, expected)
def test_isin_tuples(self):
# GH#16394
df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]})
df["C"] = list(zip(df["A"], df["B"]))
result = df["C"].isin([(1, "a")])
tm.assert_series_equal(result, Series([True, False, False], name="C"))
def test_isin_df_dupe_values(self):
df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
# just cols duped
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"])
msg = r"cannot compute isin with a duplicate axis\."
with pytest.raises(ValueError, match=msg):
df1.isin(df2)
# just index duped
df2 = DataFrame(
[[0, 2], [12, 4], [2, np.nan], [4, 5]],
columns=["A", "B"],
index=[0, 0, 1, 1],
)
with pytest.raises(ValueError, match=msg):
df1.isin(df2)
# cols and index:
df2.columns = ["B", "B"]
with pytest.raises(ValueError, match=msg):
df1.isin(df2)
def test_isin_dupe_self(self):
other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]})
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"])
result = df.isin(other)
expected = DataFrame(False, index=df.index, columns=df.columns)
expected.loc[0] = True
expected.iloc[1, 1] = True
tm.assert_frame_equal(result, expected)
def test_isin_against_series(self):
df = DataFrame(
{"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"]
)
s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
expected = DataFrame(False, index=df.index, columns=df.columns)
expected.loc["a", "A"] = True
expected.loc["d"] = True
result = df.isin(s)
tm.assert_frame_equal(result, expected)
def test_isin_multiIndex(self):
idx = MultiIndex.from_tuples(
[
(0, "a", "foo"),
(0, "a", "bar"),
(0, "b", "bar"),
(0, "b", "baz"),
(2, "a", "foo"),
(2, "a", "bar"),
(2, "c", "bar"),
(2, "c", "baz"),
(1, "b", "foo"),
(1, "b", "bar"),
(1, "c", "bar"),
(1, "c", "baz"),
]
)
df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx)
df2 = DataFrame(
{
"A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
"B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
}
)
# against regular index
expected = DataFrame(False, index=df1.index, columns=df1.columns)
result = df1.isin(df2)
tm.assert_frame_equal(result, expected)
df2.index = idx
expected = df2.values.astype(bool)
expected[:, 1] = ~expected[:, 1]
expected = DataFrame(expected, columns=["A", "B"], index=idx)
result = df1.isin(df2)
tm.assert_frame_equal(result, expected)
def test_isin_empty_datetimelike(self):
# GH#15473
df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])})
df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]})
df2 = DataFrame({"date": []})
df3 = DataFrame()
expected = DataFrame({"date": [False, False]})
result = df1_ts.isin(df2)
tm.assert_frame_equal(result, expected)
result = df1_ts.isin(df3)
tm.assert_frame_equal(result, expected)
result = df1_td.isin(df2)
tm.assert_frame_equal(result, expected)
result = df1_td.isin(df3)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
DataFrame({"a": [1, 2, 3]}, dtype="category"),
Series([1, 2, 3], dtype="category"),
],
)
def test_isin_category_frame(self, values):
# GH#34256
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
expected = DataFrame({"a": [True, True, True], "b": [False, False, False]})
result = df.isin(values)
tm.assert_frame_equal(result, expected)
def test_isin_read_only(self):
# https://github.com/pandas-dev/pandas/issues/37174
arr = np.array([1, 2, 3])
arr.setflags(write=False)
df = DataFrame([1, 2, 3])
result = df.isin(arr)
expected = DataFrame([True, True, True])
tm.assert_frame_equal(result, expected)
def test_isin_not_lossy(self):
# GH 53514
val = 1666880195890293744
df = DataFrame({"a": [val], "b": [1.0]})
result = df.isin([val])
expected = DataFrame({"a": [True], "b": [False]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,16 @@
from pandas import (
DataFrame,
Timedelta,
)
def test_no_overflow_of_freq_and_time_in_dataframe():
    """Rows holding a string and a Timedelta come out of ``iterrows`` as object.

    Regression test for GH 35665.
    """
    frame = DataFrame(
        {
            "some_string": ["2222Y3"],
            "time": [Timedelta("0 days 00:00:00.990000")],
        }
    )
    row_dtypes = [row.dtype for _, row in frame.iterrows()]
    assert all(dtype == "object" for dtype in row_dtypes)

View File

@ -0,0 +1,576 @@
from datetime import datetime
import numpy as np
import pytest
from pandas.errors import MergeError
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.reshape.concat import concat
@pytest.fixture
def frame_with_period_index():
    """4x5 frame of 0..19, columns ``a``-``e``, yearly PeriodIndex from 2000."""
    yearly = period_range(start="2000", freq="Y", periods=4)
    values = np.arange(20).reshape(4, 5)
    return DataFrame(data=values, columns=list("abcde"), index=yearly)
@pytest.fixture
def left():
    """Single-column frame ``a`` = 20, 10, 0 indexed by 2, 1, 0."""
    labels = [2, 1, 0]
    values = [20, 10, 0]
    return DataFrame({"a": values}, index=labels)
@pytest.fixture
def right():
    """Single-column frame ``b`` = 300, 100, 200 indexed by 3, 1, 2."""
    labels = [3, 1, 2]
    values = [300, 100, 200]
    return DataFrame({"b": values}, index=labels)
@pytest.fixture
def left_no_dup():
    """Left-hand frame whose key column ``a`` holds four distinct values."""
    columns = {
        "a": ["a", "b", "c", "d"],
        "b": ["cat", "dog", "weasel", "horse"],
    }
    return DataFrame(columns, index=range(4))
@pytest.fixture
def right_no_dup():
    """Right-hand frame indexed by five distinct ``a`` keys, one column ``c``."""
    keys = ["a", "b", "c", "d", "e"]
    sounds = ["meow", "bark", "um... weasel noise?", "nay", "chirp"]
    unindexed = DataFrame({"a": keys, "c": sounds}, index=range(5))
    return unindexed.set_index("a")
@pytest.fixture
def left_w_dups(left_no_dup):
    """``left_no_dup`` plus one row that repeats key ``"a"`` (index label 3)."""
    repeated_key = DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])
    return concat([left_no_dup, repeated_key], sort=True)
@pytest.fixture
def right_w_dups(right_no_dup):
    """``right_no_dup`` with one appended row, re-indexed on column ``a``."""
    appended_row = DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])
    stacked = concat([right_no_dup, appended_row])
    return stacked.set_index("a")
@pytest.mark.parametrize(
"how, sort, expected",
[
("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
(
"left",
False,
DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
),
(
"left",
True,
DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
),
(
"right",
False,
DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
),
(
"right",
True,
DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
),
(
"outer",
False,
DataFrame(
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3],
),
),
(
"outer",
True,
DataFrame(
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3],
),
),
],
)
def test_join(left, right, how, sort, expected):
result = left.join(right, how=how, sort=sort, validate="1:1")
tm.assert_frame_equal(result, expected)
def test_suffix_on_list_join():
    """Suffix arguments are rejected when joining a list of frames."""
    first = DataFrame({"key": [1, 2, 3, 4, 5]})
    second = DataFrame({"key": [1, 8, 3, 2, 5], "v1": [1, 2, 3, 4, 5]})
    third = DataFrame({"keys": [5, 2, 3, 4, 1], "v2": [1, 2, 3, 4, 5]})

    # check proper errors are raised
    msg = "Suffixes not supported when joining multiple DataFrames"
    rejected_calls = [
        ([second], {"lsuffix": "y"}),
        ([second, third], {"rsuffix": "x"}),
        ([second, third], {"lsuffix": "y", "rsuffix": "x"}),
    ]
    for others, suffix_kwargs in rejected_calls:
        with pytest.raises(ValueError, match=msg):
            first.join(others, **suffix_kwargs)

    # "key" exists in both ``first`` and ``second`` and no suffix is allowed
    with pytest.raises(ValueError, match="Indexes have overlapping values"):
        first.join([second, third])

    # no errors should be raised
    from_list = first.join([third])
    from_frame = first.join(third)
    tm.assert_frame_equal(from_list, from_frame)
def test_join_invalid_validate(left_no_dup, right_no_dup):
# GH 46622
# Check invalid arguments
msg = (
'"invalid" is not a valid argument. '
"Valid arguments are:\n"
'- "1:1"\n'
'- "1:m"\n'
'- "m:1"\n'
'- "m:m"\n'
'- "one_to_one"\n'
'- "one_to_many"\n'
'- "many_to_one"\n'
'- "many_to_many"'
)
with pytest.raises(ValueError, match=msg):
left_no_dup.merge(right_no_dup, on="a", validate="invalid")
@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"])
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype):
# GH 46622
# Dups on right allowed by one_to_many constraint
if dtype == "string[pyarrow]":
pytest.importorskip("pyarrow")
left_no_dup = left_no_dup.astype(dtype)
right_w_dups.index = right_w_dups.index.astype(dtype)
left_no_dup.join(
right_w_dups,
on="a",
validate="one_to_many",
)
# Dups on right not allowed by one_to_one constraint
msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
with pytest.raises(MergeError, match=msg):
left_no_dup.join(
right_w_dups,
on="a",
validate="one_to_one",
)
def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup):
# GH 46622
# Dups on left allowed by many_to_one constraint
left_w_dups.join(
right_no_dup,
on="a",
validate="many_to_one",
)
# Dups on left not allowed by one_to_one constraint
msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
with pytest.raises(MergeError, match=msg):
left_w_dups.join(
right_no_dup,
on="a",
validate="one_to_one",
)
def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups):
# GH 46622
# Dups on both allowed by many_to_many constraint
left_w_dups.join(right_w_dups, on="a", validate="many_to_many")
# Dups on both not allowed by many_to_one constraint
msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
with pytest.raises(MergeError, match=msg):
left_w_dups.join(
right_w_dups,
on="a",
validate="many_to_one",
)
# Dups on both not allowed by one_to_many constraint
msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
with pytest.raises(MergeError, match=msg):
left_w_dups.join(
right_w_dups,
on="a",
validate="one_to_many",
)
def test_join_on_multi_col_check_dup():
    """Keys that are unique only as an (a, b) pair satisfy ``validate="1:1"``.

    GH 46622. Each key column repeats values on its own, but the two-level
    index built from both columns has no repeated pair.
    """
    key_levels = ["a", "b"]
    noises = ["meow", "bark", "um... weasel noise?"]

    left = DataFrame(
        {
            "a": ["a", "a", "b", "b"],
            "b": [0, 1, 0, 1],
            "c": ["cat", "dog", "weasel", "horse"],
        },
        index=range(4),
    ).set_index(key_levels)

    right = DataFrame(
        {"a": ["a", "a", "b"], "b": [0, 1, 0], "d": noises},
        index=range(3),
    ).set_index(key_levels)

    expected_multi = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "c": ["cat", "dog", "weasel"],
            "d": noises,
        },
        index=range(3),
    ).set_index(key_levels)

    # Jointly no dups allowed by one_to_one constraint
    joined = left.join(right, how="inner", validate="1:1")
    tm.assert_frame_equal(joined, expected_multi)
def test_join_index(float_frame):
# left / right
f = float_frame.loc[float_frame.index[:10], ["A", "B"]]
f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]
joined = f.join(f2)
tm.assert_index_equal(f.index, joined.index)
expected_columns = Index(["A", "B", "C", "D"])
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how="left")
tm.assert_index_equal(joined.index, f.index)
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how="right")
tm.assert_index_equal(joined.index, f2.index)
tm.assert_index_equal(joined.columns, expected_columns)
# inner
joined = f.join(f2, how="inner")
tm.assert_index_equal(joined.index, f.index[5:10])
tm.assert_index_equal(joined.columns, expected_columns)
# outer
joined = f.join(f2, how="outer")
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)
with pytest.raises(ValueError, match="join method"):
f.join(f2, how="foo")
# corner case - overlapping columns
msg = "columns overlap but no suffix"
for how in ("outer", "left", "inner"):
with pytest.raises(ValueError, match=msg):
float_frame.join(float_frame, how=how)
def test_join_index_more(float_frame):
af = float_frame.loc[:, ["A", "B"]]
bf = float_frame.loc[::2, ["C", "D"]]
expected = af.copy()
expected["C"] = float_frame["C"][::2]
expected["D"] = float_frame["D"][::2]
result = af.join(bf)
tm.assert_frame_equal(result, expected)
result = af.join(bf, how="right")
tm.assert_frame_equal(result, expected[::2])
result = bf.join(af, how="right")
tm.assert_frame_equal(result, expected.loc[:, result.columns])
def test_join_index_series(float_frame):
df = float_frame.copy()
ser = df.pop(float_frame.columns[-1])
joined = df.join(ser)
tm.assert_frame_equal(joined, float_frame)
ser.name = None
with pytest.raises(ValueError, match="must have a name"):
df.join(ser)
def test_join_overlap(float_frame):
df1 = float_frame.loc[:, ["A", "B", "C"]]
df2 = float_frame.loc[:, ["B", "C", "D"]]
joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")
df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")
no_overlap = float_frame.loc[:, ["A", "D"]]
expected = df1_suf.join(df2_suf).join(no_overlap)
# column order not necessarily sorted
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
def test_join_period_index(frame_with_period_index):
other = frame_with_period_index.rename(columns=lambda key: f"{key}{key}")
joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1)
joined_cols = frame_with_period_index.columns.append(other.columns)
joined = frame_with_period_index.join(other)
expected = DataFrame(
data=joined_values, columns=joined_cols, index=frame_with_period_index.index
)
tm.assert_frame_equal(joined, expected)
def test_join_left_sequence_non_unique_index():
    """Left-join a list of frames where one of them repeats an index label."""
    # https://github.com/pandas-dev/pandas/issues/19607
    base = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
    others = [
        DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2]),
        DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4]),
    ]

    result = base.join(others, how="left")

    nan = np.nan
    expected = DataFrame(
        {
            "a": [0, 10, 10, 20],
            "b": [nan, 300, 300, 200],
            "c": [nan, 400, 500, nan],
        },
        index=[1, 2, 2, 3],
    )
    tm.assert_frame_equal(result, expected)
def test_join_list_series(float_frame):
# GH#46850
# Join a DataFrame with a list containing both a Series and a DataFrame
left = float_frame.A.to_frame()
right = [float_frame.B, float_frame[["C", "D"]]]
result = left.join(right)
tm.assert_frame_equal(result, float_frame)
@pytest.mark.parametrize("sort_kw", [True, False])
def test_suppress_future_warning_with_sort_kw(sort_kw):
    """Outer-joining a list of frames with an explicit ``sort`` emits no warning."""
    nan = float("nan")
    left = DataFrame({"col1": [1, 2]}, index=["c", "a"])
    others = [
        DataFrame({"col2": [4, 5]}, index=["b", "a"]),
        DataFrame({"col3": [7, 8]}, index=["a", "b"]),
    ]

    expected = DataFrame(
        {
            "col1": {"a": 2.0, "b": nan, "c": 1.0},
            "col2": {"a": 5.0, "b": 4.0, "c": nan},
            "col3": {"a": 7.0, "b": 8.0, "c": nan},
        }
    )
    if sort_kw is False:
        # unsorted result is expected in this row order
        expected = expected.reindex(index=["c", "a", "b"])

    with tm.assert_produces_warning(None):
        result = left.join(others, how="outer", sort=sort_kw)
    tm.assert_frame_equal(result, expected)
class TestDataFrameJoin:
def test_join(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
a = frame.loc[frame.index[:5], ["A"]]
b = frame.loc[frame.index[2:], ["B", "C"]]
joined = a.join(b, how="outer").reindex(frame.index)
expected = frame.copy().values.copy()
expected[np.isnan(joined.values)] = np.nan
expected = DataFrame(expected, index=frame.index, columns=frame.columns)
assert not np.isnan(joined.values).all()
tm.assert_frame_equal(joined, expected)
def test_join_segfault(self):
# GH#1532
df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
df1 = df1.set_index(["a", "b"])
df2 = df2.set_index(["a", "b"])
# it works!
for how in ["left", "right", "outer"]:
df1.join(df2, how=how)
def test_join_str_datetime(self):
str_dates = ["20120209", "20120222"]
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
A = DataFrame(str_dates, index=range(2), columns=["aa"])
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
tst = A.join(C, on="aa")
assert len(tst.columns) == 3
def test_join_multiindex_leftright(self):
# GH 10741
df1 = DataFrame(
[
["a", "x", 0.471780],
["a", "y", 0.774908],
["a", "z", 0.563634],
["b", "x", -0.353756],
["b", "y", 0.368062],
["b", "z", -1.721840],
["c", "x", 1],
["c", "y", 2],
["c", "z", 3],
],
columns=["first", "second", "value1"],
).set_index(["first", "second"])
df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index(
["first"]
)
exp = DataFrame(
[
[0.471780, 10],
[0.774908, 10],
[0.563634, 10],
[-0.353756, 20],
[0.368062, 20],
[-1.721840, 20],
[1.000000, np.nan],
[2.000000, np.nan],
[3.000000, np.nan],
],
index=df1.index,
columns=["value1", "value2"],
)
# these must be the same results (but columns are flipped)
tm.assert_frame_equal(df1.join(df2, how="left"), exp)
tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])
exp_idx = MultiIndex.from_product(
[["a", "b"], ["x", "y", "z"]], names=["first", "second"]
)
exp = DataFrame(
[
[0.471780, 10],
[0.774908, 10],
[0.563634, 10],
[-0.353756, 20],
[0.368062, 20],
[-1.721840, 20],
],
index=exp_idx,
columns=["value1", "value2"],
)
tm.assert_frame_equal(df1.join(df2, how="right"), exp)
tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])
def test_join_multiindex_dates(self):
# GH 33692
date = pd.Timestamp(2000, 1, 1).date()
df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
df1 = DataFrame({"col1": [0]}, index=df1_index)
df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
df2 = DataFrame({"col2": [0]}, index=df2_index)
df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
df3 = DataFrame({"col3": [0]}, index=df3_index)
result = df1.join([df2, df3])
expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
expected = DataFrame(
{"col1": [0], "col2": [0], "col3": [0]}, index=expected_index
)
tm.assert_equal(result, expected)
def test_merge_join_different_levels_raises(self):
# GH#9455
# GH 40993: For raising, enforced in 2.0
# first dataframe
df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])
# second dataframe
columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")])
df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])
# merge
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
pd.merge(df1, df2, on="a")
# join, see discussion in GH#12219
with pytest.raises(
MergeError, match="Not allowed to merge between different levels"
):
df1.join(df2, on="a")
def test_frame_join_tzaware(self):
test1 = DataFrame(
np.zeros((6, 3)),
index=date_range(
"2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central"
),
)
test2 = DataFrame(
np.zeros((3, 3)),
index=date_range(
"2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central"
),
columns=range(3, 6),
)
result = test1.join(test2, how="outer")
expected = test1.index.union(test2.index)
tm.assert_index_equal(result.index, expected)
assert result.index.tz.zone == "US/Central"

View File

@ -0,0 +1,216 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tseries.offsets import BDay
def test_map(float_frame):
result = float_frame.map(lambda x: x * 2)
tm.assert_frame_equal(result, float_frame * 2)
float_frame.map(type)
# GH 465: function returning tuples
result = float_frame.map(lambda x: (x, x))["A"].iloc[0]
assert isinstance(result, tuple)
@pytest.mark.parametrize("val", [1, 1.0])
def test_map_float_object_conversion(val):
# GH 2909: object conversion to float in constructor?
df = DataFrame(data=[val, "a"])
result = df.map(lambda x: x).dtypes[0]
assert result == object
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map_keeps_dtype(na_action):
# GH52219
arr = Series(["a", np.nan, "b"])
sparse_arr = arr.astype(pd.SparseDtype(object))
df = DataFrame(data={"a": arr, "b": sparse_arr})
def func(x):
return str.upper(x) if not pd.isna(x) else x
result = df.map(func, na_action=na_action)
expected_sparse = pd.array(["A", np.nan, "B"], dtype=pd.SparseDtype(object))
expected_arr = expected_sparse.astype(object)
expected = DataFrame({"a": expected_arr, "b": expected_sparse})
tm.assert_frame_equal(result, expected)
result_empty = df.iloc[:0, :].map(func, na_action=na_action)
expected_empty = expected.iloc[:0, :]
tm.assert_frame_equal(result_empty, expected_empty)
def test_map_str():
# GH 2786
df = DataFrame(np.random.default_rng(2).random((3, 4)))
df2 = df.copy()
cols = ["a", "a", "a", "a"]
df.columns = cols
expected = df2.map(str)
expected.columns = cols
result = df.map(str)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"col, val",
[["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]],
)
def test_map_datetimelike(col, val):
# datetime/timedelta
df = DataFrame(np.random.default_rng(2).random((3, 4)))
df[col] = val
result = df.map(str)
assert result.loc[0, col] == str(df.loc[0, col])
@pytest.mark.parametrize(
"expected",
[
DataFrame(),
DataFrame(columns=list("ABC")),
DataFrame(index=list("ABC")),
DataFrame({"A": [], "B": [], "C": []}),
],
)
@pytest.mark.parametrize("func", [round, lambda x: x])
def test_map_empty(expected, func):
# GH 8222
result = expected.map(func)
tm.assert_frame_equal(result, expected)
def test_map_kwargs():
# GH 40652
result = DataFrame([[1, 2], [3, 4]]).map(lambda x, y: x + y, y=2)
expected = DataFrame([[3, 4], [5, 6]])
tm.assert_frame_equal(result, expected)
def test_map_na_ignore(float_frame):
# GH 23803
strlen_frame = float_frame.map(lambda x: len(str(x)))
float_frame_with_na = float_frame.copy()
mask = np.random.default_rng(2).integers(0, 2, size=float_frame.shape, dtype=bool)
float_frame_with_na[mask] = pd.NA
strlen_frame_na_ignore = float_frame_with_na.map(
lambda x: len(str(x)), na_action="ignore"
)
# Set float64 type to avoid upcast when setting NA below
strlen_frame_with_na = strlen_frame.copy().astype("float64")
strlen_frame_with_na[mask] = pd.NA
tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na)
def test_map_box_timestamps():
# GH 2689, GH 2627
ser = Series(date_range("1/1/2000", periods=10))
def func(x):
return (x.hour, x.day, x.month)
# it works!
DataFrame(ser).map(func)
def test_map_box():
# ufunc will not be boxed. Same test cases as the test_map_box
df = DataFrame(
{
"a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")],
"b": [
Timestamp("2011-01-01", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
],
"c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")],
"d": [
pd.Period("2011-01-01", freq="M"),
pd.Period("2011-01-02", freq="M"),
],
}
)
result = df.map(lambda x: type(x).__name__)
expected = DataFrame(
{
"a": ["Timestamp", "Timestamp"],
"b": ["Timestamp", "Timestamp"],
"c": ["Timedelta", "Timedelta"],
"d": ["Period", "Period"],
}
)
tm.assert_frame_equal(result, expected)
def test_frame_map_dont_convert_datetime64():
df = DataFrame({"x1": [datetime(1996, 1, 1)]})
df = df.map(lambda x: x + BDay())
df = df.map(lambda x: x + BDay())
result = df.x1.dtype
assert result == "M8[ns]"
def test_map_function_runs_once():
df = DataFrame({"a": [1, 2, 3]})
values = [] # Save values function is applied to
def reducing_function(val):
values.append(val)
def non_reducing_function(val):
values.append(val)
return val
for func in [reducing_function, non_reducing_function]:
del values[:]
df.map(func)
assert values == df.a.to_list()
def test_map_type():
# GH 46719
df = DataFrame(
{"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]},
index=["a", "b", "c"],
)
result = df.map(type)
expected = DataFrame(
{"col1": [int, str, type], "col2": [float, datetime, float]},
index=["a", "b", "c"],
)
tm.assert_frame_equal(result, expected)
def test_map_invalid_na_action(float_frame):
# GH 23803
with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"):
float_frame.map(lambda x: len(str(x)), na_action="abc")
def test_applymap_deprecated():
# GH52353
df = DataFrame({"a": [1, 2, 3]})
msg = "DataFrame.applymap has been deprecated. Use DataFrame.map instead."
with tm.assert_produces_warning(FutureWarning, match=msg):
df.applymap(lambda x: x)

View File

@ -0,0 +1,98 @@
import operator
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
class TestMatMul:
    """Tests for the ``@`` operator with a DataFrame operand."""

    def test_matmul(self):
        """Multiply frames, series, arrays and nested lists with a DataFrame."""
        # matmul test is for GH#10259
        row_labels = ["a", "b", "c"]
        inner_labels = ["p", "q", "r", "s"]
        out_labels = ["one", "two"]
        a = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=row_labels,
            columns=inner_labels,
        )
        b = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)),
            index=inner_labels,
            columns=out_labels,
        )

        def labelled_product(lhs):
            # plain numpy product of the raw values, relabelled like the result
            return DataFrame(
                np.dot(lhs.values, b.values), index=row_labels, columns=out_labels
            )

        # DataFrame @ DataFrame
        result = operator.matmul(a, b)
        tm.assert_frame_equal(result, labelled_product(a))

        # DataFrame @ Series
        result = operator.matmul(a, b.one)
        expected = Series(np.dot(a.values, b.one.values), index=row_labels)
        tm.assert_series_equal(result, expected)

        # np.array @ DataFrame
        result = operator.matmul(a.values, b)
        assert isinstance(result, DataFrame)
        assert result.columns.equals(b.columns)
        assert result.index.equals(Index(range(3)))
        tm.assert_almost_equal(result.values, np.dot(a.values, b.values))

        # nested list @ DataFrame (__rmatmul__)
        result = operator.matmul(a.values.tolist(), b)
        tm.assert_almost_equal(result.values, labelled_product(a).values)

        # mixed dtype DataFrame @ DataFrame
        a["q"] = a.q.round().astype(int)
        result = operator.matmul(a, b)
        tm.assert_frame_equal(result, labelled_product(a))

        # different dtypes DataFrame @ DataFrame
        a = a.astype(int)
        result = operator.matmul(a, b)
        tm.assert_frame_equal(result, labelled_product(a))

        # unaligned
        lhs = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=[1, 2, 3],
            columns=range(4),
        )
        rhs = DataFrame(
            np.random.default_rng(2).standard_normal((5, 3)),
            index=range(5),
            columns=[1, 2, 3],
        )
        with pytest.raises(ValueError, match="aligned"):
            operator.matmul(lhs, rhs)

    def test_matmul_message_shapes(self):
        """The shape-mismatch message quotes the operands' original shapes."""
        # GH#21581 exception message should reflect original shapes,
        # not transposed shapes
        tall = np.random.default_rng(2).random((10, 4))
        frame = DataFrame(np.random.default_rng(2).random((5, 3)))

        msg = r"shapes \(10, 4\) and \(5, 3\) not aligned"
        with pytest.raises(ValueError, match=msg):
            tall @ frame
        with pytest.raises(ValueError, match=msg):
            tall.tolist() @ frame

View File

@ -0,0 +1,250 @@
"""
Note: for naming purposes, most tests are titled e.g. "test_nlargest_foo"
but are implicitly also testing nsmallest_foo.
"""
from string import ascii_lowercase
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.util.version import Version
@pytest.fixture
def df_duplicates():
    """Five-row integer frame whose index labels repeat (0, 0, 1, 1, 1)."""
    columns = {
        "a": [1, 2, 3, 4, 4],
        "b": [1, 1, 1, 1, 1],
        "c": [0, 1, 2, 5, 4],
    }
    return pd.DataFrame(columns, index=[0, 0, 1, 1, 1])
@pytest.fixture
def df_strings():
    """Ten-row frame with an int, a single-letter string and a float column."""
    # Both numeric columns come from identically seeded generators, so they
    # hold the same permutation of 0..9.
    int_values = np.random.default_rng(2).permutation(10)
    float_values = np.random.default_rng(2).permutation(10).astype("float64")
    letters = list(ascii_lowercase[:10])
    return pd.DataFrame({"a": int_values, "b": letters, "c": float_values})
@pytest.fixture
def df_main_dtypes():
    """Three-row frame with one column per dtype exercised by the tests."""
    data = {
        "group": [1, 1, 2],
        "int": [1, 2, 3],
        "float": [4.0, 5.0, 6.0],
        "string": list("abc"),
        "category_string": pd.Series(list("abc")).astype("category"),
        "category_int": [7, 8, 9],
        "datetime": pd.date_range("20130101", periods=3),
        "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
        "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
    }
    # Pass the column order explicitly; it is the insertion order used above.
    return pd.DataFrame(data, columns=list(data))
class TestNLargestNSmallest:
# ----------------------------------------------------------------------
# Top / bottom
@pytest.mark.parametrize(
"order",
[
["a"],
["c"],
["a", "b"],
["a", "c"],
["b", "a"],
["b", "c"],
["a", "b", "c"],
["c", "a", "b"],
["c", "b", "a"],
["b", "c", "a"],
["b", "a", "c"],
# dups!
["b", "c", "c"],
],
)
@pytest.mark.parametrize("n", range(1, 11))
def test_nlargest_n(self, df_strings, nselect_method, n, order):
# GH#10393
df = df_strings
if "b" in order:
error_msg = (
f"Column 'b' has dtype (object|string), "
f"cannot use method '{nselect_method}' with this dtype"
)
with pytest.raises(TypeError, match=error_msg):
getattr(df, nselect_method)(n, order)
else:
ascending = nselect_method == "nsmallest"
result = getattr(df, nselect_method)(n, order)
expected = df.sort_values(order, ascending=ascending).head(n)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"columns", [["group", "category_string"], ["group", "string"]]
)
def test_nlargest_error(self, df_main_dtypes, nselect_method, columns):
df = df_main_dtypes
col = columns[1]
error_msg = (
f"Column '{col}' has dtype {df[col].dtype}, "
f"cannot use method '{nselect_method}' with this dtype"
)
# escape some characters that may be in the repr
error_msg = (
error_msg.replace("(", "\\(")
.replace(")", "\\)")
.replace("[", "\\[")
.replace("]", "\\]")
)
with pytest.raises(TypeError, match=error_msg):
getattr(df, nselect_method)(2, columns)
def test_nlargest_all_dtypes(self, df_main_dtypes):
df = df_main_dtypes
df.nsmallest(2, list(set(df) - {"category_string", "string"}))
df.nlargest(2, list(set(df) - {"category_string", "string"}))
def test_nlargest_duplicates_on_starter_columns(self):
# regression test for GH#22752
df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]})
result = df.nlargest(4, columns=["a", "b"])
expected = pd.DataFrame(
{"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3]
)
tm.assert_frame_equal(result, expected)
result = df.nsmallest(4, columns=["a", "b"])
expected = pd.DataFrame(
{"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0]
)
tm.assert_frame_equal(result, expected)
def test_nlargest_n_identical_values(self):
# GH#15297
df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]})
result = df.nlargest(3, "a")
expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2])
tm.assert_frame_equal(result, expected)
result = df.nsmallest(3, "a")
expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"order",
[["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]],
)
@pytest.mark.parametrize("n", range(1, 6))
def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request):
# GH#13412
df = df_duplicates
result = df.nsmallest(n, order)
expected = df.sort_values(order).head(n)
tm.assert_frame_equal(result, expected)
result = df.nlargest(n, order)
expected = df.sort_values(order, ascending=False).head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5
):
request.applymarker(
pytest.mark.xfail(
reason=(
"pandas default unstable sorting of duplicates"
"issue with numpy>=1.25 with AVX instructions"
),
strict=False,
)
)
tm.assert_frame_equal(result, expected)
def test_nlargest_duplicate_keep_all_ties(self):
# GH#16818
df = pd.DataFrame(
{"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]}
)
result = df.nlargest(4, "a", keep="all")
expected = pd.DataFrame(
{
"a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3},
"b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20},
}
)
tm.assert_frame_equal(result, expected)
result = df.nsmallest(2, "a", keep="all")
expected = pd.DataFrame(
{
"a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
"b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20},
}
)
tm.assert_frame_equal(result, expected)
def test_nlargest_multiindex_column_lookup(self):
# Check whether tuples are correctly treated as multi-level lookups.
# GH#23033
df = pd.DataFrame(
columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]),
data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]],
)
# nsmallest
result = df.nsmallest(3, ("x", "a"))
expected = df.iloc[[2, 0, 3]]
tm.assert_frame_equal(result, expected)
# nlargest
result = df.nlargest(3, ("x", "b"))
expected = df.iloc[[3, 2, 1]]
tm.assert_frame_equal(result, expected)
def test_nlargest_nan(self):
# GH#43060
df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3])
result = df.nlargest(5, 0)
expected = df.sort_values(0, ascending=False).head(5)
tm.assert_frame_equal(result, expected)
def test_nsmallest_nan_after_n_element(self):
# GH#46589
df = pd.DataFrame(
{
"a": [1, 2, 3, 4, 5, None, 7],
"b": [7, 6, 5, 4, 3, 2, 1],
"c": [1, 1, 2, 2, 3, 3, 3],
},
index=range(7),
)
result = df.nsmallest(5, columns=["a", "b"])
expected = pd.DataFrame(
{
"a": [1, 2, 3, 4, 5],
"b": [7, 6, 5, 4, 3],
"c": [1, 1, 2, 2, 3],
},
index=range(5),
).astype({"a": "float"})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,180 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFramePctChange:
@pytest.mark.parametrize(
"periods, fill_method, limit, exp",
[
(1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]),
(1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]),
(1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]),
(1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]),
(-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]),
(-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]),
(-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]),
(-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]),
],
)
def test_pct_change_with_nas(
self, periods, fill_method, limit, exp, frame_or_series
):
vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
obj = frame_or_series(vals)
msg = (
"The 'fill_method' keyword being not None and the 'limit' keyword in "
f"{type(obj).__name__}.pct_change are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
res = obj.pct_change(periods=periods, fill_method=fill_method, limit=limit)
tm.assert_equal(res, frame_or_series(exp))
def test_pct_change_numeric(self):
# GH#11150
pnl = DataFrame(
[np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)]
).astype(np.float64)
pnl.iat[1, 0] = np.nan
pnl.iat[1, 1] = np.nan
pnl.iat[2, 3] = 60
msg = (
"The 'fill_method' keyword being not None and the 'limit' keyword in "
"DataFrame.pct_change are deprecated"
)
for axis in range(2):
expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pnl.pct_change(axis=axis, fill_method="pad")
tm.assert_frame_equal(result, expected)
def test_pct_change(self, datetime_frame):
msg = (
"The 'fill_method' keyword being not None and the 'limit' keyword in "
"DataFrame.pct_change are deprecated"
)
rs = datetime_frame.pct_change(fill_method=None)
tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1)
rs = datetime_frame.pct_change(2)
filled = datetime_frame.ffill()
tm.assert_frame_equal(rs, filled / filled.shift(2) - 1)
with tm.assert_produces_warning(FutureWarning, match=msg):
rs = datetime_frame.pct_change(fill_method="bfill", limit=1)
filled = datetime_frame.bfill(limit=1)
tm.assert_frame_equal(rs, filled / filled.shift(1) - 1)
rs = datetime_frame.pct_change(freq="5D")
filled = datetime_frame.ffill()
tm.assert_frame_equal(
rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled)
)
def test_pct_change_shift_over_nas(self):
s = Series([1.0, 1.5, np.nan, 2.5, 3.0])
df = DataFrame({"a": s, "b": s})
msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
chg = df.pct_change()
expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2])
edf = DataFrame({"a": expected, "b": expected})
tm.assert_frame_equal(chg, edf)
@pytest.mark.parametrize(
"freq, periods, fill_method, limit",
[
("5B", 5, None, None),
("3B", 3, None, None),
("3B", 3, "bfill", None),
("7B", 7, "pad", 1),
("7B", 7, "bfill", 3),
("14B", 14, None, None),
],
)
def test_pct_change_periods_freq(
self, datetime_frame, freq, periods, fill_method, limit
):
msg = (
"The 'fill_method' keyword being not None and the 'limit' keyword in "
"DataFrame.pct_change are deprecated"
)
# GH#7292
with tm.assert_produces_warning(FutureWarning, match=msg):
rs_freq = datetime_frame.pct_change(
freq=freq, fill_method=fill_method, limit=limit
)
with tm.assert_produces_warning(FutureWarning, match=msg):
rs_periods = datetime_frame.pct_change(
periods, fill_method=fill_method, limit=limit
)
tm.assert_frame_equal(rs_freq, rs_periods)
empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns)
with tm.assert_produces_warning(FutureWarning, match=msg):
rs_freq = empty_ts.pct_change(
freq=freq, fill_method=fill_method, limit=limit
)
with tm.assert_produces_warning(FutureWarning, match=msg):
rs_periods = empty_ts.pct_change(
periods, fill_method=fill_method, limit=limit
)
tm.assert_frame_equal(rs_freq, rs_periods)
@pytest.mark.parametrize("fill_method", ["pad", "ffill", None])
def test_pct_change_with_duplicated_indices(fill_method):
# GH30463
data = DataFrame(
{0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3
)
warn = None if fill_method is None else FutureWarning
msg = (
"The 'fill_method' keyword being not None and the 'limit' keyword in "
"DataFrame.pct_change are deprecated"
)
with tm.assert_produces_warning(warn, match=msg):
result = data.pct_change(fill_method=fill_method)
if fill_method is None:
second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0]
else:
second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0]
expected = DataFrame(
{0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column},
index=["a", "b"] * 3,
)
tm.assert_frame_equal(result, expected)
def test_pct_change_none_beginning_no_warning():
    """Default ``pct_change`` on a frame whose only missing value is the first."""
    # GH#54481
    rows = [
        [1, None],
        [2, 1],
        [3, 2],
        [4, 3],
        [5, 4],
    ]
    result = DataFrame(rows).pct_change()

    expected = DataFrame(
        {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]}
    )
    tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,39 @@
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestPipe:
def test_pipe(self, frame_or_series):
obj = DataFrame({"A": [1, 2, 3]})
expected = DataFrame({"A": [1, 4, 9]})
if frame_or_series is Series:
obj = obj["A"]
expected = expected["A"]
f = lambda x, y: x**y
result = obj.pipe(f, 2)
tm.assert_equal(result, expected)
def test_pipe_tuple(self, frame_or_series):
obj = DataFrame({"A": [1, 2, 3]})
obj = tm.get_obj(obj, frame_or_series)
f = lambda x, y: y
result = obj.pipe((f, "y"), 0)
tm.assert_equal(result, obj)
def test_pipe_tuple_error(self, frame_or_series):
obj = DataFrame({"A": [1, 2, 3]})
obj = tm.get_obj(obj, frame_or_series)
f = lambda x, y: y
msg = "y is both the pipe target and a keyword argument"
with pytest.raises(ValueError, match=msg):
obj.pipe((f, "y"), x=1, y=0)

View File

@ -0,0 +1,72 @@
import numpy as np
from pandas import (
DataFrame,
MultiIndex,
Series,
)
import pandas._testing as tm
class TestDataFramePop:
def test_pop(self, float_frame, warn_copy_on_write):
float_frame.columns.name = "baz"
float_frame.pop("A")
assert "A" not in float_frame
float_frame["foo"] = "bar"
float_frame.pop("foo")
assert "foo" not in float_frame
assert float_frame.columns.name == "baz"
# gh-10912: inplace ops cause caching issue
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"])
b = a.pop("B")
with tm.assert_cow_warning(warn_copy_on_write):
b += 1
# original frame
expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"])
tm.assert_frame_equal(a, expected)
# result
expected = Series([2, 5], index=["X", "Y"], name="B") + 1
tm.assert_series_equal(b, expected)
def test_pop_non_unique_cols(self):
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
df.columns = ["a", "b", "a"]
res = df.pop("a")
assert type(res) == DataFrame
assert len(res) == 2
assert len(df.columns) == 1
assert "b" in df.columns
assert "a" not in df.columns
assert len(df.index) == 2
def test_mixed_depth_pop(self):
arrays = [
["a", "top", "top", "routine1", "routine1", "routine2"],
["", "OD", "OD", "result1", "result2", "result1"],
["", "wx", "wy", "", "", ""],
]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
df1 = df.copy()
df2 = df.copy()
result = df1.pop("a")
expected = df2.pop(("a", "", ""))
tm.assert_series_equal(expected, result, check_names=False)
tm.assert_frame_equal(df1, df2)
assert result.name == "a"
expected = df1["top"]
df1 = df1.drop(["top"], axis=1)
result = df2.pop("top")
tm.assert_frame_equal(expected, result)
tm.assert_frame_equal(df1, df2)

View File

@ -0,0 +1,972 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
Timestamp,
)
import pandas._testing as tm
@pytest.fixture(
    params=[["linear", "single"], ["nearest", "table"]],
    ids="-".join,
)
def interp_method(request):
    """Yield one ``[interpolation, method]`` pair to pass on to ``quantile``."""
    return request.param
class TestDataFrameQuantile:
@pytest.mark.parametrize(
"df,expected",
[
[
DataFrame(
{
0: Series(pd.arrays.SparseArray([1, 2])),
1: Series(pd.arrays.SparseArray([3, 4])),
}
),
Series([1.5, 3.5], name=0.5),
],
[
DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")),
Series([1.0], name=0.5),
],
],
)
def test_quantile_sparse(self, df, expected):
# GH#17198
# GH#24600
result = df.quantile()
expected = expected.astype("Sparse[float]")
tm.assert_series_equal(result, expected)
def test_quantile(
self, datetime_frame, interp_method, using_array_manager, request
):
interpolation, method = interp_method
df = datetime_frame
result = df.quantile(
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
)
expected = Series(
[np.percentile(df[col], 10) for col in df.columns],
index=df.columns,
name=0.1,
)
if interpolation == "linear":
# np.percentile values only comparable to linear interpolation
tm.assert_series_equal(result, expected)
else:
tm.assert_index_equal(result.index, expected.index)
request.applymarker(
pytest.mark.xfail(
using_array_manager, reason="Name set incorrectly for arraymanager"
)
)
assert result.name == expected.name
result = df.quantile(
0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method
)
expected = Series(
[np.percentile(df.loc[date], 90) for date in df.index],
index=df.index,
name=0.9,
)
if interpolation == "linear":
# np.percentile values only comparable to linear interpolation
tm.assert_series_equal(result, expected)
else:
tm.assert_index_equal(result.index, expected.index)
request.applymarker(
pytest.mark.xfail(
using_array_manager, reason="Name set incorrectly for arraymanager"
)
)
assert result.name == expected.name
def test_empty(self, interp_method):
interpolation, method = interp_method
q = DataFrame({"x": [], "y": []}).quantile(
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
)
assert np.isnan(q["x"]) and np.isnan(q["y"])
def test_non_numeric_exclusion(self, interp_method, request, using_array_manager):
interpolation, method = interp_method
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
rs = df.quantile(
0.5, numeric_only=True, interpolation=interpolation, method=method
)
xp = df.median(numeric_only=True).rename(0.5)
if interpolation == "nearest":
xp = (xp + 0.5).astype(np.int64)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_series_equal(rs, xp)
def test_axis(self, interp_method, request, using_array_manager):
# axis
interpolation, method = interp_method
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
if interpolation == "nearest":
expected = expected.astype(np.int64)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_series_equal(result, expected)
result = df.quantile(
[0.5, 0.75], axis=1, interpolation=interpolation, method=method
)
expected = DataFrame(
{1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
)
if interpolation == "nearest":
expected.iloc[0, :] -= 0.5
expected.iloc[1, :] += 0.25
expected = expected.astype(np.int64)
tm.assert_frame_equal(result, expected, check_index_type=True)
def test_axis_numeric_only_true(self, interp_method, request, using_array_manager):
# We may want to break API in the future to change this
# so that we exclude non-numeric along the same axis
# See GH #7312
interpolation, method = interp_method
df = DataFrame([[1, 2, 3], ["a", "b", 4]])
result = df.quantile(
0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method
)
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
if interpolation == "nearest":
expected = expected.astype(np.int64)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_series_equal(result, expected)
def test_quantile_date_range(self, interp_method, request, using_array_manager):
# GH 2460
interpolation, method = interp_method
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
ser = Series(dti)
df = DataFrame(ser)
result = df.quantile(
numeric_only=False, interpolation=interpolation, method=method
)
expected = Series(
["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_series_equal(result, expected)
def test_quantile_axis_mixed(self, interp_method, request, using_array_manager):
    """Mixed dtypes along ``axis=1``: numeric-only works, all columns raises."""
    interpolation, method = interp_method
    frame = DataFrame(
        {
            "A": [1, 2, 3],
            "B": [2.0, 3.0, 4.0],
            "C": pd.date_range("20130101", periods=3),
            "D": ["foo", "bar", "baz"],
        }
    )
    options = {"interpolation": interpolation, "method": method}
    result = frame.quantile(0.5, axis=1, numeric_only=True, **options)

    expected = Series([1.5, 2.5, 3.5], name=0.5)
    if interpolation == "nearest":
        expected -= 0.5
    if method == "table" and using_array_manager:
        request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
    tm.assert_series_equal(result, expected)

    # must raise
    msg = "'<' not supported between instances of 'Timestamp' and 'float'"
    with pytest.raises(TypeError, match=msg):
        frame.quantile(0.5, axis=1, numeric_only=False)
def test_quantile_axis_parameter(self, interp_method, request, using_array_manager):
    """Integer and string spellings of ``axis`` agree; bad ones raise (GH 9543/9544)."""
    interpolation, method = interp_method
    if method == "table" and using_array_manager:
        request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
    use_nearest = interpolation == "nearest"
    frame = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    def median_along(axis):
        # Same call every time; only the axis spelling varies.
        return frame.quantile(
            0.5, axis=axis, interpolation=interpolation, method=method
        )

    # axis=0 against literal values, then against the "index" spelling
    result = median_along(0)
    expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
    if use_nearest:
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    expected = median_along("index")
    if use_nearest:
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    # axis=1 against literal values, then against the "columns" spelling
    result = median_along(1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    if use_nearest:
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    result = median_along("columns")
    tm.assert_series_equal(result, expected)

    # unknown axis identifiers are rejected
    msg = "No axis named -1 for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        frame.quantile(0.1, axis=-1, interpolation=interpolation, method=method)
    msg = "No axis named column for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        frame.quantile(0.1, axis="column")
def test_quantile_interpolation(self):
# see gh-10174
# interpolation method other than default linear
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1, interpolation="nearest")
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
# cross-check interpolation=nearest results in original dtype
exp = np.percentile(
np.array([[1, 2, 3], [2, 3, 4]]),
0.5,
axis=0,
method="nearest",
)
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
tm.assert_series_equal(result, expected)
# float
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1, interpolation="nearest")
expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
exp = np.percentile(
np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
0.5,
axis=0,
method="nearest",
)
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
tm.assert_series_equal(result, expected)
# axis
result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
expected = DataFrame(
{1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
)
tm.assert_frame_equal(result, expected)
# test degenerate case
df = DataFrame({"x": [], "y": []})
q = df.quantile(0.1, axis=0, interpolation="higher")
assert np.isnan(q["x"]) and np.isnan(q["y"])
# multi
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
result = df.quantile([0.25, 0.5], interpolation="midpoint")
# https://github.com/numpy/numpy/issues/7163
expected = DataFrame(
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[0.25, 0.5],
columns=["a", "b", "c"],
)
tm.assert_frame_equal(result, expected)
def test_quantile_interpolation_datetime(self, datetime_frame):
# see gh-10174
# interpolation = linear (default case)
df = datetime_frame
q = df.quantile(0.1, axis=0, numeric_only=True, interpolation="linear")
assert q["A"] == np.percentile(df["A"], 10)
def test_quantile_interpolation_int(self, int_frame):
# see gh-10174
df = int_frame
# interpolation = linear (default case)
q = df.quantile(0.1)
assert q["A"] == np.percentile(df["A"], 10)
# test with and without interpolation keyword
q1 = df.quantile(0.1, axis=0, interpolation="linear")
assert q1["A"] == np.percentile(df["A"], 10)
tm.assert_series_equal(q, q1)
def test_quantile_multi(self, interp_method, request, using_array_manager):
interpolation, method = interp_method
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method)
expected = DataFrame(
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[0.25, 0.5],
columns=["a", "b", "c"],
)
if interpolation == "nearest":
expected = expected.astype(np.int64)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_frame_equal(result, expected)
def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager):
interpolation, method = interp_method
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
result = df.quantile(
[0.25, 0.5], axis=1, interpolation=interpolation, method=method
)
expected = DataFrame(
[[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2]
)
if interpolation == "nearest":
expected = expected.astype(np.int64)
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
tm.assert_frame_equal(result, expected)
def test_quantile_multi_empty(self, interp_method):
interpolation, method = interp_method
result = DataFrame({"x": [], "y": []}).quantile(
[0.1, 0.9], axis=0, interpolation=interpolation, method=method
)
expected = DataFrame(
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
)
tm.assert_frame_equal(result, expected)
def test_quantile_datetime(self, unit):
dti = pd.to_datetime(["2010", "2011"]).as_unit(unit)
df = DataFrame({"a": dti, "b": [0, 5]})
# exclude datetime
result = df.quantile(0.5, numeric_only=True)
expected = Series([2.5], index=["b"], name=0.5)
tm.assert_series_equal(result, expected)
# datetime
result = df.quantile(0.5, numeric_only=False)
expected = Series(
[Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
)
tm.assert_series_equal(result, expected)
# datetime w/ multi
result = df.quantile([0.5], numeric_only=False)
expected = DataFrame(
{"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5},
index=[0.5],
)
tm.assert_frame_equal(result, expected)
# axis = 1
df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit)
result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
expected = Series(
[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
index=[0, 1],
name=0.5,
dtype=f"M8[{unit}]",
)
tm.assert_series_equal(result, expected)
result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
expected = DataFrame(
[[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
index=[0.5],
columns=[0, 1],
dtype=f"M8[{unit}]",
)
tm.assert_frame_equal(result, expected)
# empty when numeric_only=True
result = df[["a", "c"]].quantile(0.5, numeric_only=True)
expected = Series([], index=[], dtype=np.float64, name=0.5)
tm.assert_series_equal(result, expected)
result = df[["a", "c"]].quantile([0.5], numeric_only=True)
expected = DataFrame(index=[0.5], columns=[])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype",
[
"datetime64[ns]",
"datetime64[ns, US/Pacific]",
"timedelta64[ns]",
"Period[D]",
],
)
def test_quantile_dt64_empty(self, dtype, interp_method):
# GH#41544
interpolation, method = interp_method
df = DataFrame(columns=["a", "b"], dtype=dtype)
res = df.quantile(
0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method
)
expected = Series([], index=[], name=0.5, dtype=dtype)
tm.assert_series_equal(res, expected)
# no columns in result, so no dtype preservation
res = df.quantile(
[0.5],
axis=1,
numeric_only=False,
interpolation=interpolation,
method=method,
)
expected = DataFrame(index=[0.5], columns=[])
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]])
def test_quantile_invalid(self, invalid, datetime_frame, interp_method):
    """Quantiles outside ``[0, 1]`` raise ``ValueError``."""
    interpolation, method = interp_method
    msg = "percentiles should all be in the interval \\[0, 1\\]"
    options = {"interpolation": interpolation, "method": method}
    with pytest.raises(ValueError, match=msg):
        datetime_frame.quantile(invalid, **options)
def test_quantile_box(self, interp_method, request, using_array_manager):
interpolation, method = interp_method
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
df = DataFrame(
{
"A": [
Timestamp("2011-01-01"),
Timestamp("2011-01-02"),
Timestamp("2011-01-03"),
],
"B": [
Timestamp("2011-01-01", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
Timestamp("2011-01-03", tz="US/Eastern"),
],
"C": [
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
],
}
)
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = Series(
[
Timestamp("2011-01-02"),
Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
],
name=0.5,
index=["A", "B", "C"],
)
tm.assert_series_equal(res, exp)
res = df.quantile(
[0.5], numeric_only=False, interpolation=interpolation, method=method
)
exp = DataFrame(
[
[
Timestamp("2011-01-02"),
Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
]
],
index=[0.5],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(res, exp)
def test_quantile_box_nat(self):
# DatetimeLikeBlock may be consolidated and contain NaT in different loc
df = DataFrame(
{
"A": [
Timestamp("2011-01-01"),
pd.NaT,
Timestamp("2011-01-02"),
Timestamp("2011-01-03"),
],
"a": [
Timestamp("2011-01-01"),
Timestamp("2011-01-02"),
pd.NaT,
Timestamp("2011-01-03"),
],
"B": [
Timestamp("2011-01-01", tz="US/Eastern"),
pd.NaT,
Timestamp("2011-01-02", tz="US/Eastern"),
Timestamp("2011-01-03", tz="US/Eastern"),
],
"b": [
Timestamp("2011-01-01", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
pd.NaT,
Timestamp("2011-01-03", tz="US/Eastern"),
],
"C": [
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
pd.NaT,
],
"c": [
pd.NaT,
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
],
},
columns=list("AaBbCc"),
)
res = df.quantile(0.5, numeric_only=False)
exp = Series(
[
Timestamp("2011-01-02"),
Timestamp("2011-01-02"),
Timestamp("2011-01-02", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
pd.Timedelta("2 days"),
],
name=0.5,
index=list("AaBbCc"),
)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame(
[
[
Timestamp("2011-01-02"),
Timestamp("2011-01-02"),
Timestamp("2011-01-02", tz="US/Eastern"),
Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
pd.Timedelta("2 days"),
]
],
index=[0.5],
columns=list("AaBbCc"),
)
tm.assert_frame_equal(res, exp)
def test_quantile_nan(self, interp_method, request, using_array_manager):
interpolation, method = interp_method
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
# GH 14357 - float block where some cols have missing values
df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
df.iloc[-1, 1] = np.nan
res = df.quantile(0.5, interpolation=interpolation, method=method)
exp = Series(
[3.0, 2.5 if interpolation == "linear" else 3.0], index=["a", "b"], name=0.5
)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
exp = DataFrame(
{
"a": [3.0, 4.0],
"b": [2.5, 3.25] if interpolation == "linear" else [3.0, 4.0],
},
index=[0.5, 0.75],
)
tm.assert_frame_equal(res, exp)
res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
exp = Series(np.arange(1.0, 6.0), name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile(
[0.5, 0.75], axis=1, interpolation=interpolation, method=method
)
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
if interpolation == "nearest":
exp.iloc[1, -1] = np.nan
tm.assert_frame_equal(res, exp)
# full-nan column
df["b"] = np.nan
res = df.quantile(0.5, interpolation=interpolation, method=method)
exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
def test_quantile_nat(self, interp_method, request, using_array_manager, unit):
interpolation, method = interp_method
if method == "table" and using_array_manager:
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
# full NaT column
df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]")
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]")
tm.assert_series_equal(res, exp)
res = df.quantile(
[0.5], numeric_only=False, interpolation=interpolation, method=method
)
exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]")
tm.assert_frame_equal(res, exp)
# mixed non-null / full null column
df = DataFrame(
{
"a": [
Timestamp("2012-01-01"),
Timestamp("2012-01-02"),
Timestamp("2012-01-03"),
],
"b": [pd.NaT, pd.NaT, pd.NaT],
},
dtype=f"M8[{unit}]",
)
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = Series(
[Timestamp("2012-01-02"), pd.NaT],
index=["a", "b"],
name=0.5,
dtype=f"M8[{unit}]",
)
tm.assert_series_equal(res, exp)
res = df.quantile(
[0.5], numeric_only=False, interpolation=interpolation, method=method
)
exp = DataFrame(
[[Timestamp("2012-01-02"), pd.NaT]],
index=[0.5],
columns=["a", "b"],
dtype=f"M8[{unit}]",
)
tm.assert_frame_equal(res, exp)
def test_quantile_empty_no_rows_floats(self, interp_method):
interpolation, method = interp_method
df = DataFrame(columns=["a", "b"], dtype="float64")
res = df.quantile(0.5, interpolation=interpolation, method=method)
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], interpolation=interpolation, method=method)
exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
tm.assert_frame_equal(res, exp)
res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
exp = Series([], index=[], dtype="float64", name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method)
exp = DataFrame(columns=[], index=[0.5])
tm.assert_frame_equal(res, exp)
def test_quantile_empty_no_rows_ints(self, interp_method):
interpolation, method = interp_method
df = DataFrame(columns=["a", "b"], dtype="int64")
res = df.quantile(0.5, interpolation=interpolation, method=method)
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
def test_quantile_empty_no_rows_dt64(self, interp_method):
interpolation, method = interp_method
# datetimes
df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = Series(
[pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5
)
tm.assert_series_equal(res, exp)
# Mixed dt64/dt64tz
df["a"] = df["a"].dt.tz_localize("US/Central")
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = exp.astype(object)
if interpolation == "nearest":
# GH#18463 TODO: would we prefer NaTs here?
msg = "The 'downcast' keyword in fillna is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
exp = exp.fillna(np.nan, downcast=False)
tm.assert_series_equal(res, exp)
# both dt64tz
df["b"] = df["b"].dt.tz_localize("US/Central")
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = exp.astype(df["b"].dtype)
tm.assert_series_equal(res, exp)
def test_quantile_empty_no_columns(self, interp_method):
# GH#23925 _get_numeric_data may drop all columns
interpolation, method = interp_method
df = DataFrame(pd.date_range("1/1/18", periods=5))
df.columns.name = "captain tightpants"
result = df.quantile(
0.5, numeric_only=True, interpolation=interpolation, method=method
)
expected = Series([], index=[], name=0.5, dtype=np.float64)
expected.index.name = "captain tightpants"
tm.assert_series_equal(result, expected)
result = df.quantile(
[0.5], numeric_only=True, interpolation=interpolation, method=method
)
expected = DataFrame([], index=[0.5], columns=[])
expected.columns.name = "captain tightpants"
tm.assert_frame_equal(result, expected)
def test_quantile_item_cache(
self, using_array_manager, interp_method, using_copy_on_write
):
# previous behavior incorrect retained an invalid _item_cache entry
interpolation, method = interp_method
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
)
df["D"] = df["A"] * 2
ser = df["A"]
if not using_array_manager:
assert len(df._mgr.blocks) == 2
df.quantile(numeric_only=False, interpolation=interpolation, method=method)
if using_copy_on_write:
ser.iloc[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] != 99
else:
ser.values[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] == 99
def test_invalid_method(self):
    """An unknown ``method`` value is rejected with ``ValueError``."""
    frame = DataFrame(range(1))
    with pytest.raises(ValueError, match="Invalid method: foo"):
        frame.quantile(0.5, method="foo")
def test_table_invalid_interpolation(self):
    """``method="table"`` rejects an unknown ``interpolation`` with ``ValueError``."""
    frame = DataFrame(range(1))
    with pytest.raises(ValueError, match="Invalid interpolation: foo"):
        frame.quantile(0.5, method="table", interpolation="foo")
class TestQuantileExtensionDtype:
    """Quantile checks for Series/DataFrame backed by extension-type data."""

    # TODO: tests for axis=1?
    # TODO: empty case?

    @pytest.fixture(
        params=[
            pytest.param(
                pd.IntervalIndex.from_breaks(range(10)),
                marks=pytest.mark.xfail(reason="raises when trying to add Intervals"),
            ),
            pd.period_range("2016-01-01", periods=9, freq="D"),
            pd.date_range("2016-01-01", periods=9, tz="US/Pacific"),
            pd.timedelta_range("1 Day", periods=9),
            pd.array(np.arange(9), dtype="Int64"),
            pd.array(np.arange(9), dtype="Float64"),
        ],
        ids=lambda x: str(x.dtype),
    )
    def index(self, request):
        """Nine-element, already sorted container; its ``name`` is set to "A"."""
        # NB: not actually an Index object
        values = request.param
        values.name = "A"
        return values

    @pytest.fixture
    def obj(self, index, frame_or_series):
        """Series or single-column DataFrame built from ``index``, labelled "A"."""
        # bc index is not always an Index (yet), we need to re-patch .name
        boxed = frame_or_series(index).copy()
        if frame_or_series is Series:
            boxed.name = "A"
        else:
            boxed.columns = ["A"]
        return boxed

    @staticmethod
    def _shuffle_rows(obj, index):
        """Return ``obj`` with its rows permuted by a fixed-seed shuffle."""
        positions = np.arange(len(index), dtype=np.intp)
        np.random.default_rng(2).shuffle(positions)
        return obj.iloc[positions]

    def compute_quantile(self, obj, qs):
        """Call ``quantile``, passing ``numeric_only=False`` for DataFrames."""
        if isinstance(obj, Series):
            return obj.quantile(qs)
        return obj.quantile(qs, numeric_only=False)

    def test_quantile_ea(self, request, obj, index):
        # result should be invariant to shuffling
        obj = self._shuffle_rows(obj, index)

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        # match non-nullable casting behavior
        exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

        # expected here assumes len(index) == 9
        picks = [index[4], index[0], index[-1]]
        expected = Series(picks, dtype=exp_dtype, index=qs, name="A")
        expected = type(obj)(expected)

        tm.assert_equal(result, expected)

    def test_quantile_ea_with_na(self, obj, index):
        # blank out both ends before shuffling
        for position in (0, -1):
            obj.iloc[position] = index._na_value

        # result should be invariant to shuffling
        obj = self._shuffle_rows(obj, index)

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        # expected here assumes len(index) == 9
        picks = [index[4], index[1], index[-2]]
        expected = Series(picks, dtype=index.dtype, index=qs, name="A")
        expected = type(obj)(expected)
        tm.assert_equal(result, expected)

    def test_quantile_ea_all_na(self, request, obj, index):
        obj.iloc[:] = index._na_value
        # Check dtypes were preserved; this was once a problem see GH#39763
        assert np.all(obj.dtypes == index.dtype)

        # result should be invariant to shuffling
        obj = self._shuffle_rows(obj, index)

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        all_missing = index.take(
            [-1, -1, -1], allow_fill=True, fill_value=index._na_value
        )
        expected = Series(all_missing, index=qs, name="A")
        expected = type(obj)(expected)
        tm.assert_equal(result, expected)

    def test_quantile_ea_scalar(self, request, obj, index):
        # scalar qs

        # result should be invariant to shuffling
        obj = self._shuffle_rows(obj, index)

        qs = 0.5
        result = self.compute_quantile(obj, qs)

        exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

        expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
        if not isinstance(obj, Series):
            tm.assert_series_equal(result, expected)
            return
        expected = expected["A"]
        assert result == expected

    @pytest.mark.parametrize(
        "dtype, expected_data, expected_index, axis",
        [
            ["float64", [], [], 1],
            ["int64", [], [], 1],
            ["float64", [np.nan, np.nan], ["a", "b"], 0],
            ["int64", [np.nan, np.nan], ["a", "b"], 0],
        ],
    )
    def test_empty_numeric(self, dtype, expected_data, expected_index, axis):
        # GH 14564
        no_rows = DataFrame(columns=["a", "b"], dtype=dtype)
        result = no_rows.quantile(0.5, axis=axis)
        expected = Series(
            expected_data, name=0.5, index=Index(expected_index), dtype="float64"
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype, expected_data, expected_index, axis, expected_dtype",
        [
            ["datetime64[ns]", [], [], 1, "datetime64[ns]"],
            ["datetime64[ns]", [pd.NaT, pd.NaT], ["a", "b"], 0, "datetime64[ns]"],
        ],
    )
    def test_empty_datelike(
        self, dtype, expected_data, expected_index, axis, expected_dtype
    ):
        # GH 14564
        no_rows = DataFrame(columns=["a", "b"], dtype=dtype)
        result = no_rows.quantile(0.5, axis=axis, numeric_only=False)
        expected = Series(
            expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "expected_data, expected_index, axis",
        [
            [[np.nan, np.nan], range(2), 1],
            [[], [], 0],
        ],
    )
    def test_datelike_numeric_only(self, expected_data, expected_index, axis):
        # GH 14564
        frame = DataFrame(
            {
                "a": pd.to_datetime(["2010", "2011"]),
                "b": [0, 5],
                "c": pd.to_datetime(["2011", "2012"]),
            }
        )
        dates_only = frame[["a", "c"]]
        result = dates_only.quantile(0.5, axis=axis, numeric_only=True)
        expected = Series(
            expected_data, name=0.5, index=Index(expected_index), dtype=np.float64
        )
        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,510 @@
from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
from pandas._libs.algos import (
Infinity,
NegInfinity,
)
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
class TestRank:
# Shared input: ten values with ties (1, 2 and 3 each occur twice) and two
# missing entries.
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
# Two identical columns, both built from ``s``.
df = DataFrame({"A": s, "B": s})
# Expected ranks of ``s``, keyed by tie-breaking method name; positions that
# are NaN in ``s`` are NaN here as well.
results = {
"average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
"min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
"max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
"first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
"dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
}
@pytest.fixture(params=["average", "min", "max", "first", "dense"])
def method(self, request):
    """
    Fixture handing each parametrized rank method name to the test
    """
    rank_method = request.param
    return rank_method
def test_rank(self, float_frame):
    """Ranks along both axes match ``scipy.stats.rankdata``; ints rank like floats."""
    sp_stats = pytest.importorskip("scipy.stats")

    # Punch a different regular pattern of holes into each column
    for step, column in ((2, "A"), (3, "B"), (4, "C"), (5, "D")):
        float_frame.loc[::step, column] = np.nan

    ranks0 = float_frame.rank()
    ranks1 = float_frame.rank(1)
    mask = np.isnan(float_frame.values)

    # Rank with NaN replaced by +inf, then blank the NaN positions again
    fvals = float_frame.fillna(np.inf).values
    for axis, ranks in ((0, ranks0), (1, ranks1)):
        exp = np.apply_along_axis(sp_stats.rankdata, axis, fvals)
        exp[mask] = np.nan
        tm.assert_almost_equal(ranks.values, exp)

    # integers
    int_frame = DataFrame(
        np.random.default_rng(2).integers(0, 5, size=40).reshape((10, 4))
    )
    as_float = int_frame.astype(float)

    tm.assert_frame_equal(int_frame.rank(), as_float.rank())
    tm.assert_frame_equal(int_frame.rank(1), as_float.rank(1))
def test_rank2(self):
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
# f7u12, this does not work without extensive workaround
data = [
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
]
df = DataFrame(data)
# check the rank
expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
result = df.rank(1, numeric_only=False, ascending=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
result = df.rank(1, numeric_only=False, ascending=False)
tm.assert_frame_equal(result, expected)
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
tm.assert_frame_equal(df.rank(), exp)
def test_rank_does_not_mutate(self):
# GH#18521
# Check rank does not mutate DataFrame
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 3)), dtype="float64"
)
expected = df.copy()
df.rank()
result = df
tm.assert_frame_equal(result, expected)
def test_rank_mixed_frame(self, float_string_frame):
    """Column-wise rank runs on a mixed-dtype frame; row-wise rank raises."""
    frame = float_string_frame
    frame["datetime"] = datetime.now()
    frame["timedelta"] = timedelta(days=1, seconds=1)

    # Down the columns each dtype is ranked on its own
    frame.rank(numeric_only=False)

    # Across a row the values are of different types
    with pytest.raises(TypeError, match="not supported between instances of"):
        frame.rank(axis=1)
def test_rank_na_option(self, float_frame):
    """``na_option`` "top"/"bottom" in both sort directions, plus invalid values."""
    sp_stats = pytest.importorskip("scipy.stats")

    for step, column in ((2, "A"), (3, "B"), (4, "C"), (5, "D")):
        float_frame.loc[::step, column] = np.nan

    def expected_ranks(column_wise, row_wise, sign=1):
        # scipy ranks of the (optionally negated) filled arrays, per axis
        down = np.apply_along_axis(sp_stats.rankdata, 0, sign * column_wise)
        across = np.apply_along_axis(sp_stats.rankdata, 1, sign * row_wise)
        return down, across

    def filled_below_minimum():
        # NaN -> (minimum - 1), taken per column for axis 0 and per row for axis 1
        fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        fval1 = float_frame.T
        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
        fval1 = fval1.fillna(np.inf).values
        return fval0, fval1

    # ascending, NaN at the bottom: NaN behaves like +inf
    ranks0 = float_frame.rank(na_option="bottom")
    ranks1 = float_frame.rank(1, na_option="bottom")
    fvals = float_frame.fillna(np.inf).values
    exp0, exp1 = expected_ranks(fvals, fvals)
    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # ascending, NaN at the top: NaN behaves like a value below the minimum
    ranks0 = float_frame.rank(na_option="top")
    ranks1 = float_frame.rank(1, na_option="top")
    fval0, fval1 = filled_below_minimum()
    exp0, exp1 = expected_ranks(fval0, fval1)
    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # descending, NaN at the top: rank the negated +inf-filled values
    ranks0 = float_frame.rank(na_option="top", ascending=False)
    ranks1 = float_frame.rank(1, na_option="top", ascending=False)
    fvals = float_frame.fillna(np.inf).values
    exp0, exp1 = expected_ranks(fvals, fvals, sign=-1)
    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # descending, NaN at the bottom: rank the negated below-minimum fill
    ranks0 = float_frame.rank(na_option="bottom", ascending=False)
    ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)
    fval0, fval1 = filled_below_minimum()
    exp0, exp1 = expected_ranks(fval0, fval1, sign=-1)
    tm.assert_numpy_array_equal(ranks0.values, exp0)
    tm.assert_numpy_array_equal(ranks1.values, exp1)

    # bad values throw error
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"
    with pytest.raises(ValueError, match=msg):
        float_frame.rank(na_option="bad", ascending=False)

    # invalid type
    with pytest.raises(ValueError, match=msg):
        float_frame.rank(na_option=True, ascending=False)
def test_rank_axis(self):
# check if using axes' names gives the same result
df = DataFrame([[2, 1], [4, 3]])
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
@pytest.mark.parametrize("ax", [0, 1])
@pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"])
def test_rank_methods_frame(self, ax, m):
    """Each tie method agrees with ``scipy.stats.rankdata`` at three value scales."""
    sp_stats = pytest.importorskip("scipy.stats")

    base = np.random.default_rng(2).integers(0, 21, (100, 26))
    base = (base - 10.0) / 10.0
    cols = [chr(ord("z") - offset) for offset in range(base.shape[1])]

    # scipy spells pandas' "first" as "ordinal"
    scipy_method = "ordinal" if m == "first" else m

    for vals in (base, base + 1e6, base * 1e-6):
        result = DataFrame(vals, columns=cols).rank(axis=ax, method=m)

        sprank = np.apply_along_axis(sp_stats.rankdata, ax, vals, scipy_method)
        sprank = sprank.astype(np.float64)
        expected = DataFrame(sprank, columns=cols).astype("float64")
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
def test_rank_descending(self, method, dtype):
if "i" in dtype:
df = self.df.dropna().astype(dtype)
else:
df = self.df.astype(dtype)
res = df.rank(ascending=False)
expected = (df.max() - df).rank()
tm.assert_frame_equal(res, expected)
expected = (df.max() - df).rank(method=method)
if dtype != "O":
res2 = df.rank(method=method, ascending=False, numeric_only=True)
tm.assert_frame_equal(res2, expected)
res3 = df.rank(method=method, ascending=False, numeric_only=False)
tm.assert_frame_equal(res3, expected)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("dtype", [None, object])
def test_rank_2d_tie_methods(self, method, axis, dtype):
df = self.df
def _check2d(df, expected, method="average", axis=0):
exp_df = DataFrame({"A": expected, "B": expected})
if axis == 1:
df = df.T
exp_df = exp_df.T
result = df.rank(method=method, axis=axis)
tm.assert_frame_equal(result, exp_df)
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)
@pytest.mark.parametrize(
"method,exp",
[
("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
(
"min",
[
[1.0 / 3, 1.0, 1.0],
[1.0 / 3, 1.0 / 3, 2.0 / 3],
[1.0 / 3, 1.0 / 3, 1.0 / 3],
],
),
(
"max",
[[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
),
(
"average",
[[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
),
(
"first",
[
[1.0 / 3, 1.0, 1.0],
[2.0 / 3, 1.0 / 3, 2.0 / 3],
[3.0 / 3, 2.0 / 3, 1.0 / 3],
],
),
],
)
def test_rank_pct_true(self, method, exp):
# see gh-15630.
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)
expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
@pytest.mark.single_cpu
def test_pct_max_many_rows(self):
# GH 18271
df = DataFrame(
{"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}
)
result = df.rank(pct=True).max()
assert (result == 1).all()
@pytest.mark.parametrize(
"contents,dtype",
[
(
[
-np.inf,
-50,
-1,
-1e-20,
-1e-25,
-1e-50,
0,
1e-40,
1e-20,
1e-10,
2,
40,
np.inf,
],
"float64",
),
(
[
-np.inf,
-50,
-1,
-1e-20,
-1e-25,
-1e-45,
0,
1e-40,
1e-20,
1e-10,
2,
40,
np.inf,
],
"float32",
),
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
(
[
np.iinfo(np.int64).min,
-100,
0,
1,
9999,
100000,
1e10,
np.iinfo(np.int64).max,
],
"int64",
),
([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
(
[datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
"datetime64",
),
],
)
def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
dtype_na_map = {
"float64": np.nan,
"float32": np.nan,
"object": None,
"datetime64": np.datetime64("nat"),
}
# Insert nans at random positions if underlying dtype has missing
# value. Then adjust the expected order by adding nans accordingly
# This is for testing whether rank calculation is affected
# when values are interwined with nan values.
values = np.array(contents, dtype=dtype)
exp_order = np.array(range(len(values)), dtype="float64") + 1.0
if dtype in dtype_na_map:
na_value = dtype_na_map[dtype]
nan_indices = np.random.default_rng(2).choice(range(len(values)), 5)
values = np.insert(values, nan_indices, na_value)
exp_order = np.insert(exp_order, nan_indices, np.nan)
# Shuffle the testing array and expected results in the same way
random_order = np.random.default_rng(2).permutation(len(values))
obj = frame_or_series(values[random_order])
expected = frame_or_series(exp_order[random_order], dtype="float64")
result = obj.rank()
tm.assert_equal(result, expected)
def test_df_series_inf_nan_consistency(self):
# GH#32593
index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
df = DataFrame(
data={
"col1": col1,
"col2": col2,
},
index=index,
dtype="f8",
)
df_result = df.rank()
series_result = df.copy()
series_result["col1"] = df["col1"].rank()
series_result["col2"] = df["col2"].rank()
tm.assert_frame_equal(df_result, series_result)
def test_rank_both_inf(self):
# GH#32593
df = DataFrame({"a": [-np.inf, 0, np.inf]})
expected = DataFrame({"a": [1.0, 2.0, 3.0]})
result = df.rank()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("top", True, [3.0, 1.0, 2.0]),
("top", False, [2.0, 1.0, 3.0]),
("bottom", True, [2.0, 3.0, 1.0]),
("bottom", False, [1.0, 3.0, 2.0]),
],
)
def test_rank_inf_nans_na_option(
self, frame_or_series, method, na_option, ascending, expected
):
obj = frame_or_series([np.inf, np.nan, -np.inf])
result = obj.rank(method=method, na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("bottom", True, [1.0, 2.0, 4.0, 3.0]),
("bottom", False, [1.0, 2.0, 4.0, 3.0]),
("top", True, [2.0, 3.0, 1.0, 4.0]),
("top", False, [2.0, 3.0, 1.0, 4.0]),
],
)
def test_rank_object_first(
self, frame_or_series, na_option, ascending, expected, using_infer_string
):
obj = frame_or_series(["foo", "foo", None, "foo"])
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
if using_infer_string and isinstance(obj, Series):
expected = expected.astype("uint64")
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"data,expected",
[
(
{"a": [1, 2, "a"], "b": [4, 5, 6]},
DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)),
),
({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])),
],
)
def test_rank_mixed_axis_zero(self, data, expected):
df = DataFrame(data, columns=Index(list(data.keys()), dtype=object))
with pytest.raises(TypeError, match="'<' not supported between instances of"):
df.rank()
result = df.rank(numeric_only=True)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype, exp_dtype",
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
)
def test_rank_string_dtype(self, dtype, exp_dtype):
# GH#55362
pytest.importorskip("pyarrow")
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
result = obj.rank(method="first")
expected = Series([1, 2, None, 3], dtype=exp_dtype)
tm.assert_series_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,39 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestDataFrameReindexLike:
def test_reindex_like(self, float_frame):
other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"])
tm.assert_frame_equal(other, float_frame.reindex_like(other))
@pytest.mark.parametrize(
"method,expected_values",
[
("nearest", [0, 1, 1, 2]),
("pad", [np.nan, 0, 1, 1]),
("backfill", [0, 1, 2, 2]),
],
)
def test_reindex_like_methods(self, method, expected_values):
df = DataFrame({"x": list(range(5))})
result = df.reindex_like(df, method=method, tolerance=0)
tm.assert_frame_equal(df, result)
result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0])
tm.assert_frame_equal(df, result)
def test_reindex_like_subclass(self):
# https://github.com/pandas-dev/pandas/issues/31925
class MyDataFrame(DataFrame):
pass
expected = DataFrame()
df = MyDataFrame()
result = df.reindex_like(expected)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,415 @@
from collections import ChainMap
import inspect
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
merge,
)
import pandas._testing as tm
class TestRename:
def test_rename_signature(self):
sig = inspect.signature(DataFrame.rename)
parameters = set(sig.parameters)
assert parameters == {
"self",
"mapper",
"index",
"columns",
"axis",
"inplace",
"copy",
"level",
"errors",
}
def test_rename_mi(self, frame_or_series):
obj = frame_or_series(
[11, 21, 31],
index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]),
)
obj.rename(str.lower)
def test_rename(self, float_frame):
mapping = {"A": "a", "B": "b", "C": "c", "D": "d"}
renamed = float_frame.rename(columns=mapping)
renamed2 = float_frame.rename(columns=str.lower)
tm.assert_frame_equal(renamed, renamed2)
tm.assert_frame_equal(
renamed2.rename(columns=str.upper), float_frame, check_names=False
)
# index
data = {"A": {"foo": 0, "bar": 1}}
df = DataFrame(data)
renamed = df.rename(index={"foo": "bar", "bar": "foo"})
tm.assert_index_equal(renamed.index, Index(["bar", "foo"]))
renamed = df.rename(index=str.upper)
tm.assert_index_equal(renamed.index, Index(["FOO", "BAR"]))
# have to pass something
with pytest.raises(TypeError, match="must pass an index to rename"):
float_frame.rename()
# partial columns
renamed = float_frame.rename(columns={"C": "foo", "D": "bar"})
tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"]))
# other axis
renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"})
tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"]))
# index with name
index = Index(["foo", "bar"], name="name")
renamer = DataFrame(data, index=index)
renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name"))
assert renamed.index.name == renamer.index.name
@pytest.mark.parametrize(
"args,kwargs",
[
((ChainMap({"A": "a"}, {"B": "b"}),), {"axis": "columns"}),
((), {"columns": ChainMap({"A": "a"}, {"B": "b"})}),
],
)
def test_rename_chainmap(self, args, kwargs):
# see gh-23859
colAData = range(1, 11)
colBdata = np.random.default_rng(2).standard_normal(10)
df = DataFrame({"A": colAData, "B": colBdata})
result = df.rename(*args, **kwargs)
expected = DataFrame({"a": colAData, "b": colBdata})
tm.assert_frame_equal(result, expected)
def test_rename_multiindex(self):
tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])
df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
#
# without specifying level -> across all levels
renamed = df.rename(
index={"foo1": "foo3", "bar2": "bar3"},
columns={"fizz1": "fizz3", "buzz2": "buzz3"},
)
new_index = MultiIndex.from_tuples(
[("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"]
)
new_columns = MultiIndex.from_tuples(
[("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
)
tm.assert_index_equal(renamed.index, new_index)
tm.assert_index_equal(renamed.columns, new_columns)
assert renamed.index.names == df.index.names
assert renamed.columns.names == df.columns.names
#
# with specifying a level (GH13766)
# dict
new_columns = MultiIndex.from_tuples(
[("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"]
)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
tm.assert_index_equal(renamed.columns, new_columns)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
tm.assert_index_equal(renamed.columns, new_columns)
new_columns = MultiIndex.from_tuples(
[("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
tm.assert_index_equal(renamed.columns, new_columns)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
tm.assert_index_equal(renamed.columns, new_columns)
# function
func = str.upper
new_columns = MultiIndex.from_tuples(
[("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"]
)
renamed = df.rename(columns=func, level=0)
tm.assert_index_equal(renamed.columns, new_columns)
renamed = df.rename(columns=func, level="fizz")
tm.assert_index_equal(renamed.columns, new_columns)
new_columns = MultiIndex.from_tuples(
[("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"]
)
renamed = df.rename(columns=func, level=1)
tm.assert_index_equal(renamed.columns, new_columns)
renamed = df.rename(columns=func, level="buzz")
tm.assert_index_equal(renamed.columns, new_columns)
# index
new_index = MultiIndex.from_tuples(
[("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"]
)
renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
tm.assert_index_equal(renamed.index, new_index)
def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write):
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)
with tm.assert_cow_warning(warn_copy_on_write):
renamed.loc[:, "foo"] = 1.0
if using_copy_on_write:
assert not (float_frame["C"] == 1.0).all()
else:
assert (float_frame["C"] == 1.0).all()
def test_rename_inplace(self, float_frame):
float_frame.rename(columns={"C": "foo"})
assert "C" in float_frame
assert "foo" not in float_frame
c_values = float_frame["C"]
float_frame = float_frame.copy()
return_value = float_frame.rename(columns={"C": "foo"}, inplace=True)
assert return_value is None
assert "C" not in float_frame
assert "foo" in float_frame
# GH 44153
# Used to be id(float_frame["foo"]) != c_id, but flaky in the CI
assert float_frame["foo"] is not c_values
def test_rename_bug(self):
# GH 5344
# rename set ref_locs, and set_index was not resetting
df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]})
df = df.rename(columns={0: "a"})
df = df.rename(columns={1: "b"})
df = df.set_index(["a", "b"])
df.columns = ["2001-01-01"]
expected = DataFrame(
[[1], [2]],
index=MultiIndex.from_tuples(
[("foo", "bah"), ("bar", "bas")], names=["a", "b"]
),
columns=["2001-01-01"],
)
tm.assert_frame_equal(df, expected)
def test_rename_bug2(self):
# GH 19497
# rename was changing Index to MultiIndex if Index contained tuples
df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"])
df = df.rename({(1, 1): (5, 4)}, axis="index")
expected = DataFrame(
data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"]
)
tm.assert_frame_equal(df, expected)
def test_rename_errors_raises(self):
df = DataFrame(columns=["A", "B", "C", "D"])
with pytest.raises(KeyError, match="'E'] not found in axis"):
df.rename(columns={"A": "a", "E": "e"}, errors="raise")
@pytest.mark.parametrize(
"mapper, errors, expected_columns",
[
({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]),
({"A": "a"}, "raise", ["a", "B", "C", "D"]),
(str.lower, "raise", ["a", "b", "c", "d"]),
],
)
def test_rename_errors(self, mapper, errors, expected_columns):
# GH 13473
# rename now works with errors parameter
df = DataFrame(columns=["A", "B", "C", "D"])
result = df.rename(columns=mapper, errors=errors)
expected = DataFrame(columns=expected_columns)
tm.assert_frame_equal(result, expected)
def test_rename_objects(self, float_string_frame):
renamed = float_string_frame.rename(columns=str.upper)
assert "FOO" in renamed
assert "foo" not in renamed
def test_rename_axis_style(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"])
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"])
result = df.rename(str.lower, axis=1)
tm.assert_frame_equal(result, expected)
result = df.rename(str.lower, axis="columns")
tm.assert_frame_equal(result, expected)
result = df.rename({"A": "a", "B": "b"}, axis=1)
tm.assert_frame_equal(result, expected)
result = df.rename({"A": "a", "B": "b"}, axis="columns")
tm.assert_frame_equal(result, expected)
# Index
expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"])
result = df.rename(str.lower, axis=0)
tm.assert_frame_equal(result, expected)
result = df.rename(str.lower, axis="index")
tm.assert_frame_equal(result, expected)
result = df.rename({"X": "x", "Y": "y"}, axis=0)
tm.assert_frame_equal(result, expected)
result = df.rename({"X": "x", "Y": "y"}, axis="index")
tm.assert_frame_equal(result, expected)
result = df.rename(mapper=str.lower, axis="index")
tm.assert_frame_equal(result, expected)
def test_rename_mapper_multi(self):
df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index(
["A", "B"]
)
result = df.rename(str.upper)
expected = df.rename(index=str.upper)
tm.assert_frame_equal(result, expected)
def test_rename_positional_named(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"])
result = df.rename(index=str.lower, columns=str.upper)
expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"])
tm.assert_frame_equal(result, expected)
def test_rename_axis_style_raises(self):
# see gh-12392
df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"])
# Named target and axis
over_spec_msg = "Cannot specify both 'axis' and any of 'index' or 'columns'"
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(index=str.lower, axis=1)
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(index=str.lower, axis="columns")
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(columns=str.lower, axis="columns")
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(index=str.lower, axis=0)
# Multiple targets and axis
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(str.lower, index=str.lower, axis="columns")
# Too many targets
over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'"
with pytest.raises(TypeError, match=over_spec_msg):
df.rename(str.lower, index=str.lower, columns=str.lower)
# Duplicates
with pytest.raises(TypeError, match="multiple values"):
df.rename(id, mapper=id)
def test_rename_positional_raises(self):
# GH 29136
df = DataFrame(columns=["A", "B"])
msg = r"rename\(\) takes from 1 to 2 positional arguments"
with pytest.raises(TypeError, match=msg):
df.rename(None, str.lower)
def test_rename_no_mappings_raises(self):
# GH 29136
df = DataFrame([[1]])
msg = "must pass an index to rename"
with pytest.raises(TypeError, match=msg):
df.rename()
with pytest.raises(TypeError, match=msg):
df.rename(None, index=None)
with pytest.raises(TypeError, match=msg):
df.rename(None, columns=None)
with pytest.raises(TypeError, match=msg):
df.rename(None, columns=None, index=None)
def test_rename_mapper_and_positional_arguments_raises(self):
# GH 29136
df = DataFrame([[1]])
msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'"
with pytest.raises(TypeError, match=msg):
df.rename({}, index={})
with pytest.raises(TypeError, match=msg):
df.rename({}, columns={})
with pytest.raises(TypeError, match=msg):
df.rename({}, columns={}, index={})
def test_rename_with_duplicate_columns(self):
# GH#4403
df4 = DataFrame(
{"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
index=MultiIndex.from_tuples(
[(600809, 20130331)], names=["STK_ID", "RPT_Date"]
),
)
df5 = DataFrame(
{
"RPT_Date": [20120930, 20121231, 20130331],
"STK_ID": [600809] * 3,
"STK_Name": ["饡驦", "饡驦", "饡驦"],
"TClose": [38.05, 41.66, 30.01],
},
index=MultiIndex.from_tuples(
[(600809, 20120930), (600809, 20121231), (600809, 20130331)],
names=["STK_ID", "RPT_Date"],
),
)
# TODO: can we construct this without merge?
k = merge(df4, df5, how="inner", left_index=True, right_index=True)
result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
expected = DataFrame(
[[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
columns=[
"RT",
"TClose",
"TExg",
"RPT_Date",
"STK_ID",
"STK_Name",
"QT_Close",
],
).set_index(["STK_ID", "RPT_Date"], drop=False)
tm.assert_frame_equal(result, expected)
def test_rename_boolean_index(self):
df = DataFrame(np.arange(15).reshape(3, 5), columns=[False, True, 2, 3, 4])
mapper = {0: "foo", 1: "bar", 2: "bah"}
res = df.rename(index=mapper)
exp = DataFrame(
np.arange(15).reshape(3, 5),
columns=[False, True, 2, 3, 4],
index=["foo", "bar", "bah"],
)
tm.assert_frame_equal(res, exp)

View File

@ -0,0 +1,111 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
class TestDataFrameRenameAxis:
def test_rename_axis_inplace(self, float_frame):
# GH#15704
expected = float_frame.rename_axis("foo")
result = float_frame.copy()
return_value = no_return = result.rename_axis("foo", inplace=True)
assert return_value is None
assert no_return is None
tm.assert_frame_equal(result, expected)
expected = float_frame.rename_axis("bar", axis=1)
result = float_frame.copy()
return_value = no_return = result.rename_axis("bar", axis=1, inplace=True)
assert return_value is None
assert no_return is None
tm.assert_frame_equal(result, expected)
def test_rename_axis_raises(self):
# GH#17833
df = DataFrame({"A": [1, 2], "B": [1, 2]})
with pytest.raises(ValueError, match="Use `.rename`"):
df.rename_axis(id, axis=0)
with pytest.raises(ValueError, match="Use `.rename`"):
df.rename_axis({0: 10, 1: 20}, axis=0)
with pytest.raises(ValueError, match="Use `.rename`"):
df.rename_axis(id, axis=1)
with pytest.raises(ValueError, match="Use `.rename`"):
df["A"].rename_axis(id)
def test_rename_axis_mapper(self):
# GH#19978
mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"])
df = DataFrame(
{"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi
)
# Test for rename of the Index object of columns
result = df.rename_axis("cols", axis=1)
tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols"))
# Test for rename of the Index object of columns using dict
result = result.rename_axis(columns={"cols": "new"}, axis=1)
tm.assert_index_equal(result.columns, Index(["x", "y"], name="new"))
# Test for renaming index using dict
result = df.rename_axis(index={"ll": "foo"})
assert result.index.names == ["foo", "nn"]
# Test for renaming index using a function
result = df.rename_axis(index=str.upper, axis=0)
assert result.index.names == ["LL", "NN"]
# Test for renaming index providing complete list
result = df.rename_axis(index=["foo", "goo"])
assert result.index.names == ["foo", "goo"]
# Test for changing index and columns at same time
sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"])
result = sdf.rename_axis(index="foo", columns="meh")
assert result.index.name == "foo"
assert result.columns.name == "meh"
# Test different error cases
with pytest.raises(TypeError, match="Must pass"):
df.rename_axis(index="wrong")
with pytest.raises(ValueError, match="Length of names"):
df.rename_axis(index=["wrong"])
with pytest.raises(TypeError, match="bogus"):
df.rename_axis(bogus=None)
@pytest.mark.parametrize(
"kwargs, rename_index, rename_columns",
[
({"mapper": None, "axis": 0}, True, False),
({"mapper": None, "axis": 1}, False, True),
({"index": None}, True, False),
({"columns": None}, False, True),
({"index": None, "columns": None}, True, True),
({}, False, False),
],
)
def test_rename_axis_none(self, kwargs, rename_index, rename_columns):
# GH 25034
index = Index(list("abc"), name="foo")
columns = Index(["col1", "col2"], name="bar")
data = np.arange(6).reshape(3, 2)
df = DataFrame(data, index, columns)
result = df.rename_axis(**kwargs)
expected_index = index.rename(None) if rename_index else index
expected_columns = columns.rename(None) if rename_columns else columns
expected = DataFrame(data, expected_index, expected_columns)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,74 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
MultiIndex,
)
import pandas._testing as tm
class TestReorderLevels:
def test_reorder_levels(self, frame_or_series):
index = MultiIndex(
levels=[["bar"], ["one", "two", "three"], [0, 1]],
codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
names=["L0", "L1", "L2"],
)
df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index)
obj = tm.get_obj(df, frame_or_series)
# no change, position
result = obj.reorder_levels([0, 1, 2])
tm.assert_equal(obj, result)
# no change, labels
result = obj.reorder_levels(["L0", "L1", "L2"])
tm.assert_equal(obj, result)
# rotate, position
result = obj.reorder_levels([1, 2, 0])
e_idx = MultiIndex(
levels=[["one", "two", "three"], [0, 1], ["bar"]],
codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]],
names=["L1", "L2", "L0"],
)
expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
expected = tm.get_obj(expected, frame_or_series)
tm.assert_equal(result, expected)
result = obj.reorder_levels([0, 0, 0])
e_idx = MultiIndex(
levels=[["bar"], ["bar"], ["bar"]],
codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
names=["L0", "L0", "L0"],
)
expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
expected = tm.get_obj(expected, frame_or_series)
tm.assert_equal(result, expected)
result = obj.reorder_levels(["L0", "L0", "L0"])
tm.assert_equal(result, expected)
def test_reorder_levels_swaplevel_equivalence(
self, multiindex_year_month_day_dataframe_random_data
):
ymd = multiindex_year_month_day_dataframe_random_data
result = ymd.reorder_levels(["month", "day", "year"])
expected = ymd.swaplevel(0, 1).swaplevel(1, 2)
tm.assert_frame_equal(result, expected)
result = ymd["A"].reorder_levels(["month", "day", "year"])
expected = ymd["A"].swaplevel(0, 1).swaplevel(1, 2)
tm.assert_series_equal(result, expected)
result = ymd.T.reorder_levels(["month", "day", "year"], axis=1)
expected = ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
tm.assert_frame_equal(result, expected)
with pytest.raises(TypeError, match="hierarchical axis"):
ymd.reorder_levels([1, 2], axis=1)
with pytest.raises(IndexError, match="Too many levels"):
ymd.index.reorder_levels([1, 2, 3])

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,782 @@
from datetime import datetime
from itertools import product
import numpy as np
import pytest
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Interval,
IntervalIndex,
MultiIndex,
RangeIndex,
Series,
Timestamp,
cut,
date_range,
)
import pandas._testing as tm
@pytest.fixture()
def multiindex_df():
    """Two-by-two frame with two-level columns, one label being an empty string."""
    column_tuples = [["A", ""], ["B", "b"]]
    columns = MultiIndex.from_tuples(column_tuples)
    return DataFrame([[0, 2], [1, 3]], columns=columns)
class TestResetIndex:
def test_reset_index_empty_rangeindex(self):
# GH#45230
df = DataFrame(
columns=["brand"], dtype=np.int64, index=RangeIndex(0, 0, 1, name="foo")
)
df2 = df.set_index([df.index, "brand"])
result = df2.reset_index([1], drop=True)
tm.assert_frame_equal(result, df[[]], check_index_type=True)
def test_set_reset(self):
idx = Index([2**63, 2**63 + 5, 2**63 + 10], name="foo")
# set/reset
df = DataFrame({"A": [0, 1, 2]}, index=idx)
result = df.reset_index()
assert result["foo"].dtype == np.dtype("uint64")
df = result.set_index("foo")
tm.assert_index_equal(df.index, idx)
def test_set_index_reset_index_dt64tz(self):
idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
# set/reset
df = DataFrame({"A": [0, 1, 2]}, index=idx)
result = df.reset_index()
assert result["foo"].dtype == "datetime64[ns, US/Eastern]"
df = result.set_index("foo")
tm.assert_index_equal(df.index, idx)
def test_reset_index_tz(self, tz_aware_fixture):
# GH 3950
# reset_index with single level
tz = tz_aware_fixture
idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx")
df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx)
expected = DataFrame(
{
"idx": idx,
"a": range(5),
"b": ["A", "B", "C", "D", "E"],
},
columns=["idx", "a", "b"],
)
result = df.reset_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_frame_reset_index_tzaware_index(self, tz):
dr = date_range("2012-06-02", periods=10, tz=tz)
df = DataFrame(np.random.default_rng(2).standard_normal(len(dr)), dr)
roundtripped = df.reset_index().set_index("index")
xp = df.index.tz
rs = roundtripped.index.tz
assert xp == rs
def test_reset_index_with_intervals(self):
idx = IntervalIndex.from_breaks(np.arange(11), name="x")
original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]]
result = original.set_index("x")
expected = DataFrame({"y": np.arange(10)}, index=idx)
tm.assert_frame_equal(result, expected)
result2 = result.reset_index()
tm.assert_frame_equal(result2, original)
def test_reset_index(self, float_frame):
stacked = float_frame.stack(future_stack=True)[::2]
stacked = DataFrame({"foo": stacked, "bar": stacked})
names = ["first", "second"]
stacked.index.names = names
deleveled = stacked.reset_index()
for i, (lev, level_codes) in enumerate(
zip(stacked.index.levels, stacked.index.codes)
):
values = lev.take(level_codes)
name = names[i]
tm.assert_index_equal(values, Index(deleveled[name]))
stacked.index.names = [None, None]
deleveled2 = stacked.reset_index()
tm.assert_series_equal(
deleveled["first"], deleveled2["level_0"], check_names=False
)
tm.assert_series_equal(
deleveled["second"], deleveled2["level_1"], check_names=False
)
# default name assigned
rdf = float_frame.reset_index()
exp = Series(float_frame.index.values, name="index")
tm.assert_series_equal(rdf["index"], exp)
# default name assigned, corner case
df = float_frame.copy()
df["index"] = "foo"
rdf = df.reset_index()
exp = Series(float_frame.index.values, name="level_0")
tm.assert_series_equal(rdf["level_0"], exp)
# but this is ok
float_frame.index.name = "index"
deleveled = float_frame.reset_index()
tm.assert_series_equal(deleveled["index"], Series(float_frame.index))
tm.assert_index_equal(deleveled.index, Index(range(len(deleveled))), exact=True)
# preserve column names
float_frame.columns.name = "columns"
reset = float_frame.reset_index()
assert reset.columns.name == "columns"
# only remove certain columns
df = float_frame.reset_index().set_index(["index", "A", "B"])
rs = df.reset_index(["A", "B"])
tm.assert_frame_equal(rs, float_frame)
rs = df.reset_index(["index", "A", "B"])
tm.assert_frame_equal(rs, float_frame.reset_index())
rs = df.reset_index(["index", "A", "B"])
tm.assert_frame_equal(rs, float_frame.reset_index())
rs = df.reset_index("A")
xp = float_frame.reset_index().set_index(["index", "B"])
tm.assert_frame_equal(rs, xp)
# test resetting in place
df = float_frame.copy()
reset = float_frame.reset_index()
return_value = df.reset_index(inplace=True)
assert return_value is None
tm.assert_frame_equal(df, reset)
df = float_frame.reset_index().set_index(["index", "A", "B"])
rs = df.reset_index("A", drop=True)
xp = float_frame.copy()
del xp["A"]
xp = xp.set_index(["B"], append=True)
tm.assert_frame_equal(rs, xp)
def test_reset_index_name(self):
df = DataFrame(
[[1, 2, 3, 4], [5, 6, 7, 8]],
columns=["A", "B", "C", "D"],
index=Index(range(2), name="x"),
)
assert df.reset_index().index.name is None
assert df.reset_index(drop=True).index.name is None
return_value = df.reset_index(inplace=True)
assert return_value is None
assert df.index.name is None
@pytest.mark.parametrize("levels", [["A", "B"], [0, 1]])
def test_reset_index_level(self, levels):
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
# With MultiIndex
result = df.set_index(["A", "B"]).reset_index(level=levels[0])
tm.assert_frame_equal(result, df.set_index("B"))
result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
tm.assert_frame_equal(result, df.set_index("B"))
result = df.set_index(["A", "B"]).reset_index(level=levels)
tm.assert_frame_equal(result, df)
result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
tm.assert_frame_equal(result, df[["C", "D"]])
# With single-level Index (GH 16263)
result = df.set_index("A").reset_index(level=levels[0])
tm.assert_frame_equal(result, df)
result = df.set_index("A").reset_index(level=levels[:1])
tm.assert_frame_equal(result, df)
result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
tm.assert_frame_equal(result, df[["B", "C", "D"]])
@pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]])
def test_reset_index_level_missing(self, idx_lev):
# Missing levels - for both MultiIndex and single-level Index:
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
df.set_index(idx_lev).reset_index(level=["A", "E"])
with pytest.raises(IndexError, match="Too many levels"):
df.set_index(idx_lev).reset_index(level=[0, 1, 2])
def test_reset_index_right_dtype(self):
time = np.arange(0.0, 10, np.sqrt(2) / 2)
s1 = Series(
(9.81 * time**2) / 2, index=Index(time, name="time"), name="speed"
)
df = DataFrame(s1)
reset = s1.reset_index()
assert reset["time"].dtype == np.float64
reset = df.reset_index()
assert reset["time"].dtype == np.float64
def test_reset_index_multiindex_col(self):
vals = np.random.default_rng(2).standard_normal((3, 3)).astype(object)
idx = ["x", "y", "z"]
full = np.hstack(([[x] for x in idx], vals))
df = DataFrame(
vals,
Index(idx, name="a"),
columns=[["b", "b", "c"], ["mean", "median", "mean"]],
)
rs = df.reset_index()
xp = DataFrame(
full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]]
)
tm.assert_frame_equal(rs, xp)
rs = df.reset_index(col_fill=None)
xp = DataFrame(
full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]]
)
tm.assert_frame_equal(rs, xp)
rs = df.reset_index(col_level=1, col_fill="blah")
xp = DataFrame(
full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]]
)
tm.assert_frame_equal(rs, xp)
df = DataFrame(
vals,
MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]),
columns=[["b", "b", "c"], ["mean", "median", "mean"]],
)
rs = df.reset_index("a")
xp = DataFrame(
full,
Index([0, 1, 2], name="d"),
columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]],
)
tm.assert_frame_equal(rs, xp)
rs = df.reset_index("a", col_fill=None)
xp = DataFrame(
full,
Index(range(3), name="d"),
columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]],
)
tm.assert_frame_equal(rs, xp)
rs = df.reset_index("a", col_fill="blah", col_level=1)
xp = DataFrame(
full,
Index(range(3), name="d"),
columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]],
)
tm.assert_frame_equal(rs, xp)
def test_reset_index_multiindex_nan(self):
# GH#6322, testing reset_index on MultiIndexes
# when we have a nan or all nan
df = DataFrame(
{
"A": ["a", "b", "c"],
"B": [0, 1, np.nan],
"C": np.random.default_rng(2).random(3),
}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
df = DataFrame(
{
"A": [np.nan, "b", "c"],
"B": [0, 1, 2],
"C": np.random.default_rng(2).random(3),
}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]})
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
df = DataFrame(
{
"A": ["a", "b", "c"],
"B": [np.nan, np.nan, np.nan],
"C": np.random.default_rng(2).random(3),
}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
@pytest.mark.parametrize(
"name",
[
None,
"foo",
2,
3.0,
pd.Timedelta(6),
Timestamp("2012-12-30", tz="UTC"),
"2012-12-31",
],
)
def test_reset_index_with_datetimeindex_cols(self, name):
# GH#5818
df = DataFrame(
[[1, 2], [3, 4]],
columns=date_range("1/1/2013", "1/2/2013"),
index=["A", "B"],
)
df.index.name = name
result = df.reset_index()
item = name if name is not None else "index"
columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)])
if isinstance(item, str) and item == "2012-12-31":
columns = columns.astype("datetime64[ns]")
else:
assert columns.dtype == object
expected = DataFrame(
[["A", 1, 2], ["B", 3, 4]],
columns=columns,
)
tm.assert_frame_equal(result, expected)
def test_reset_index_range(self):
# GH#12071
df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2))
result = df.reset_index()
assert isinstance(result.index, RangeIndex)
expected = DataFrame(
[[0, 0, 0], [1, 1, 1]],
columns=["index", "A", "B"],
index=RangeIndex(stop=2),
)
tm.assert_frame_equal(result, expected)
def test_reset_index_multiindex_columns(self, multiindex_df):
result = multiindex_df[["B"]].rename_axis("A").reset_index()
tm.assert_frame_equal(result, multiindex_df)
# GH#16120: already existing column
msg = r"cannot insert \('A', ''\), already exists"
with pytest.raises(ValueError, match=msg):
multiindex_df.rename_axis("A").reset_index()
# GH#16164: multiindex (tuple) full key
result = multiindex_df.set_index([("A", "")]).reset_index()
tm.assert_frame_equal(result, multiindex_df)
# with additional (unnamed) index level
idx_col = DataFrame(
[[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
)
expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1)
result = multiindex_df.set_index([("B", "b")], append=True).reset_index()
tm.assert_frame_equal(result, expected)
# with index name which is a too long tuple...
msg = "Item must have length equal to number of levels."
with pytest.raises(ValueError, match=msg):
multiindex_df.rename_axis([("C", "c", "i")]).reset_index()
# or too short...
levels = [["A", "a", ""], ["B", "b", "i"]]
df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
idx_col = DataFrame(
[[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")])
)
expected = pd.concat([idx_col, df2], axis=1)
result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii")
tm.assert_frame_equal(result, expected)
# ... which is incompatible with col_fill=None
with pytest.raises(
ValueError,
match=(
"col_fill=None is incompatible with "
r"incomplete column name \('C', 'c'\)"
),
):
df2.rename_axis([("C", "c")]).reset_index(col_fill=None)
# with col_level != 0
result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("flag", [False, True])
@pytest.mark.parametrize("allow_duplicates", [False, True])
def test_reset_index_duplicate_columns_allow(
self, multiindex_df, flag, allow_duplicates
):
# GH#44755 reset_index with duplicate column labels
df = multiindex_df.rename_axis("A")
df = df.set_flags(allows_duplicate_labels=flag)
if flag and allow_duplicates:
result = df.reset_index(allow_duplicates=allow_duplicates)
levels = [["A", ""], ["A", ""], ["B", "b"]]
expected = DataFrame(
[[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels)
)
tm.assert_frame_equal(result, expected)
else:
if not flag and allow_duplicates:
msg = (
"Cannot specify 'allow_duplicates=True' when "
"'self.flags.allows_duplicate_labels' is False"
)
else:
msg = r"cannot insert \('A', ''\), already exists"
with pytest.raises(ValueError, match=msg):
df.reset_index(allow_duplicates=allow_duplicates)
@pytest.mark.parametrize("flag", [False, True])
def test_reset_index_duplicate_columns_default(self, multiindex_df, flag):
df = multiindex_df.rename_axis("A")
df = df.set_flags(allows_duplicate_labels=flag)
msg = r"cannot insert \('A', ''\), already exists"
with pytest.raises(ValueError, match=msg):
df.reset_index()
@pytest.mark.parametrize("allow_duplicates", ["bad value"])
def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates):
with pytest.raises(ValueError, match="expected type bool"):
multiindex_df.reset_index(allow_duplicates=allow_duplicates)
def test_reset_index_datetime(self, tz_naive_fixture):
# GH#3950
tz = tz_naive_fixture
idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
idx2 = Index(range(5), name="idx2", dtype="int64")
idx = MultiIndex.from_arrays([idx1, idx2])
df = DataFrame(
{"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
index=idx,
)
expected = DataFrame(
{
"idx1": idx1,
"idx2": np.arange(5, dtype="int64"),
"a": np.arange(5, dtype="int64"),
"b": ["A", "B", "C", "D", "E"],
},
columns=["idx1", "idx2", "a", "b"],
)
tm.assert_frame_equal(df.reset_index(), expected)
def test_reset_index_datetime2(self, tz_naive_fixture):
tz = tz_naive_fixture
idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
idx2 = Index(range(5), name="idx2", dtype="int64")
idx3 = date_range(
"1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3"
)
idx = MultiIndex.from_arrays([idx1, idx2, idx3])
df = DataFrame(
{"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
index=idx,
)
expected = DataFrame(
{
"idx1": idx1,
"idx2": np.arange(5, dtype="int64"),
"idx3": idx3,
"a": np.arange(5, dtype="int64"),
"b": ["A", "B", "C", "D", "E"],
},
columns=["idx1", "idx2", "idx3", "a", "b"],
)
result = df.reset_index()
tm.assert_frame_equal(result, expected)
def test_reset_index_datetime3(self, tz_naive_fixture):
# GH#7793
tz = tz_naive_fixture
dti = date_range("20130101", periods=3, tz=tz)
idx = MultiIndex.from_product([["a", "b"], dti])
df = DataFrame(
np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx
)
expected = DataFrame(
{
"level_0": "a a a b b b".split(),
"level_1": dti.append(dti),
"a": np.arange(6, dtype="int64"),
},
columns=["level_0", "level_1", "a"],
)
result = df.reset_index()
tm.assert_frame_equal(result, expected)
def test_reset_index_period(self):
# GH#7746
idx = MultiIndex.from_product(
[pd.period_range("20130101", periods=3, freq="M"), list("abc")],
names=["month", "feature"],
)
df = DataFrame(
np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"]
)
expected = DataFrame(
{
"month": (
[pd.Period("2013-01", freq="M")] * 3
+ [pd.Period("2013-02", freq="M")] * 3
+ [pd.Period("2013-03", freq="M")] * 3
),
"feature": ["a", "b", "c"] * 3,
"a": np.arange(9, dtype="int64"),
},
columns=["month", "feature", "a"],
)
result = df.reset_index()
tm.assert_frame_equal(result, expected)
def test_reset_index_delevel_infer_dtype(self):
tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1]))
index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
df = DataFrame(
np.random.default_rng(2).standard_normal((8, 3)),
columns=["A", "B", "C"],
index=index,
)
deleveled = df.reset_index()
assert is_integer_dtype(deleveled["prm1"])
assert is_float_dtype(deleveled["prm2"])
def test_reset_index_with_drop(
self, multiindex_year_month_day_dataframe_random_data
):
ymd = multiindex_year_month_day_dataframe_random_data
deleveled = ymd.reset_index(drop=True)
assert len(deleveled.columns) == len(ymd.columns)
assert deleveled.index.name == ymd.index.name
@pytest.mark.parametrize(
"ix_data, exp_data",
[
(
[(pd.NaT, 1), (pd.NaT, 2)],
{"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]},
),
(
[(pd.NaT, 1), (Timestamp("2020-01-01"), 2)],
{"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]},
),
(
[(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)],
{"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]},
),
],
)
def test_reset_index_nat_multiindex(self, ix_data, exp_data):
# GH#36541: that reset_index() does not raise ValueError
ix = MultiIndex.from_tuples(ix_data, names=["a", "b"])
result = DataFrame({"x": [11, 12]}, index=ix)
result = result.reset_index()
expected = DataFrame(exp_data)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]])
)
def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
# GH#24206
index = MultiIndex(
[CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes
)
data = {"col": range(len(index))}
df = DataFrame(data=data, index=index)
expected = DataFrame(
{
"level_0": Categorical.from_codes(codes[0], categories=["A", "B"]),
"level_1": Categorical.from_codes(codes[1], categories=["a", "b"]),
"col": range(4),
}
)
res = df.reset_index()
tm.assert_frame_equal(res, expected)
# roundtrip
res = expected.set_index(["level_0", "level_1"]).reset_index()
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize(
"array, dtype",
[
(["a", "b"], object),
(
pd.period_range("12-1-2000", periods=2, freq="Q-DEC"),
pd.PeriodDtype(freq="Q-DEC"),
),
],
)
def test_reset_index_dtypes_on_empty_frame_with_multiindex(
array, dtype, using_infer_string
):
# GH 19602 - Preserve dtype on empty DataFrame with MultiIndex
idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array])
result = DataFrame(index=idx)[:0].reset_index().dtypes
if using_infer_string and dtype == object:
dtype = "string"
expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype})
tm.assert_series_equal(result, expected)
def test_reset_index_empty_frame_with_datetime64_multiindex():
    """Reset an empty frame whose MultiIndex has a datetime64 level."""
    # https://github.com/pandas-dev/pandas/issues/35606
    stamps = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]")
    empty_mi = MultiIndex.from_product([stamps, [3, 4]], names=["a", "b"])[:0]
    frame = DataFrame(index=empty_mi, columns=["c", "d"])

    observed = frame.reset_index()

    expected = DataFrame(
        columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1)
    )
    for column, dtype in (("a", "datetime64[ns]"), ("b", "int64")):
        expected[column] = expected[column].astype(dtype)
    tm.assert_frame_equal(observed, expected)
def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(
using_infer_string,
):
# https://github.com/pandas-dev/pandas/issues/35657
dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]")
df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti})
df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum()
result = df.reset_index()
expected = DataFrame(
columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1)
)
expected["c3"] = expected["c3"].astype("datetime64[ns]")
expected["c1"] = expected["c1"].astype("float64")
if using_infer_string:
expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]")
tm.assert_frame_equal(result, expected)
def test_reset_index_multiindex_nat():
    """Reset one level while the remaining datetime level contains ``NaT``."""
    # GH 11479
    ids = range(3)
    stamps = date_range("2015-07-01", freq="D", periods=3)
    frame = DataFrame({"id": ids, "tstamp": stamps, "a": list("abc")})
    frame.loc[2, "tstamp"] = pd.NaT

    observed = frame.set_index(["id", "tstamp"]).reset_index("id")

    expected_index = pd.DatetimeIndex(
        ["2015-07-01", "2015-07-02", "NaT"], dtype="M8[ns]", name="tstamp"
    )
    expected = DataFrame(
        {"id": range(3), "a": list("abc")},
        index=expected_index,
    )
    tm.assert_frame_equal(observed, expected)
def test_reset_index_interval_columns_object_cast():
    """Inserting the index name next to Interval columns gives a mixed Index."""
    # GH 19136
    interval_columns = cut([1, 2], [0, 1, 2])
    frame = DataFrame(
        np.eye(2), index=Index([1, 2], name="Year"), columns=interval_columns
    )

    observed = frame.reset_index()

    expected_columns = Index(["Year", Interval(0, 1), Interval(1, 2)])
    expected = DataFrame(
        [[1, 1.0, 0.0], [2, 0.0, 1.0]],
        columns=expected_columns,
    )
    tm.assert_frame_equal(observed, expected)
def test_reset_index_rename(float_frame):
# GH 6878
result = float_frame.reset_index(names="new_name")
expected = Series(float_frame.index.values, name="new_name")
tm.assert_series_equal(result["new_name"], expected)
result = float_frame.reset_index(names=123)
expected = Series(float_frame.index.values, name=123)
tm.assert_series_equal(result[123], expected)
def test_reset_index_rename_multiindex(float_frame):
# GH 6878
stacked_df = float_frame.stack(future_stack=True)[::2]
stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df})
names = ["first", "second"]
stacked_df.index.names = names
result = stacked_df.reset_index()
expected = stacked_df.reset_index(names=["new_first", "new_second"])
tm.assert_series_equal(result["first"], expected["new_first"], check_names=False)
tm.assert_series_equal(result["second"], expected["new_second"], check_names=False)
def test_errorreset_index_rename(float_frame):
# GH 6878
stacked_df = float_frame.stack(future_stack=True)[::2]
stacked_df = DataFrame({"first": stacked_df, "second": stacked_df})
with pytest.raises(
ValueError, match="Index names must be str or 1-dimensional list"
):
stacked_df.reset_index(names={"first": "new_first", "second": "new_second"})
with pytest.raises(IndexError, match="list index out of range"):
stacked_df.reset_index(names=["new_first"])
def test_reset_index_false_index_name():
    """``reset_index`` runs on an index named ``False`` and leaves the source intact."""
    ser = Series(data=range(5, 10), index=range(5))
    ser.index.name = False
    # The return value is deliberately unused; the original object is compared.
    ser.reset_index()
    tm.assert_series_equal(
        ser, Series(range(5, 10), RangeIndex(range(5), name=False))
    )

    # GH 38147
    frame = DataFrame(data=range(5, 10), index=range(5))
    frame.index.name = False
    frame.reset_index()
    tm.assert_frame_equal(
        frame, DataFrame(range(5, 10), RangeIndex(range(5), name=False))
    )

View File

@ -0,0 +1,225 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameRound:
    """Tests for ``DataFrame.round``."""

    def test_round(self):
        """Exercise ``round`` with int, dict and Series ``decimals``, valid and invalid."""
        # GH#2665

        # Test that rounding an empty DataFrame does nothing
        df = DataFrame()
        tm.assert_frame_equal(df, df.round())

        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(df.round(), expected_rounded)

        # Round with an integer
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # This should also work with np.round (since np.round dispatches to
        # df.round)
        tm.assert_frame_equal(np.round(df, decimals), expected_rounded)

        # Round with a list
        round_list = [1, 2]
        msg = "decimals must be an integer, a dict-like or a Series"
        with pytest.raises(TypeError, match=msg):
            df.round(round_list)

        # Round with a dictionary
        expected_rounded = DataFrame(
            {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]}
        )
        round_dict = {"col1": 1, "col2": 2}
        tm.assert_frame_equal(df.round(round_dict), expected_rounded)

        # Incomplete dict
        expected_partially_rounded = DataFrame(
            {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]}
        )
        partial_round_dict = {"col2": 1}
        tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded)

        # Dict with unknown elements
        wrong_round_dict = {"col3": 2, "col2": 1}
        tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded)

        # float input to `decimals`
        non_int_round_dict = {"col1": 1, "col2": 0.5}
        msg = "Values in decimals must be integers"
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_dict)

        # String input
        non_int_round_dict = {"col1": 1, "col2": "foo"}
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_Series)

        # List input
        non_int_round_dict = {"col1": 1, "col2": [1, 2]}
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_Series)

        # Non integer Series inputs: a float-valued Series must be rejected too.
        # (Previously this section only repeated the list-valued check above.)
        non_int_round_Series = Series({"col1": 1, "col2": 0.5})
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_Series)

        # Negative numbers
        negative_round_dict = {"col1": -1, "col2": -2}
        big_df = df * 100
        expected_neg_rounded = DataFrame(
            {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]}
        )
        tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded)

        # nan in Series round
        nan_round_Series = Series({"col1": np.nan, "col2": 1})
        with pytest.raises(TypeError, match=msg):
            df.round(nan_round_Series)

        # Make sure this doesn't break existing Series.round
        # (expected_rounded still holds the dict-rounded frame, col1 -> 1 decimal)
        tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"])

        # named columns
        # GH#11986
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        df.columns.name = "cols"
        expected_rounded.columns.name = "cols"
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # interaction of named columns & series
        tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"])
        tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"])

    def test_round_numpy(self):
        """``np.round`` works on a frame but rejects the ``out`` argument."""
        # GH#12600
        df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
        out = np.round(df, decimals=0)
        expected = DataFrame([[2.0, 1.0], [0.0, 7.0]])
        tm.assert_frame_equal(out, expected)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.round(df, decimals=0, out=df)

    def test_round_numpy_with_nan(self):
        """Rounding a frame containing NaN emits no warning and keeps the NaN."""
        # See GH#14197
        df = Series([1.53, np.nan, 0.06]).to_frame()
        with tm.assert_produces_warning(None):
            result = df.round()
        expected = Series([2.0, np.nan, 0.0]).to_frame()
        tm.assert_frame_equal(result, expected)

    def test_round_mixed_type(self):
        """Only the float column changes; string and datetime columns are untouched."""
        # GH#11885
        df = DataFrame(
            {
                "col1": [1.1, 2.2, 3.3, 4.4],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        round_0 = DataFrame(
            {
                "col1": [1.0, 2.0, 3.0, 4.0],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        tm.assert_frame_equal(df.round(), round_0)
        tm.assert_frame_equal(df.round(1), df)
        tm.assert_frame_equal(df.round({"col1": 1}), df)
        tm.assert_frame_equal(df.round({"col1": 0}), round_0)
        tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0)
        tm.assert_frame_equal(df.round({"col3": 1}), df)

    def test_round_with_duplicate_columns(self):
        """Duplicate column labels round fine; a non-unique ``decimals`` index raises."""
        # GH#11611
        df = DataFrame(
            np.random.default_rng(2).random([3, 3]),
            columns=["A", "B", "C"],
            index=["first", "second", "third"],
        )

        dfs = pd.concat((df, df), axis=1)
        rounded = dfs.round()
        tm.assert_index_equal(rounded.index, dfs.index)

        decimals = Series([1, 0, 2], index=["A", "B", "A"])
        msg = "Index of decimals must be unique"
        with pytest.raises(ValueError, match=msg):
            df.round(decimals)

    def test_round_builtin(self):
        """The builtin ``round`` rounds a frame to zero decimals."""
        # GH#11763
        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(round(df), expected_rounded)

    def test_round_nonunique_categorical(self):
        """Rounding keeps the shape when the index is a non-unique CategoricalIndex."""
        # See GH#21809
        idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3)
        df = DataFrame(np.random.default_rng(2).random((6, 3)), columns=list("abc"))

        expected = df.round(3)
        expected.index = idx

        df_categorical = df.copy().set_index(idx)
        assert df_categorical.shape == (6, 3)
        result = df_categorical.round(3)
        assert result.shape == (6, 3)

        tm.assert_frame_equal(result, expected)

    def test_round_interval_category_columns(self):
        """Rounding works when the columns are a categorical of intervals."""
        # GH#30063
        columns = pd.CategoricalIndex(pd.interval_range(0, 2))
        df = DataFrame([[0.66, 1.1], [0.3, 0.25]], columns=columns)

        result = df.round()
        expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns)
        tm.assert_frame_equal(result, expected)

    def test_round_empty_not_input(self):
        """Rounding an empty frame returns an equal but distinct object."""
        # GH#51032
        df = DataFrame()
        result = df.round()
        tm.assert_frame_equal(df, result)
        assert df is not result

View File

@ -0,0 +1,372 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
import pandas.core.common as com
class TestSample:
@pytest.fixture
def obj(self, frame_or_series):
if frame_or_series is Series:
arr = np.random.default_rng(2).standard_normal(10)
else:
arr = np.random.default_rng(2).standard_normal((10, 10))
return frame_or_series(arr, dtype=None)
@pytest.mark.parametrize("test", list(range(10)))
def test_sample(self, test, obj):
# Fixes issue: 2419
# Check behavior of random_state argument
# Check for stability when receives seed or random state -- run 10
# times.
seed = np.random.default_rng(2).integers(0, 100)
tm.assert_equal(
obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
)
tm.assert_equal(
obj.sample(frac=0.7, random_state=seed),
obj.sample(frac=0.7, random_state=seed),
)
tm.assert_equal(
obj.sample(n=4, random_state=np.random.default_rng(test)),
obj.sample(n=4, random_state=np.random.default_rng(test)),
)
tm.assert_equal(
obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
)
tm.assert_equal(
obj.sample(
frac=2,
replace=True,
random_state=np.random.default_rng(test),
),
obj.sample(
frac=2,
replace=True,
random_state=np.random.default_rng(test),
),
)
os1, os2 = [], []
for _ in range(2):
os1.append(obj.sample(n=4, random_state=test))
os2.append(obj.sample(frac=0.7, random_state=test))
tm.assert_equal(*os1)
tm.assert_equal(*os2)
def test_sample_lengths(self, obj):
# Check lengths are right
assert len(obj.sample(n=4) == 4)
assert len(obj.sample(frac=0.34) == 3)
assert len(obj.sample(frac=0.36) == 4)
def test_sample_invalid_random_state(self, obj):
# Check for error when random_state argument invalid.
msg = (
"random_state must be an integer, array-like, a BitGenerator, Generator, "
"a numpy RandomState, or None"
)
with pytest.raises(ValueError, match=msg):
obj.sample(random_state="a_string")
def test_sample_wont_accept_n_and_frac(self, obj):
# Giving both frac and N throws error
msg = "Please enter a value for `frac` OR `n`, not both"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, frac=0.3)
def test_sample_requires_positive_n_frac(self, obj):
with pytest.raises(
ValueError,
match="A negative number of rows requested. Please provide `n` >= 0",
):
obj.sample(n=-3)
with pytest.raises(
ValueError,
match="A negative number of rows requested. Please provide `frac` >= 0",
):
obj.sample(frac=-0.3)
def test_sample_requires_integer_n(self, obj):
# Make sure float values of `n` give error
with pytest.raises(ValueError, match="Only integers accepted as `n` values"):
obj.sample(n=3.2)
def test_sample_invalid_weight_lengths(self, obj):
# Weight length must be right
msg = "Weights and axis to be sampled must be of same length"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=[0, 1])
with pytest.raises(ValueError, match=msg):
bad_weights = [0.5] * 11
obj.sample(n=3, weights=bad_weights)
with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"):
bad_weight_series = Series([0, 0, 0.2])
obj.sample(n=4, weights=bad_weight_series)
def test_sample_negative_weights(self, obj):
# Check won't accept negative weights
bad_weights = [-0.1] * 10
msg = "weight vector many not include negative values"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=bad_weights)
def test_sample_inf_weights(self, obj):
# Check inf and -inf throw errors:
weights_with_inf = [0.1] * 10
weights_with_inf[0] = np.inf
msg = "weight vector may not include `inf` values"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=weights_with_inf)
weights_with_ninf = [0.1] * 10
weights_with_ninf[0] = -np.inf
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=weights_with_ninf)
def test_sample_zero_weights(self, obj):
# All zeros raises errors
zero_weights = [0] * 10
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
obj.sample(n=3, weights=zero_weights)
def test_sample_missing_weights(self, obj):
# All missing weights
nan_weights = [np.nan] * 10
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
obj.sample(n=3, weights=nan_weights)
def test_sample_none_weights(self, obj):
# Check None are also replaced by zeros.
weights_with_None = [None] * 10
weights_with_None[5] = 0.5
tm.assert_equal(
obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
)
@pytest.mark.parametrize(
"func_str,arg",
[
("np.array", [2, 3, 1, 0]),
("np.random.MT19937", 3),
("np.random.PCG64", 11),
],
)
def test_sample_random_state(self, func_str, arg, frame_or_series):
# GH#32503
obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
obj = tm.get_obj(obj, frame_or_series)
result = obj.sample(n=3, random_state=eval(func_str)(arg))
expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
tm.assert_equal(result, expected)
def test_sample_generator(self, frame_or_series):
# GH#38100
obj = frame_or_series(np.arange(100))
rng = np.random.default_rng(2)
# Consecutive calls should advance the seed
result1 = obj.sample(n=50, random_state=rng)
result2 = obj.sample(n=50, random_state=rng)
assert not (result1.index.values == result2.index.values).all()
# Matching generator initialization must give same result
# Consecutive calls should advance the seed
result1 = obj.sample(n=50, random_state=np.random.default_rng(11))
result2 = obj.sample(n=50, random_state=np.random.default_rng(11))
tm.assert_equal(result1, result2)
def test_sample_upsampling_without_replacement(self, frame_or_series):
# GH#27451
obj = DataFrame({"A": list("abc")})
obj = tm.get_obj(obj, frame_or_series)
msg = (
"Replace has to be set to `True` when "
"upsampling the population `frac` > 1."
)
with pytest.raises(ValueError, match=msg):
obj.sample(frac=2, replace=False)
class TestSampleDataFrame:
    """``sample`` tests that are meaningful for DataFrame only."""

    # Tests which are relevant only for DataFrame, so these are
    # as fully parametrized as they can get.

    def test_sample(self):
        """Cover column-name weights, the ``axis`` argument and their errors."""
        # GH#2419
        # additional specific object based tests

        # A few dataframe test with degenerate weights.
        one_hot = [0] * 10
        one_hot[5] = 1

        frame = DataFrame(
            {
                "col1": range(10, 20),
                "col2": range(20, 30),
                "colString": ["a"] * 10,
                "easyweights": one_hot,
            }
        )
        picked = frame.sample(n=1, weights="easyweights")
        tm.assert_frame_equal(picked, frame.iloc[5:6])

        # Ensure proper error if string given as weight for Series or
        # DataFrame with axis = 1.
        series = Series(range(10))
        err = "Strings cannot be passed as weights when sampling from a Series."
        with pytest.raises(ValueError, match=err):
            series.sample(n=3, weights="weight_column")

        err = (
            "Strings can only be passed to weights when sampling from rows on a "
            "DataFrame"
        )
        with pytest.raises(ValueError, match=err):
            frame.sample(n=1, weights="weight_column", axis=1)

        # Check weighting key error
        with pytest.raises(
            KeyError, match="'String passed to weights not a valid column'"
        ):
            frame.sample(n=3, weights="not_a_real_column_name")

        # Check that re-normalizes weights that don't sum to one.
        under_one = [0] * 10
        under_one[0] = 0.5
        tm.assert_frame_equal(frame.sample(n=1, weights=under_one), frame.iloc[:1])

        ###
        # Test axis argument
        ###

        frame = DataFrame({"col1": range(10), "col2": ["a"] * 10})
        pick_second_column = [0, 1]
        tm.assert_frame_equal(
            frame.sample(n=1, axis=1, weights=pick_second_column), frame[["col2"]]
        )

        # Different axis arg types
        tm.assert_frame_equal(
            frame.sample(n=1, axis="columns", weights=pick_second_column),
            frame[["col2"]],
        )

        row_weights = [0] * 10
        row_weights[5] = 0.5
        tm.assert_frame_equal(
            frame.sample(n=1, axis="rows", weights=row_weights), frame.iloc[5:6]
        )
        tm.assert_frame_equal(
            frame.sample(n=1, axis="index", weights=row_weights), frame.iloc[5:6]
        )

        # Check out of range axis values
        err = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=err):
            frame.sample(n=1, axis=2)

        err = "No axis named not_a_name for object type DataFrame"
        with pytest.raises(ValueError, match=err):
            frame.sample(n=1, axis="not_a_name")

        series = Series(range(10))
        with pytest.raises(ValueError, match="No axis named 1 for object type Series"):
            series.sample(n=1, axis=1)

        # Test weight length compared to correct axis
        err = "Weights and axis to be sampled must be of same length"
        with pytest.raises(ValueError, match=err):
            frame.sample(n=1, axis=1, weights=[0.5] * 10)

    def test_sample_axis1(self):
        """Weights are applied to columns when ``axis=1``; default axis is 0."""
        # Check weights with axis = 1
        last_column_only = [0] * 3
        last_column_only[2] = 1

        frame = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        picked = frame.sample(n=1, axis=1, weights=last_column_only)
        tm.assert_frame_equal(picked, frame[["colString"]])

        # Test default axes
        tm.assert_frame_equal(
            frame.sample(n=3, random_state=42),
            frame.sample(n=3, axis=0, random_state=42),
        )

    def test_sample_aligns_weights_with_frame(self):
        """Series weights are matched to the frame by index label."""
        # Test that function aligns weights with frame
        frame = DataFrame(
            {"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]
        )
        aligned = Series([1, 0, 0], index=[3, 5, 9])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=aligned))

        # Weights have index values to be dropped because not in
        # sampled DataFrame
        extra_label = Series([0.001, 0, 10000], index=[3, 5, 10])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=extra_label))

        # Weights have empty values to be filed with zeros
        missing_label = Series([0.01, 0], index=[3, 5])
        tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=missing_label))

        # No overlap in weight and sampled DataFrame indices
        disjoint = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
            frame.sample(1, weights=disjoint)

    def test_sample_is_copy(self):
        """Assigning a column on a sample produces no warning."""
        # GH#27357, GH#30784: ensure the result of sample is an actual copy and
        # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"]
        )
        sampled = frame.sample(3)

        with tm.assert_produces_warning(None):
            sampled["d"] = 1

    def test_sample_does_not_modify_weights(self):
        """Weights passed to ``sample`` are left unchanged."""
        # GH-42843
        weights = np.array([np.nan, 1, np.nan])
        weights_before = weights.copy()
        series = Series([1, 2, 3])

        # Test numpy array weights won't be modified in place
        series.sample(weights=weights)
        tm.assert_numpy_array_equal(weights, weights_before)

        # Test DataFrame column won't be modified in place
        frame = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
        column_before = frame["weights"].copy()

        frame.sample(frac=1.0, replace=True, weights="weights")
        tm.assert_series_equal(frame["weights"], column_before)

    def test_sample_ignore_index(self):
        """``ignore_index=True`` relabels the sample 0..n-1."""
        # GH 38581
        frame = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        sampled = frame.sample(3, ignore_index=True)
        tm.assert_index_equal(sampled.index, Index(range(3)), exact=True)

View File

@ -0,0 +1,469 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import ExtensionDtype
import pandas as pd
from pandas import (
DataFrame,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import ExtensionArray
class DummyDtype(ExtensionDtype):
    """Bare-bones extension dtype whose ``_is_numeric`` flag is set by the caller."""

    # Scalar type reported for elements of this dtype.
    type = int

    def __init__(self, numeric) -> None:
        # Stored as given and returned unchanged by ``_is_numeric``.
        self._numeric = numeric

    @property
    def _is_numeric(self):
        return self._numeric

    @property
    def name(self):
        return "Dummy"
class DummyArray(ExtensionArray):
    """Bare-bones extension array that wraps ``data`` and reports the given dtype."""

    def __init__(self, data, dtype) -> None:
        self._dtype = dtype
        self.data = data

    @property
    def dtype(self):
        return self._dtype

    def __len__(self) -> int:
        return len(self.data)

    def __array__(self, dtype=None, copy=None):
        # ``dtype`` and ``copy`` are accepted but not used.
        return self.data

    def __getitem__(self, item):
        # Indexing is not implemented; every lookup yields None.
        return None

    def copy(self):
        # No new object is created; the same instance is handed back.
        return self
class TestSelectDtypes:
def test_select_dtypes_include_using_list_like(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.Categorical(list("abc")),
"g": pd.date_range("20130101", periods=3),
"h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"i": pd.date_range("20130101", periods=3, tz="CET"),
"j": pd.period_range("2013-01", periods=3, freq="M"),
"k": pd.timedelta_range("1 day", periods=3),
}
)
ri = df.select_dtypes(include=[np.number])
ei = df[["b", "c", "d", "k"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number], exclude=["timedelta"])
ei = df[["b", "c", "d"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"])
ei = df[["b", "c", "d", "f"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=["datetime"])
ei = df[["g"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=["datetime64"])
ei = df[["g"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=["datetimetz"])
ei = df[["h", "i"]]
tm.assert_frame_equal(ri, ei)
with pytest.raises(NotImplementedError, match=r"^$"):
df.select_dtypes(include=["period"])
def test_select_dtypes_exclude_using_list_like(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
}
)
re = df.select_dtypes(exclude=[np.number])
ee = df[["a", "e"]]
tm.assert_frame_equal(re, ee)
def test_select_dtypes_exclude_include_using_list_like(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6, dtype="u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
exclude = (np.datetime64,)
include = np.bool_, "integer"
r = df.select_dtypes(include=include, exclude=exclude)
e = df[["b", "c", "e"]]
tm.assert_frame_equal(r, e)
exclude = ("datetime",)
include = "bool", "int64", "int32"
r = df.select_dtypes(include=include, exclude=exclude)
e = df[["b", "e"]]
tm.assert_frame_equal(r, e)
@pytest.mark.parametrize(
"include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)]
)
def test_select_dtypes_exclude_include_int(self, include):
# Fix select_dtypes(include='int') for Windows, FYI #36596
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6, dtype="int32"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
exclude = (np.datetime64,)
result = df.select_dtypes(include=include, exclude=exclude)
expected = df[["b", "c", "e"]]
tm.assert_frame_equal(result, expected)
def test_select_dtypes_include_using_scalars(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.Categorical(list("abc")),
"g": pd.date_range("20130101", periods=3),
"h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"i": pd.date_range("20130101", periods=3, tz="CET"),
"j": pd.period_range("2013-01", periods=3, freq="M"),
"k": pd.timedelta_range("1 day", periods=3),
}
)
ri = df.select_dtypes(include=np.number)
ei = df[["b", "c", "d", "k"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include="datetime")
ei = df[["g"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include="datetime64")
ei = df[["g"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include="category")
ei = df[["f"]]
tm.assert_frame_equal(ri, ei)
with pytest.raises(NotImplementedError, match=r"^$"):
df.select_dtypes(include="period")
def test_select_dtypes_exclude_using_scalars(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.Categorical(list("abc")),
"g": pd.date_range("20130101", periods=3),
"h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"i": pd.date_range("20130101", periods=3, tz="CET"),
"j": pd.period_range("2013-01", periods=3, freq="M"),
"k": pd.timedelta_range("1 day", periods=3),
}
)
ri = df.select_dtypes(exclude=np.number)
ei = df[["a", "e", "f", "g", "h", "i", "j"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(exclude="category")
ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]]
tm.assert_frame_equal(ri, ei)
with pytest.raises(NotImplementedError, match=r"^$"):
df.select_dtypes(exclude="period")
def test_select_dtypes_include_exclude_using_scalars(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.Categorical(list("abc")),
"g": pd.date_range("20130101", periods=3),
"h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"i": pd.date_range("20130101", periods=3, tz="CET"),
"j": pd.period_range("2013-01", periods=3, freq="M"),
"k": pd.timedelta_range("1 day", periods=3),
}
)
ri = df.select_dtypes(include=np.number, exclude="floating")
ei = df[["b", "c", "k"]]
tm.assert_frame_equal(ri, ei)
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.Categorical(list("abc")),
"g": pd.date_range("20130101", periods=3),
"h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"i": pd.date_range("20130101", periods=3, tz="CET"),
"j": pd.period_range("2013-01", periods=3, freq="M"),
"k": pd.timedelta_range("1 day", periods=3),
}
)
ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"])
ei = df[["b", "c"]]
tm.assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, "category"], exclude="floating")
ei = df[["b", "c", "f", "k"]]
tm.assert_frame_equal(ri, ei)
def test_select_dtypes_duplicate_columns(self):
# GH20839
df = DataFrame(
{
"a": ["a", "b", "c"],
"b": [1, 2, 3],
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
df.columns = ["a", "a", "b", "b", "b", "c"]
expected = DataFrame(
{"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
)
result = df.select_dtypes(include=[np.number], exclude=["floating"])
tm.assert_frame_equal(result, expected)
def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
df["g"] = df.f.diff()
assert not hasattr(np, "u8")
r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
if using_infer_string:
e = df[["b"]]
else:
e = df[["a", "b"]]
tm.assert_frame_equal(r, e)
r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
if using_infer_string:
e = df[["b", "g"]]
else:
e = df[["a", "b", "g"]]
tm.assert_frame_equal(r, e)
def test_select_dtypes_empty(self):
df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
msg = "at least one of include or exclude must be nonempty"
with pytest.raises(ValueError, match=msg):
df.select_dtypes()
def test_select_dtypes_bad_datetime64(self):
df = DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
with pytest.raises(ValueError, match=".+ is too specific"):
df.select_dtypes(include=["datetime64[D]"])
with pytest.raises(ValueError, match=".+ is too specific"):
df.select_dtypes(exclude=["datetime64[as]"])
def test_select_dtypes_datetime_with_tz(self):
df2 = DataFrame(
{
"A": Timestamp("20130102", tz="US/Eastern"),
"B": Timestamp("20130603", tz="CET"),
},
index=range(5),
)
df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
result = df3.select_dtypes(include=["datetime64[ns]"])
expected = df3.reindex(columns=[])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
@pytest.mark.parametrize("arg", ["include", "exclude"])
def test_select_dtypes_str_raises(self, dtype, arg):
df = DataFrame(
{
"a": list("abc"),
"g": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
msg = "string dtypes are not allowed"
kwargs = {arg: [dtype]}
with pytest.raises(TypeError, match=msg):
df.select_dtypes(**kwargs)
def test_select_dtypes_bad_arg_raises(self):
df = DataFrame(
{
"a": list("abc"),
"g": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values,
}
)
msg = "data type.*not understood"
with pytest.raises(TypeError, match=msg):
df.select_dtypes(["blargy, blarg, blarg"])
def test_select_dtypes_typecodes(self):
# GH 11990
df = DataFrame(np.random.default_rng(2).random((5, 3)))
FLOAT_TYPES = list(np.typecodes["AllFloat"])
tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), df)
@pytest.mark.parametrize(
    "arr,expected",
    (
        (np.array([1, 2], dtype=np.int32), True),
        (pd.array([1, 2], dtype="Int32"), True),
        (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True),
        (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False),
    ),
)
def test_select_dtypes_numeric(self, arr, expected):
    """GH 35340: ``np.number`` selection compared against the ``expected`` flag."""
    frame = DataFrame(arr)
    # An unchanged shape means the single column was kept.
    kept = frame.select_dtypes(np.number).shape == frame.shape
    assert kept == expected
def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype):
arr = pd.array(["a", "b"], dtype=nullable_string_dtype)
df = DataFrame(arr)
is_selected = df.select_dtypes(np.number).shape == df.shape
assert not is_selected
@pytest.mark.parametrize(
"expected, float_dtypes",
[
[
DataFrame(
{"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}
).astype(dtype={"A": float, "B": np.float64, "C": np.float32}),
float,
],
[
DataFrame(
{"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}
).astype(dtype={"A": float, "B": np.float64, "C": np.float32}),
"float",
],
[DataFrame({"C": range(10, 7, -1)}, dtype=np.float32), np.float32],
[
DataFrame({"A": range(3), "B": range(5, 8)}).astype(
dtype={"A": float, "B": np.float64}
),
np.float64,
],
],
)
def test_select_dtypes_float_dtype(self, expected, float_dtypes):
# GH#42452
dtype_dict = {"A": float, "B": np.float64, "C": np.float32}
df = DataFrame(
{"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)},
)
df = df.astype(dtype_dict)
result = df.select_dtypes(include=float_dtypes)
tm.assert_frame_equal(result, expected)
def test_np_bool_ea_boolean_include_number(self):
# GH 46870
df = DataFrame(
{
"a": [1, 2, 3],
"b": pd.Series([True, False, True], dtype="boolean"),
"c": np.array([True, False, True]),
"d": pd.Categorical([True, False, True]),
"e": pd.arrays.SparseArray([True, False, True]),
}
)
result = df.select_dtypes(include="number")
expected = DataFrame({"a": [1, 2, 3]})
tm.assert_frame_equal(result, expected)
def test_select_dtypes_no_view(self):
# https://github.com/pandas-dev/pandas/issues/48090
# result of this method is not a view on the original dataframe
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_orig = df.copy()
result = df.select_dtypes(include=["number"])
result.iloc[0, 0] = 0
tm.assert_frame_equal(df, df_orig)

View File

@ -0,0 +1,143 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class SharedSetAxisTests:
    """``set_axis`` checks shared by the Series and DataFrame test classes.

    Subclasses provide the object under test by overriding the ``obj`` fixture.
    """

    @pytest.fixture
    def obj(self):
        raise NotImplementedError("Implemented by subclasses")

    def test_set_axis(self, obj):
        """GH14636: setting the index works for both Series and DataFrame."""
        labels = list("abcd")[: len(obj)]
        expected = obj.copy()
        expected.index = labels
        tm.assert_equal(expected, obj.set_axis(labels, axis=0))

    def test_set_axis_copy(self, obj, using_copy_on_write):
        """GH#47932: memory sharing for copy=True, copy=False and the default."""
        labels = list("abcd")[: len(obj)]
        orig = obj.iloc[:]
        expected = obj.copy()
        expected.index = labels

        def column_sharing(left, right):
            # One flag per column: does ``left``'s column share memory
            # with the column at the same position in ``right``?
            return [
                tm.shares_memory(left.iloc[:, pos], right.iloc[:, pos])
                for pos in range(right.shape[1])
            ]

        # --- copy=True ---
        result = obj.set_axis(labels, axis=0, copy=True)
        tm.assert_equal(expected, result)
        assert result is not obj
        # check we DID make a copy
        if not using_copy_on_write:
            if obj.ndim == 1:
                assert not tm.shares_memory(result, obj)
            else:
                assert not any(column_sharing(result, obj))

        # --- copy=False ---
        result = obj.set_axis(labels, axis=0, copy=False)
        tm.assert_equal(expected, result)
        assert result is not obj
        # check we did NOT make a copy
        if obj.ndim == 1:
            assert tm.shares_memory(result, obj)
        else:
            assert all(column_sharing(result, obj))

        # --- copy omitted (defaults to True) ---
        result = obj.set_axis(labels, axis=0)
        tm.assert_equal(expected, result)
        assert result is not obj
        if obj.ndim == 1:
            shared = tm.shares_memory(result, obj)
        else:
            shared = any(column_sharing(result, obj))
        if using_copy_on_write:
            # check we DID NOT make a copy
            assert shared
        else:
            # check we DID make a copy
            assert not shared

        # --- axis omitted, copy=False ---
        res = obj.set_axis(labels, copy=False)
        tm.assert_equal(expected, res)
        # check we did NOT make a copy
        if res.ndim == 1:
            assert tm.shares_memory(res, orig)
        else:
            assert all(column_sharing(res, orig))

    def test_set_axis_unnamed_kwarg_warns(self, obj):
        """Omitting ``axis`` sets the index; no warning is asserted in this body."""
        labels = list("abcd")[: len(obj)]
        expected = obj.copy()
        expected.index = labels
        tm.assert_equal(obj.set_axis(labels), expected)

    @pytest.mark.parametrize("axis", [3, "foo"])
    def test_set_axis_invalid_axis_name(self, axis, obj):
        """An unknown ``axis`` value raises ValueError."""
        with pytest.raises(ValueError, match="No axis named"):
            obj.set_axis(list("abc"), axis=axis)

    def test_set_axis_setattr_index_not_collection(self, obj):
        """Assigning ``None`` to ``.index`` raises TypeError (wrong type)."""
        msg = (
            r"Index\(\.\.\.\) must be called with a collection of some "
            r"kind, None was passed"
        )
        with pytest.raises(TypeError, match=msg):
            obj.index = None

    def test_set_axis_setattr_index_wrong_length(self, obj):
        """Assigning labels of the wrong length raises ValueError."""
        size = len(obj)
        msg = (
            f"Length mismatch: Expected axis has {size} elements, "
            f"new values have {size - 1} elements"
        )
        with pytest.raises(ValueError, match=msg):
            obj.index = np.arange(size - 1)

        # Two-dimensional objects get the same check on their columns.
        if obj.ndim == 2:
            with pytest.raises(ValueError, match="Length mismatch"):
                obj.columns = obj.columns[::2]
class TestDataFrameSetAxis(SharedSetAxisTests):
    """Runs the shared ``set_axis`` tests against a DataFrame."""

    @pytest.fixture
    def obj(self):
        # Three float columns on an integer index.
        columns = {
            "A": [1.1, 2.2, 3.3],
            "B": [5.0, 6.1, 7.2],
            "C": [4.4, 5.5, 6.6],
        }
        return DataFrame(columns, index=[2010, 2011, 2012])
class TestSeriesSetAxis(SharedSetAxisTests):
    """Runs the shared ``set_axis`` tests against a Series."""

    @pytest.fixture
    def obj(self):
        # Four int64 values on an integer index.
        return Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64")

View File

@ -0,0 +1,734 @@
"""
See also: test_reindex.py:TestReindexSetIndex
"""
from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
@pytest.fixture
def frame_of_index_cols():
    """
    Fixture for DataFrame of columns that can be used for indexing

    Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
    'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.

         A      B  C         D         E  (tuple, as, label)
    0  foo    one  a  0.608477 -0.012500           -1.664297
    1  foo    two  b -0.633460  0.249614           -0.364411
    2  foo  three  c  0.615256  2.154968           -0.834666
    3  bar    one  d  0.234246  1.085675            0.718445
    4  bar    two  e  0.533841 -0.005702           -3.533912
    """
    # NOTE: each float column is drawn from a fresh generator with the same
    # seed, so 'D', 'E' and the tuple-labelled column hold the same numbers;
    # the table above is only illustrative.
    columns = {
        "A": ["foo", "foo", "foo", "bar", "bar"],
        "B": ["one", "two", "three", "one", "two"],
        "C": ["a", "b", "c", "d", "e"],
    }
    for label in ("D", "E", ("tuple", "as", "label")):
        columns[label] = np.random.default_rng(2).standard_normal(5)
    return DataFrame(columns)
class TestSetIndex:
def test_set_index_multiindex(self):
# segfault in GH#3308
d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
df = DataFrame(d)
tuples = [(0, 1), (0, 2), (1, 2)]
df["tuples"] = tuples
index = MultiIndex.from_tuples(df["tuples"])
# it works!
df.set_index(index)
def test_set_index_empty_column(self):
# GH#1971
df = DataFrame(
[
{"a": 1, "p": 0},
{"a": 2, "m": 10},
{"a": 3, "m": 11, "p": 20},
{"a": 4, "m": 12, "p": 21},
],
columns=["a", "m", "p", "x"],
)
result = df.set_index(["a", "x"])
expected = df[["m", "p"]]
expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])
tm.assert_frame_equal(result, expected)
def test_set_index_empty_dataframe(self):
# GH#38419
df1 = DataFrame(
{"a": Series(dtype="datetime64[ns]"), "b": Series(dtype="int64"), "c": []}
)
df2 = df1.set_index(["a", "b"])
result = df2.index.to_frame().dtypes
expected = df1[["a", "b"]].dtypes
tm.assert_series_equal(result, expected)
def test_set_index_multiindexcolumns(self):
columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])
df = DataFrame(
np.random.default_rng(2).standard_normal((3, 3)), columns=columns
)
result = df.set_index(df.columns[0])
expected = df.iloc[:, 1:]
expected.index = df.iloc[:, 0].values
expected.index.names = [df.columns[0]]
tm.assert_frame_equal(result, expected)
def test_set_index_timezone(self):
# GH#12358
# tz-aware Series should retain the tz
idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")
df = DataFrame({"A": idx})
assert df.set_index(idx).index[0].hour == 11
assert DatetimeIndex(Series(df.A))[0].hour == 11
assert df.set_index(df.A).index[0].hour == 11
def test_set_index_cast_datetimeindex(self):
df = DataFrame(
{
"A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
"B": np.random.default_rng(2).standard_normal(1000),
}
)
idf = df.set_index("A")
assert isinstance(idf.index, DatetimeIndex)
def test_set_index_dst(self):
di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific")
df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()
# single level
res = df.set_index("index")
exp = DataFrame(
data={"a": [0, 1, 2], "b": [3, 4, 5]},
index=Index(di, name="index"),
)
exp.index = exp.index._with_freq(None)
tm.assert_frame_equal(res, exp)
# GH#12920
res = df.set_index(["index", "a"])
exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])
exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp)
def test_set_index(self, float_string_frame):
df = float_string_frame
idx = Index(np.arange(len(df))[::-1])
df = df.set_index(idx)
tm.assert_index_equal(df.index, idx)
with pytest.raises(ValueError, match="Length mismatch"):
df.set_index(idx[::2])
def test_set_index_names(self):
df = DataFrame(
np.ones((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(10)], dtype=object),
)
df.index.name = "name"
assert df.set_index(df.index).index.names == ["name"]
mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])
mi2 = MultiIndex.from_arrays(
df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]
)
df = df.set_index(["A", "B"])
assert df.set_index(df.index).index.names == ["A", "B"]
# Check that set_index isn't converting a MultiIndex into an Index
assert isinstance(df.set_index(df.index).index, MultiIndex)
# Check actual equality
tm.assert_index_equal(df.set_index(df.index).index, mi)
idx2 = df.index.rename(["C", "D"])
# Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
# than a pair of tuples
assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)
# Check equality
tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):
df = frame_of_index_cols
if isinstance(keys, list):
idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
else:
idx = Index(df[keys], name=keys)
expected = df.drop(keys, axis=1) if drop else df
expected.index = idx
if inplace:
result = df.copy()
return_value = result.set_index(keys, drop=drop, inplace=True)
assert return_value is None
else:
result = df.set_index(keys, drop=drop)
tm.assert_frame_equal(result, expected)
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
df = frame_of_index_cols
keys = keys if isinstance(keys, list) else [keys]
idx = MultiIndex.from_arrays(
[df.index] + [df[x] for x in keys], names=[None] + keys
)
expected = df.drop(keys, axis=1) if drop else df.copy()
expected.index = idx
result = df.set_index(keys, drop=drop, append=True)
tm.assert_frame_equal(result, expected)
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):
# append to existing multiindex
df = frame_of_index_cols.set_index(["D"], drop=drop, append=True)
keys = keys if isinstance(keys, list) else [keys]
expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True)
result = df.set_index(keys, drop=drop, append=True)
tm.assert_frame_equal(result, expected)
def test_set_index_after_mutation(self):
# GH#1590
df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})
expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))
df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
result = df2.set_index("key")
tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize(
"box",
[
Series,
Index,
np.array,
list,
lambda x: [list(x)],
lambda x: MultiIndex.from_arrays([x]),
],
)
@pytest.mark.parametrize(
"append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)]
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_single_array(
self, frame_of_index_cols, drop, append, index_name, box
):
df = frame_of_index_cols
df.index.name = index_name
key = box(df["B"])
if box == list:
# list of strings gets interpreted as list of keys
msg = "['one', 'two', 'three', 'one', 'two']"
with pytest.raises(KeyError, match=msg):
df.set_index(key, drop=drop, append=append)
else:
# np.array/list-of-list "forget" the name of B
name_mi = getattr(key, "names", None)
name = [getattr(key, "name", None)] if name_mi is None else name_mi
result = df.set_index(key, drop=drop, append=append)
# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(["B"], drop=False, append=append)
expected.index.names = [index_name] + name if append else name
tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize(
"box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]
)
@pytest.mark.parametrize(
"append, index_name",
[(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_arrays(
self, frame_of_index_cols, drop, append, index_name, box
):
df = frame_of_index_cols
df.index.name = index_name
keys = ["A", box(df["B"])]
# np.array/list "forget" the name of B
names = ["A", None if box in [np.array, list, tuple, iter] else "B"]
result = df.set_index(keys, drop=drop, append=append)
# only valid column keys are dropped
# since B is always passed as array above, only A is dropped, if at all
expected = df.set_index(["A", "B"], drop=False, append=append)
expected = expected.drop("A", axis=1) if drop else expected
expected.index.names = [index_name] + names if append else names
tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize(
"box2",
[
Series,
Index,
np.array,
list,
iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name,
],
)
@pytest.mark.parametrize(
"box1",
[
Series,
Index,
np.array,
list,
iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name,
],
)
@pytest.mark.parametrize(
"append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_arrays_duplicate(
self, frame_of_index_cols, drop, append, index_name, box1, box2
):
df = frame_of_index_cols
df.index.name = index_name
keys = [box1(df["A"]), box2(df["A"])]
result = df.set_index(keys, drop=drop, append=append)
# if either box is iter, it has been consumed; re-read
keys = [box1(df["A"]), box2(df["A"])]
# need to adapt first drop for case that both keys are 'A' --
# cannot drop the same column twice;
# plain == would give ambiguous Boolean error for containers
first_drop = (
False
if (
isinstance(keys[0], str)
and keys[0] == "A"
and isinstance(keys[1], str)
and keys[1] == "A"
)
else drop
)
# to test against already-tested behaviour, we add sequentially,
# hence second append always True; must wrap keys in list, otherwise
# box = list would be interpreted as keys
expected = df.set_index([keys[0]], drop=first_drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("append", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols
keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"])
result = df.set_index(keys, drop=drop, append=append)
# setting with a MultiIndex will never drop columns
expected = df.set_index(["A", "B"], drop=False, append=append)
tm.assert_frame_equal(result, expected)
def test_construction_with_categorical_index(self):
ci = CategoricalIndex(list("ab") * 5, name="B")
# with Categorical
df = DataFrame(
{"A": np.random.default_rng(2).standard_normal(10), "B": ci.values}
)
idf = df.set_index("B")
tm.assert_index_equal(idf.index, ci)
# from a CategoricalIndex
df = DataFrame({"A": np.random.default_rng(2).standard_normal(10), "B": ci})
idf = df.set_index("B")
tm.assert_index_equal(idf.index, ci)
# round-trip
idf = idf.reset_index().set_index("B")
tm.assert_index_equal(idf.index, ci)
def test_set_index_preserve_categorical_dtype(self):
# GH#13743, GH#13854
df = DataFrame(
{
"A": [1, 2, 1, 1, 2],
"B": [10, 16, 22, 28, 34],
"C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
"C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
}
)
for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]:
result = df.set_index(cols).reset_index()
result = result.reindex(columns=df.columns)
tm.assert_frame_equal(result, df)
def test_set_index_datetime(self):
# GH#3950
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"datetime": [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
],
"value": range(6),
}
)
df.index = to_datetime(df.pop("datetime"), utc=True)
df.index = df.index.tz_convert("US/Pacific")
expected = DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
name="datetime",
)
expected = expected.tz_localize("UTC").tz_convert("US/Pacific")
df = df.set_index("label", append=True)
tm.assert_index_equal(df.index.levels[0], expected)
tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
assert df.index.names == ["datetime", "label"]
df = df.swaplevel(0, 1)
tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
tm.assert_index_equal(df.index.levels[1], expected)
assert df.index.names == ["label", "datetime"]
df = DataFrame(np.random.default_rng(2).random(6))
idx1 = DatetimeIndex(
[
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
],
tz="US/Eastern",
)
idx2 = DatetimeIndex(
[
"2012-04-01 09:00",
"2012-04-01 09:00",
"2012-04-01 09:00",
"2012-04-02 09:00",
"2012-04-02 09:00",
"2012-04-02 09:00",
],
tz="US/Eastern",
)
idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
idx3 = idx3._with_freq(None)
df = df.set_index(idx1)
df = df.set_index(idx2, append=True)
df = df.set_index(idx3, append=True)
expected1 = DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
tz="US/Eastern",
)
expected2 = DatetimeIndex(
["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
)
tm.assert_index_equal(df.index.levels[0], expected1)
tm.assert_index_equal(df.index.levels[1], expected2)
tm.assert_index_equal(df.index.levels[2], idx3)
# GH#7092
tm.assert_index_equal(df.index.get_level_values(0), idx1)
tm.assert_index_equal(df.index.get_level_values(1), idx2)
tm.assert_index_equal(df.index.get_level_values(2), idx3)
def test_set_index_period(self):
# GH#6631
df = DataFrame(np.random.default_rng(2).random(6))
idx1 = period_range("2011-01-01", periods=3, freq="M")
idx1 = idx1.append(idx1)
idx2 = period_range("2013-01-01 09:00", periods=2, freq="h")
idx2 = idx2.append(idx2).append(idx2)
idx3 = period_range("2005", periods=6, freq="Y")
df = df.set_index(idx1)
df = df.set_index(idx2, append=True)
df = df.set_index(idx3, append=True)
expected1 = period_range("2011-01-01", periods=3, freq="M")
expected2 = period_range("2013-01-01 09:00", periods=2, freq="h")
tm.assert_index_equal(df.index.levels[0], expected1)
tm.assert_index_equal(df.index.levels[1], expected2)
tm.assert_index_equal(df.index.levels[2], idx3)
tm.assert_index_equal(df.index.get_level_values(0), idx1)
tm.assert_index_equal(df.index.get_level_values(1), idx2)
tm.assert_index_equal(df.index.get_level_values(2), idx3)
class TestSetIndexInvalid:
    """Error paths of ``DataFrame.set_index``: duplicates, bad keys, types, lengths."""

    def test_set_index_verify_integrity(self, frame_of_index_cols):
        """``verify_integrity=True`` rejects an index that contains duplicates."""
        df = frame_of_index_cols

        with pytest.raises(ValueError, match="Index has duplicate keys"):
            df.set_index("A", verify_integrity=True)
        # with MultiIndex
        with pytest.raises(ValueError, match="Index has duplicate keys"):
            df.set_index([df["A"], df["A"]], verify_integrity=True)

    @pytest.mark.parametrize("append", [True, False])
    @pytest.mark.parametrize("drop", [True, False])
    def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
        """Unknown column keys raise ``KeyError`` naming the missing keys."""
        df = frame_of_index_cols

        # ``match`` is a regular expression applied with ``re.search``; the
        # brackets/parentheses are escaped so the literal list of missing keys
        # is checked instead of an (almost always matching) character class.
        with pytest.raises(KeyError, match=r"\['foo', 'bar', 'baz'\]"):
            # column names are A-E, as well as one tuple
            df.set_index(["foo", "bar", "baz"], drop=drop, append=append)

        # non-existent key in list with arrays
        with pytest.raises(KeyError, match="X"):
            df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)

        msg = r"\[\('foo', 'foo', 'foo', 'bar', 'bar'\)\]"
        # tuples always raise KeyError
        with pytest.raises(KeyError, match=msg):
            df.set_index(tuple(df["A"]), drop=drop, append=append)

        # also within a list
        with pytest.raises(KeyError, match=msg):
            df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)

    @pytest.mark.parametrize("append", [True, False])
    @pytest.mark.parametrize("drop", [True, False])
    @pytest.mark.parametrize("box", [set], ids=["set"])
    def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):
        """Keys of a forbidden container type raise ``TypeError``."""
        df = frame_of_index_cols

        msg = 'The parameter "keys" may be a column key, .*'
        # forbidden type, e.g. set
        with pytest.raises(TypeError, match=msg):
            df.set_index(box(df["A"]), drop=drop, append=append)

        # forbidden type in list, e.g. set
        with pytest.raises(TypeError, match=msg):
            df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)

    # MultiIndex constructor does not work directly on Series -> lambda
    @pytest.mark.parametrize(
        "box",
        [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],
        ids=["Series", "Index", "np.array", "iter", "MultiIndex"],
    )
    @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])
    @pytest.mark.parametrize("append", [True, False])
    @pytest.mark.parametrize("drop", [True, False])
    def test_set_index_raise_on_len(
        self, frame_of_index_cols, box, length, drop, append
    ):
        """Array-like keys whose length differs from the frame raise ``ValueError``."""
        # GH 24984
        df = frame_of_index_cols  # has length 5

        values = np.random.default_rng(2).integers(0, 10, (length,))

        msg = "Length mismatch: Expected 5 rows, received array of length.*"
        # wrong length directly
        with pytest.raises(ValueError, match=msg):
            df.set_index(box(values), drop=drop, append=append)

        # wrong length in list
        with pytest.raises(ValueError, match=msg):
            df.set_index(["A", df.A, box(values)], drop=drop, append=append)
class TestSetIndexCustomLabelType:
def test_set_index_custom_label_type(self):
# GH#24969
class Thing:
def __init__(self, name, color) -> None:
self.name = name
self.color = color
def __str__(self) -> str:
return f"<Thing {repr(self.name)}>"
# necessary for pretty KeyError
__repr__ = __str__
thing1 = Thing("One", "red")
thing2 = Thing("Two", "blue")
df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
# use custom label directly
result = df.set_index(thing2)
tm.assert_frame_equal(result, expected)
# custom label wrapped in list
result = df.set_index([thing2])
tm.assert_frame_equal(result, expected)
# missing key
thing3 = Thing("Three", "pink")
msg = "<Thing 'Three'>"
with pytest.raises(KeyError, match=msg):
# missing label directly
df.set_index(thing3)
with pytest.raises(KeyError, match=msg):
# missing label in list
df.set_index([thing3])
def test_set_index_custom_label_hashable_iterable(self):
# GH#24969
# actual example discussed in GH 24984 was e.g. for shapely.geometry
# objects (e.g. a collection of Points) that can be both hashable and
# iterable; using frozenset as a stand-in for testing here
class Thing(frozenset):
# need to stabilize repr for KeyError (due to random order in sets)
def __repr__(self) -> str:
tmp = sorted(self)
joined_reprs = ", ".join(map(repr, tmp))
# double curly brace prints one brace in format string
return f"frozenset({{{joined_reprs}}})"
thing1 = Thing(["One", "red"])
thing2 = Thing(["Two", "blue"])
df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
# use custom label directly
result = df.set_index(thing2)
tm.assert_frame_equal(result, expected)
# custom label wrapped in list
result = df.set_index([thing2])
tm.assert_frame_equal(result, expected)
# missing key
thing3 = Thing(["Three", "pink"])
msg = r"frozenset\(\{'Three', 'pink'\}\)"
with pytest.raises(KeyError, match=msg):
# missing label directly
df.set_index(thing3)
with pytest.raises(KeyError, match=msg):
# missing label in list
df.set_index([thing3])
def test_set_index_custom_label_type_raises(self):
# GH#24969
# purposefully inherit from something unhashable
class Thing(set):
def __init__(self, name, color) -> None:
self.name = name
self.color = color
def __str__(self) -> str:
return f"<Thing {repr(self.name)}>"
thing1 = Thing("One", "red")
thing2 = Thing("Two", "blue")
df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
msg = 'The parameter "keys" may be a column key, .*'
with pytest.raises(TypeError, match=msg):
# use custom label directly
df.set_index(thing2)
with pytest.raises(TypeError, match=msg):
# custom label wrapped in list
df.set_index([thing2])
def test_set_index_periodindex(self):
# GH#6631
df = DataFrame(np.random.default_rng(2).random(6))
idx1 = period_range("2011/01/01", periods=6, freq="M")
idx2 = period_range("2013", periods=6, freq="Y")
df = df.set_index(idx1)
tm.assert_index_equal(df.index, idx1)
df = df.set_index(idx2)
tm.assert_index_equal(df.index, idx2)

View File

@ -0,0 +1,764 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
CategoricalIndex,
DataFrame,
Index,
NaT,
Series,
date_range,
offsets,
)
import pandas._testing as tm
class TestDataFrameShift:
def test_shift_axis1_with_valid_fill_value_one_array(self):
# Case with axis=1 that does not go through the "len(arrays)>1" path
# in DataFrame.shift
data = np.random.default_rng(2).standard_normal((5, 3))
df = DataFrame(data)
res = df.shift(axis=1, periods=1, fill_value=12345)
expected = df.T.shift(periods=1, fill_value=12345).T
tm.assert_frame_equal(res, expected)
# same but with an 1D ExtensionArray backing it
df2 = df[[0]].astype("Float64")
res2 = df2.shift(axis=1, periods=1, fill_value=12345)
expected2 = DataFrame([12345] * 5, dtype="Float64")
tm.assert_frame_equal(res2, expected2)
def test_shift_deprecate_freq_and_fill_value(self, frame_or_series):
# Can't pass both!
obj = frame_or_series(
np.random.default_rng(2).standard_normal(5),
index=date_range("1/1/2000", periods=5, freq="h"),
)
msg = (
"Passing a 'freq' together with a 'fill_value' silently ignores the "
"fill_value"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
obj.shift(1, fill_value=1, freq="h")
if frame_or_series is DataFrame:
obj.columns = date_range("1/1/2000", periods=1, freq="h")
with tm.assert_produces_warning(FutureWarning, match=msg):
obj.shift(1, axis=1, fill_value=1, freq="h")
@pytest.mark.parametrize(
"input_data, output_data",
[(np.empty(shape=(0,)), []), (np.ones(shape=(2,)), [np.nan, 1.0])],
)
def test_shift_non_writable_array(self, input_data, output_data, frame_or_series):
# GH21049 Verify whether non writable numpy array is shiftable
input_data.setflags(write=False)
result = frame_or_series(input_data).shift(1)
if frame_or_series is not Series:
# need to explicitly specify columns in the empty case
expected = frame_or_series(
output_data,
index=range(len(output_data)),
columns=range(1),
dtype="float64",
)
else:
expected = frame_or_series(output_data, dtype="float64")
tm.assert_equal(result, expected)
def test_shift_mismatched_freq(self, frame_or_series):
ts = frame_or_series(
np.random.default_rng(2).standard_normal(5),
index=date_range("1/1/2000", periods=5, freq="h"),
)
result = ts.shift(1, freq="5min")
exp_index = ts.index.shift(1, freq="5min")
tm.assert_index_equal(result.index, exp_index)
# GH#1063, multiple of same base
result = ts.shift(1, freq="4h")
exp_index = ts.index + offsets.Hour(4)
tm.assert_index_equal(result.index, exp_index)
@pytest.mark.parametrize(
"obj",
[
Series([np.arange(5)]),
date_range("1/1/2011", periods=24, freq="h"),
Series(range(5), index=date_range("2017", periods=5)),
],
)
@pytest.mark.parametrize("shift_size", [0, 1, 2])
def test_shift_always_copy(self, obj, shift_size, frame_or_series):
# GH#22397
if frame_or_series is not Series:
obj = obj.to_frame()
assert obj.shift(shift_size) is not obj
def test_shift_object_non_scalar_fill(self):
    """Non-scalar ``fill_value`` is rejected, except for object dtype."""
    # shift requires scalar fill_value except for object dtype
    scalar_only = "fill_value must be a scalar"

    int_ser = Series(range(3))
    with pytest.raises(ValueError, match=scalar_only):
        int_ser.shift(1, fill_value=[])

    int_frame = int_ser.to_frame()
    with pytest.raises(ValueError, match=scalar_only):
        int_frame.shift(1, fill_value=np.arange(3))

    obj_ser = int_ser.astype(object)
    shifted_ser = obj_ser.shift(1, fill_value={})
    assert shifted_ser[0] == {}

    obj_frame = obj_ser.to_frame()
    shifted_frame = obj_frame.shift(1, fill_value={})
    assert shifted_frame.iloc[0, 0] == {}
def test_shift_int(self, datetime_frame, frame_or_series):
ts = tm.get_obj(datetime_frame, frame_or_series).astype(int)
shifted = ts.shift(1)
expected = ts.astype(float).shift(1)
tm.assert_equal(shifted, expected)
@pytest.mark.parametrize("dtype", ["int32", "int64"])
def test_shift_32bit_take(self, frame_or_series, dtype):
# 32-bit taking
# GH#8129
index = date_range("2000-01-01", periods=5)
arr = np.arange(5, dtype=dtype)
s1 = frame_or_series(arr, index=index)
p = arr[1]
result = s1.shift(periods=p)
expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("periods", [1, 2, 3, 4])
def test_shift_preserve_freqstr(self, periods, frame_or_series):
# GH#21275
obj = frame_or_series(
range(periods),
index=date_range("2016-1-1 00:00:00", periods=periods, freq="h"),
)
result = obj.shift(1, "2h")
expected = frame_or_series(
range(periods),
index=date_range("2016-1-1 02:00:00", periods=periods, freq="h"),
)
tm.assert_equal(result, expected)
def test_shift_dst(self, frame_or_series):
# GH#13926
dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern")
obj = frame_or_series(dates)
res = obj.shift(0)
tm.assert_equal(res, obj)
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
res = obj.shift(1)
exp_vals = [NaT] + dates.astype(object).values.tolist()[:9]
exp = frame_or_series(exp_vals)
tm.assert_equal(res, exp)
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
res = obj.shift(-2)
exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT]
exp = frame_or_series(exp_vals)
tm.assert_equal(res, exp)
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
@pytest.mark.parametrize("ex", [10, -10, 20, -20])
def test_shift_dst_beyond(self, frame_or_series, ex):
# GH#13926
dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern")
obj = frame_or_series(dates)
res = obj.shift(ex)
exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]")
tm.assert_equal(res, exp)
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
def test_shift_by_zero(self, datetime_frame, frame_or_series):
# shift by 0
obj = tm.get_obj(datetime_frame, frame_or_series)
unshifted = obj.shift(0)
tm.assert_equal(unshifted, obj)
def test_shift(self, datetime_frame):
# naive shift
ser = datetime_frame["A"]
shifted = datetime_frame.shift(5)
tm.assert_index_equal(shifted.index, datetime_frame.index)
shifted_ser = ser.shift(5)
tm.assert_series_equal(shifted["A"], shifted_ser)
shifted = datetime_frame.shift(-5)
tm.assert_index_equal(shifted.index, datetime_frame.index)
shifted_ser = ser.shift(-5)
tm.assert_series_equal(shifted["A"], shifted_ser)
unshifted = datetime_frame.shift(5).shift(-5)
tm.assert_numpy_array_equal(
unshifted.dropna().values, datetime_frame.values[:-5]
)
unshifted_ser = ser.shift(5).shift(-5)
tm.assert_numpy_array_equal(unshifted_ser.dropna().values, ser.values[:-5])
def test_shift_by_offset(self, datetime_frame, frame_or_series):
# shift by DateOffset
obj = tm.get_obj(datetime_frame, frame_or_series)
offset = offsets.BDay()
shifted = obj.shift(5, freq=offset)
assert len(shifted) == len(obj)
unshifted = shifted.shift(-5, freq=offset)
tm.assert_equal(unshifted, obj)
shifted2 = obj.shift(5, freq="B")
tm.assert_equal(shifted, shifted2)
unshifted = obj.shift(0, freq=offset)
tm.assert_equal(unshifted, obj)
d = obj.index[0]
shifted_d = d + offset * 5
if frame_or_series is DataFrame:
tm.assert_series_equal(obj.xs(d), shifted.xs(shifted_d), check_names=False)
else:
tm.assert_almost_equal(obj.at[d], shifted.at[shifted_d])
def test_shift_with_periodindex(self, frame_or_series):
    """Positional and freq-based shifting of an object with a PeriodIndex."""
    # Shifting with PeriodIndex
    frame = DataFrame(
        np.arange(4, dtype=float), index=pd.period_range("2020-01-01", periods=4)
    )
    ps = tm.get_obj(frame, frame_or_series)

    shifted = ps.shift(1)
    unshifted = shifted.shift(-1)
    tm.assert_index_equal(shifted.index, ps.index)
    tm.assert_index_equal(unshifted.index, ps.index)
    if frame_or_series is DataFrame:
        recovered = unshifted.iloc[:, 0].dropna().values
        original = ps.iloc[:-1, 0].values
    else:
        recovered = unshifted.dropna().values
        original = ps.values[:-1]
    tm.assert_numpy_array_equal(recovered, original)

    by_alias = ps.shift(1, "D")
    by_offset = ps.shift(1, offsets.Day())
    tm.assert_equal(by_alias, by_offset)
    tm.assert_equal(ps, by_alias.shift(-1, "D"))

    msg = "does not match PeriodIndex freq"
    with pytest.raises(ValueError, match=msg):
        ps.shift(freq="W")

    # legacy support
    kw_alias = ps.shift(1, freq="D")
    tm.assert_equal(by_alias, kw_alias)

    kw_offset = ps.shift(1, freq=offsets.Day())
    tm.assert_equal(kw_offset, kw_alias)
def test_shift_other_axis(self):
# shift other axis
# GH#6371
df = DataFrame(np.random.default_rng(2).random((10, 5)))
expected = pd.concat(
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
ignore_index=True,
axis=1,
)
result = df.shift(1, axis=1)
tm.assert_frame_equal(result, expected)
def test_shift_named_axis(self):
# shift named axis
df = DataFrame(np.random.default_rng(2).random((10, 5)))
expected = pd.concat(
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
ignore_index=True,
axis=1,
)
result = df.shift(1, axis="columns")
tm.assert_frame_equal(result, expected)
def test_shift_other_axis_with_freq(self, datetime_frame):
obj = datetime_frame.T
offset = offsets.BDay()
# GH#47039
shifted = obj.shift(5, freq=offset, axis=1)
assert len(shifted) == len(obj)
unshifted = shifted.shift(-5, freq=offset, axis=1)
tm.assert_equal(unshifted, obj)
def test_shift_bool(self):
df = DataFrame({"high": [True, False], "low": [False, False]})
rs = df.shift(1)
xp = DataFrame(
np.array([[np.nan, np.nan], [True, False]], dtype=object),
columns=["high", "low"],
)
tm.assert_frame_equal(rs, xp)
def test_shift_categorical1(self, frame_or_series):
# GH#9416
obj = frame_or_series(["a", "b", "c", "d"], dtype="category")
rt = obj.shift(1).shift(-1)
tm.assert_equal(obj.iloc[:-1], rt.dropna())
def get_cat_values(ndframe):
# For Series we could just do ._values; for DataFrame
# we may be able to do this if we ever have 2D Categoricals
return ndframe._mgr.arrays[0]
cat = get_cat_values(obj)
sp1 = obj.shift(1)
tm.assert_index_equal(obj.index, sp1.index)
assert np.all(get_cat_values(sp1).codes[:1] == -1)
assert np.all(cat.codes[:-1] == get_cat_values(sp1).codes[1:])
sn2 = obj.shift(-2)
tm.assert_index_equal(obj.index, sn2.index)
assert np.all(get_cat_values(sn2).codes[-2:] == -1)
assert np.all(cat.codes[2:] == get_cat_values(sn2).codes[:-2])
tm.assert_index_equal(cat.categories, get_cat_values(sp1).categories)
tm.assert_index_equal(cat.categories, get_cat_values(sn2).categories)
def test_shift_categorical(self):
# GH#9416
s1 = Series(["a", "b", "c"], dtype="category")
s2 = Series(["A", "B", "C"], dtype="category")
df = DataFrame({"one": s1, "two": s2})
rs = df.shift(1)
xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
tm.assert_frame_equal(rs, xp)
def test_shift_categorical_fill_value(self, frame_or_series):
    """``fill_value`` must be an existing category when shifting categoricals."""
    categories = ["a", "b", "c", "d"]
    ts = frame_or_series(["a", "b", "c", "d"], dtype="category")

    shifted = ts.shift(1, fill_value="a")
    expected_values = pd.Categorical(
        ["a", "a", "b", "c"], categories=categories, ordered=False
    )
    tm.assert_equal(shifted, frame_or_series(expected_values))

    # check for incorrect fill_value
    msg = r"Cannot setitem on a Categorical with a new category \(f\)"
    with pytest.raises(TypeError, match=msg):
        ts.shift(1, fill_value="f")
def test_shift_fill_value(self, frame_or_series):
# GH#24128
dti = date_range("1/1/2000", periods=5, freq="h")
ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti)
exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti)
# check that fill value works
result = ts.shift(1, fill_value=0.0)
tm.assert_equal(result, exp)
exp = frame_or_series([0.0, 0.0, 1.0, 2.0, 3.0], index=dti)
result = ts.shift(2, fill_value=0.0)
tm.assert_equal(result, exp)
ts = frame_or_series([1, 2, 3])
res = ts.shift(2, fill_value=0)
assert tm.get_dtype(res) == tm.get_dtype(ts)
# retain integer dtype
obj = frame_or_series([1, 2, 3, 4, 5], index=dti)
exp = frame_or_series([0, 1, 2, 3, 4], index=dti)
result = obj.shift(1, fill_value=0)
tm.assert_equal(result, exp)
exp = frame_or_series([0, 0, 1, 2, 3], index=dti)
result = obj.shift(2, fill_value=0)
tm.assert_equal(result, exp)
def test_shift_empty(self):
# Regression test for GH#8019
df = DataFrame({"foo": []})
rs = df.shift(-1)
tm.assert_frame_equal(df, rs)
def test_shift_duplicate_columns(self):
# GH#9092; verify that position-based shifting works
# in the presence of duplicate columns
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
data = np.random.default_rng(2).standard_normal((20, 5))
shifted = []
for columns in column_lists:
df = DataFrame(data.copy(), columns=columns)
for s in range(5):
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
df.columns = range(5)
shifted.append(df)
# sanity check the base case
nulls = shifted[0].isna().sum()
tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64"))
# check all answers are the same
tm.assert_frame_equal(shifted[0], shifted[1])
tm.assert_frame_equal(shifted[0], shifted[2])
def test_shift_axis1_multiple_blocks(self, using_array_manager):
# GH#35488
df1 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 3)))
df2 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 2)))
df3 = pd.concat([df1, df2], axis=1)
if not using_array_manager:
assert len(df3._mgr.blocks) == 2
result = df3.shift(2, axis=1)
expected = df3.take([-1, -1, 0, 1, 2], axis=1)
# Explicit cast to float to avoid implicit cast when setting nan.
# Column names aren't unique, so directly calling `expected.astype` won't work.
expected = expected.pipe(
lambda df: df.set_axis(range(df.shape[1]), axis=1)
.astype({0: "float", 1: "float"})
.set_axis(df.columns, axis=1)
)
expected.iloc[:, :2] = np.nan
expected.columns = df3.columns
tm.assert_frame_equal(result, expected)
# Case with periods < 0
# rebuild df3 because `take` call above consolidated
df3 = pd.concat([df1, df2], axis=1)
if not using_array_manager:
assert len(df3._mgr.blocks) == 2
result = df3.shift(-2, axis=1)
expected = df3.take([2, 3, 4, -1, -1], axis=1)
# Explicit cast to float to avoid implicit cast when setting nan.
# Column names aren't unique, so directly calling `expected.astype` won't work.
expected = expected.pipe(
lambda df: df.set_axis(range(df.shape[1]), axis=1)
.astype({3: "float", 4: "float"})
.set_axis(df.columns, axis=1)
)
expected.iloc[:, -2:] = np.nan
expected.columns = df3.columns
tm.assert_frame_equal(result, expected)
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support
def test_shift_axis1_multiple_blocks_with_int_fill(self):
# GH#42719
rng = np.random.default_rng(2)
df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int))
df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int))
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
result = df3.shift(2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2
expected = df3.take([-1, -1, 0, 1], axis=1)
expected.iloc[:, :2] = np.int_(0)
expected.columns = df3.columns
tm.assert_frame_equal(result, expected)
# Case with periods < 0
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
result = df3.shift(-2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2
expected = df3.take([2, 3, -1, -1], axis=1)
expected.iloc[:, -2:] = np.int_(0)
expected.columns = df3.columns
tm.assert_frame_equal(result, expected)
def test_period_index_frame_shift_with_freq(self, frame_or_series):
ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4))
ps = tm.get_obj(ps, frame_or_series)
shifted = ps.shift(1, freq="infer")
unshifted = shifted.shift(-1, freq="infer")
tm.assert_equal(unshifted, ps)
shifted2 = ps.shift(freq="D")
tm.assert_equal(shifted, shifted2)
shifted3 = ps.shift(freq=offsets.Day())
tm.assert_equal(shifted, shifted3)
def test_datetime_frame_shift_with_freq(self, datetime_frame, frame_or_series):
dtobj = tm.get_obj(datetime_frame, frame_or_series)
shifted = dtobj.shift(1, freq="infer")
unshifted = shifted.shift(-1, freq="infer")
tm.assert_equal(dtobj, unshifted)
shifted2 = dtobj.shift(freq=dtobj.index.freq)
tm.assert_equal(shifted, shifted2)
inferred_ts = DataFrame(
datetime_frame.values,
Index(np.asarray(datetime_frame.index)),
columns=datetime_frame.columns,
)
inferred_ts = tm.get_obj(inferred_ts, frame_or_series)
shifted = inferred_ts.shift(1, freq="infer")
expected = dtobj.shift(1, freq="infer")
expected.index = expected.index._with_freq(None)
tm.assert_equal(shifted, expected)
unshifted = shifted.shift(-1, freq="infer")
tm.assert_equal(unshifted, inferred_ts)
def test_period_index_frame_shift_with_freq_error(self, frame_or_series):
    """A ``freq`` that differs from the PeriodIndex freq raises ``ValueError``."""
    daily_index = pd.period_range("2020-01-01", periods=4)
    ps = tm.get_obj(DataFrame(range(4), index=daily_index), frame_or_series)

    msg = "Given freq M does not match PeriodIndex freq D"
    with pytest.raises(ValueError, match=msg):
        ps.shift(freq="M")
def test_datetime_frame_shift_with_freq_error(
    self, datetime_frame, frame_or_series
):
    """``freq="infer"`` raises when the selected rows carry no index freq."""
    dtobj = tm.get_obj(datetime_frame, frame_or_series)
    # rows picked at positions 0, 5 and 7
    irregular = dtobj.iloc[[0, 5, 7]]

    msg = "Freq was not set in the index hence cannot be inferred"
    with pytest.raises(ValueError, match=msg):
        irregular.shift(freq="infer")
def test_shift_dt64values_int_fill_deprecated(self):
    """Integer ``fill_value`` on datetime64 data: rejected on rows, kept on axis=1."""
    # GH#31971
    ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])

    with pytest.raises(TypeError, match="value should be a"):
        ser.shift(1, fill_value=0)

    as_frame = ser.to_frame()
    with pytest.raises(TypeError, match="value should be a"):
        as_frame.shift(1, fill_value=0)

    # axis = 1
    consolidated = DataFrame({"A": ser, "B": ser})
    consolidated._consolidate_inplace()

    result = consolidated.shift(1, axis=1, fill_value=0)
    expected = DataFrame({"A": [0, 0], "B": consolidated["A"]})
    tm.assert_frame_equal(result, expected)

    # same thing but not consolidated; pre-2.0 we got different behavior
    split = DataFrame({"A": ser})
    split["B"] = ser
    assert len(split._mgr.arrays) == 2
    result = split.shift(1, axis=1, fill_value=0)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"as_cat",
[
pytest.param(
True,
marks=pytest.mark.xfail(
reason="_can_hold_element incorrectly always returns True"
),
),
False,
],
)
@pytest.mark.parametrize(
"vals",
[
date_range("2020-01-01", periods=2),
date_range("2020-01-01", periods=2, tz="US/Pacific"),
pd.period_range("2020-01-01", periods=2, freq="D"),
pd.timedelta_range("2020 Days", periods=2, freq="D"),
pd.interval_range(0, 3, periods=2),
pytest.param(
pd.array([1, 2], dtype="Int64"),
marks=pytest.mark.xfail(
reason="_can_hold_element incorrectly always returns True"
),
),
pytest.param(
pd.array([1, 2], dtype="Float32"),
marks=pytest.mark.xfail(
reason="_can_hold_element incorrectly always returns True"
),
),
],
ids=lambda x: str(x.dtype),
)
def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat):
# GH#44564
ser = Series(vals)
if as_cat:
ser = ser.astype("category")
df = DataFrame({"A": ser})
result = df.shift(-1, axis=1, fill_value="foo")
expected = DataFrame({"A": ["foo", "foo"]})
tm.assert_frame_equal(result, expected)
# same thing but multiple blocks
df2 = DataFrame({"A": ser, "B": ser})
df2._consolidate_inplace()
result = df2.shift(-1, axis=1, fill_value="foo")
expected = DataFrame({"A": df2["B"], "B": ["foo", "foo"]})
tm.assert_frame_equal(result, expected)
# same thing but not consolidated
df3 = DataFrame({"A": ser})
df3["B"] = ser
assert len(df3._mgr.arrays) == 2
result = df3.shift(-1, axis=1, fill_value="foo")
tm.assert_frame_equal(result, expected)
def test_shift_axis1_categorical_columns(self):
# GH#38434
ci = CategoricalIndex(["a", "b", "c"])
df = DataFrame(
{"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci
)
result = df.shift(axis=1)
expected = DataFrame(
{"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci
)
tm.assert_frame_equal(result, expected)
# periods != 1
result = df.shift(2, axis=1)
expected = DataFrame(
{"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]},
index=ci[:-1],
columns=ci,
)
tm.assert_frame_equal(result, expected)
def test_shift_axis1_many_periods(self):
# GH#44978 periods > len(columns)
df = DataFrame(np.random.default_rng(2).random((5, 3)))
shifted = df.shift(6, axis=1, fill_value=None)
expected = df * np.nan
tm.assert_frame_equal(shifted, expected)
shifted2 = df.shift(-6, axis=1, fill_value=None)
tm.assert_frame_equal(shifted2, expected)
def test_shift_with_offsets_freq(self):
df = DataFrame({"x": [1, 2, 3]}, index=date_range("2000", periods=3))
shifted = df.shift(freq="1MS")
expected = DataFrame(
{"x": [1, 2, 3]},
index=date_range(start="02/01/2000", end="02/01/2000", periods=3),
)
tm.assert_frame_equal(shifted, expected)
def test_shift_with_iterable_basic_functionality(self):
# GH#44424
data = {"a": [1, 2, 3], "b": [4, 5, 6]}
shifts = [0, 1, 2]
df = DataFrame(data)
shifted = df.shift(shifts)
expected = DataFrame(
{
"a_0": [1, 2, 3],
"b_0": [4, 5, 6],
"a_1": [np.nan, 1.0, 2.0],
"b_1": [np.nan, 4.0, 5.0],
"a_2": [np.nan, np.nan, 1.0],
"b_2": [np.nan, np.nan, 4.0],
}
)
tm.assert_frame_equal(expected, shifted)
def test_shift_with_iterable_series(self):
# GH#44424
data = {"a": [1, 2, 3]}
shifts = [0, 1, 2]
df = DataFrame(data)
s = df["a"]
tm.assert_frame_equal(s.shift(shifts), df.shift(shifts))
def test_shift_with_iterable_freq_and_fill_value(self):
# GH#44424
df = DataFrame(
np.random.default_rng(2).standard_normal(5),
index=date_range("1/1/2000", periods=5, freq="h"),
)
tm.assert_frame_equal(
# rename because shift with an iterable leads to str column names
df.shift([1], fill_value=1).rename(columns=lambda x: int(x[0])),
df.shift(1, fill_value=1),
)
tm.assert_frame_equal(
df.shift([1], freq="h").rename(columns=lambda x: int(x[0])),
df.shift(1, freq="h"),
)
msg = (
"Passing a 'freq' together with a 'fill_value' silently ignores the "
"fill_value"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.shift([1, 2], fill_value=1, freq="h")
def test_shift_with_iterable_check_other_arguments(self):
    """``suffix`` handling and argument validation for list-valued ``periods``."""
    # GH#44424
    df = DataFrame({"a": [1, 2], "b": [4, 5]})
    shifts = [0, 1]

    # test suffix
    only_a = df[["a"]]
    shifted = only_a.shift(shifts, suffix="_suffix")
    expected = DataFrame({"a_suffix_0": [1, 2], "a_suffix_1": [np.nan, 1.0]})
    tm.assert_frame_equal(shifted, expected)

    # check bad inputs when doing multiple shifts
    msg = "If `periods` contains multiple shifts, `axis` cannot be 1."
    with pytest.raises(ValueError, match=msg):
        df.shift(shifts, axis=1)

    msg = "Periods must be integer, but s is <class 'str'>."
    with pytest.raises(TypeError, match=msg):
        df.shift(["s"])

    msg = "If `periods` is an iterable, it cannot be empty."
    with pytest.raises(ValueError, match=msg):
        df.shift([])

    msg = "Cannot specify `suffix` if `periods` is an int."
    with pytest.raises(ValueError, match=msg):
        df.shift(1, suffix="fails")
def test_shift_axis_one_empty(self):
# GH#57301
df = DataFrame()
result = df.shift(1, axis=1)
tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,21 @@
import numpy as np
import pytest
from pandas import DataFrame
@pytest.mark.parametrize(
"data, index, expected",
[
({"col1": [1], "col2": [3]}, None, 2),
({}, None, 0),
({"col1": [1, np.nan], "col2": [3, 4]}, None, 4),
({"col1": [1, 2], "col2": [3, 4]}, [["a", "b"], [1, 2]], 4),
({"col1": [1, 2, 3, 4], "col2": [3, 4, 5, 6]}, ["x", "y", "a", "b"], 8),
],
)
def test_size(data, index, expected):
# GH#52897
df = DataFrame(data, index=index)
assert df.size == expected
assert isinstance(df.size, int)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,940 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
NaT,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.util.version import Version
class TestDataFrameSortValues:
@pytest.mark.parametrize("dtype", [np.uint8, bool])
def test_sort_values_sparse_no_warning(self, dtype):
# GH#45618
ser = pd.Series(Categorical(["a", "b", "a"], categories=["a", "b", "c"]))
df = pd.get_dummies(ser, dtype=dtype, sparse=True)
with tm.assert_produces_warning(None):
# No warnings about constructing Index from SparseArray
df.sort_values(by=df.columns.tolist())
def test_sort_values(self):
    """Sort by one or several labels along either axis; validate bad arguments."""
    frame = DataFrame(
        [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")
    )

    # by column (axis=0)
    result = frame.sort_values(by="A")
    order = frame["A"].argsort().values
    expected = frame.loc[frame.index[order]]
    tm.assert_frame_equal(result, expected)

    result = frame.sort_values(by="A", ascending=False)
    order = order[::-1]
    expected = frame.loc[frame.index[order]]
    tm.assert_frame_equal(result, expected)

    result = frame.sort_values(by="A", ascending=False)
    tm.assert_frame_equal(result, expected)

    # GH4839
    result = frame.sort_values(by=["A"], ascending=[False])
    tm.assert_frame_equal(result, expected)

    # multiple bys
    expected = frame.loc[[2, 1, 3]]
    tm.assert_frame_equal(frame.sort_values(by=["B", "C"]), expected)

    result = frame.sort_values(by=["B", "C"], ascending=False)
    tm.assert_frame_equal(result, expected[::-1])

    result = frame.sort_values(by=["B", "A"], ascending=[True, False])
    tm.assert_frame_equal(result, expected)

    msg = "No axis named 2 for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        frame.sort_values(by=["A", "B"], axis=2, inplace=True)

    # by row (axis=1): GH#10806
    tm.assert_frame_equal(frame.sort_values(by=3, axis=1), frame)

    result = frame.sort_values(by=3, axis=1, ascending=False)
    expected = frame.reindex(columns=["C", "B", "A"])
    tm.assert_frame_equal(result, expected)

    result = frame.sort_values(by=[1, 2], axis="columns")
    expected = frame.reindex(columns=["B", "A", "C"])
    tm.assert_frame_equal(result, expected)

    result = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
    tm.assert_frame_equal(result, expected)

    result = frame.sort_values(by=[1, 3], axis=1, ascending=False)
    expected = frame.reindex(columns=["C", "B", "A"])
    tm.assert_frame_equal(result, expected)

    msg = r"Length of ascending \(5\) != length of by \(2\)"
    with pytest.raises(ValueError, match=msg):
        frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
def test_sort_values_by_empty_list(self):
# https://github.com/pandas-dev/pandas/issues/40258
expected = DataFrame({"a": [1, 4, 2, 5, 3, 6]})
result = expected.sort_values(by=[])
tm.assert_frame_equal(result, expected)
assert result is not expected
def test_sort_values_inplace(self):
frame = DataFrame(
np.random.default_rng(2).standard_normal((4, 4)),
index=[1, 2, 3, 4],
columns=["A", "B", "C", "D"],
)
sorted_df = frame.copy()
return_value = sorted_df.sort_values(by="A", inplace=True)
assert return_value is None
expected = frame.sort_values(by="A")
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
return_value = sorted_df.sort_values(by=1, axis=1, inplace=True)
assert return_value is None
expected = frame.sort_values(by=1, axis=1)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True)
assert return_value is None
expected = frame.sort_values(by="A", ascending=False)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
return_value = sorted_df.sort_values(
by=["A", "B"], ascending=False, inplace=True
)
assert return_value is None
expected = frame.sort_values(by=["A", "B"], ascending=False)
tm.assert_frame_equal(sorted_df, expected)
def test_sort_values_multicolumn(self):
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
np.random.default_rng(2).shuffle(A)
np.random.default_rng(2).shuffle(B)
frame = DataFrame(
{"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)}
)
result = frame.sort_values(by=["A", "B"])
indexer = np.lexsort((frame["B"], frame["A"]))
expected = frame.take(indexer)
tm.assert_frame_equal(result, expected)
result = frame.sort_values(by=["A", "B"], ascending=False)
indexer = np.lexsort(
(frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))
)
expected = frame.take(indexer)
tm.assert_frame_equal(result, expected)
result = frame.sort_values(by=["B", "A"])
indexer = np.lexsort((frame["A"], frame["B"]))
expected = frame.take(indexer)
tm.assert_frame_equal(result, expected)
def test_sort_values_multicolumn_uint64(self):
# GH#9918
# uint64 multicolumn sort
df = DataFrame(
{
"a": pd.Series([18446637057563306014, 1162265347240853609]),
"b": pd.Series([1, 2]),
}
)
df["a"] = df["a"].astype(np.uint64)
result = df.sort_values(["a", "b"])
expected = DataFrame(
{
"a": pd.Series([18446637057563306014, 1162265347240853609]),
"b": pd.Series([1, 2]),
},
index=pd.Index([1, 0]),
)
tm.assert_frame_equal(result, expected)
def test_sort_values_nan(self):
# GH#3917
df = DataFrame(
{"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
)
# sort one column only
expected = DataFrame(
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A"], na_position="first")
tm.assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3],
)
sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
tm.assert_frame_equal(sorted_df, expected)
expected = df.reindex(columns=["B", "A"])
sorted_df = df.sort_values(by=1, axis=1, na_position="first")
tm.assert_frame_equal(sorted_df, expected)
# na_position='last', order
expected = DataFrame(
{"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]},
index=[3, 0, 1, 6, 4, 5, 2],
)
sorted_df = df.sort_values(["A", "B"])
tm.assert_frame_equal(sorted_df, expected)
# na_position='first', order
expected = DataFrame(
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]},
index=[2, 3, 0, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A", "B"], na_position="first")
tm.assert_frame_equal(sorted_df, expected)
# na_position='first', not order
expected = DataFrame(
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
tm.assert_frame_equal(sorted_df, expected)
# na_position='last', not order
expected = DataFrame(
{"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2],
)
sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
tm.assert_frame_equal(sorted_df, expected)
def test_sort_values_stable_descending_sort(self):
# GH#6399
df = DataFrame(
[[2, "first"], [2, "second"], [1, "a"], [1, "b"]],
columns=["sort_col", "order"],
)
sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
tm.assert_frame_equal(df, sorted_df)
@pytest.mark.parametrize(
    "expected_idx_non_na, ascending",
    [
        [[3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], [True, True]],
        [[0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], [True, False]],
        [[9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], [False, True]],
        [[7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], [False, False]],
    ],
)
@pytest.mark.parametrize("na_position", ["first", "last"])
def test_sort_values_stable_multicolumn_sort(
    self, expected_idx_non_na, ascending, na_position
):
    """Multi-column ``sort_values`` is stable for every ascending combination.

    GH#38426 Clarify sort_values with multiple columns / labels is stable.
    """
    frame = DataFrame(
        {
            "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8],
            "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4],
        }
    )
    # All rows with NaN in col "B" only have unique values in "A", therefore,
    # only the rows with NaNs in "A" have to be treated individually:
    if na_position == "first":
        row_order = [11, 12, 2] + expected_idx_non_na
    else:
        row_order = expected_idx_non_na + [2, 11, 12]

    result = frame.sort_values(
        ["A", "B"], ascending=ascending, na_position=na_position
    )
    tm.assert_frame_equal(result, frame.take(row_order))
def test_sort_values_stable_categorial(self):
# GH#16793
df = DataFrame({"x": Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
expected = df.copy()
sorted_df = df.sort_values("x", kind="mergesort")
tm.assert_frame_equal(sorted_df, expected)
def test_sort_values_datetimes(self):
# GH#3461, argsort / lexsort differences for a datetime column
df = DataFrame(
["a", "a", "a", "b", "c", "d", "e", "f", "g"],
columns=["A"],
index=date_range("20130101", periods=9),
)
dts = [
Timestamp(x)
for x in [
"2004-02-11",
"2004-01-21",
"2004-01-26",
"2005-09-20",
"2010-10-04",
"2009-05-12",
"2008-11-12",
"2010-09-28",
"2010-09-28",
]
]
df["B"] = dts[::2] + dts[1::2]
df["C"] = 2.0
df["A1"] = 3.0
df1 = df.sort_values(by="A")
df2 = df.sort_values(by=["A"])
tm.assert_frame_equal(df1, df2)
df1 = df.sort_values(by="B")
df2 = df.sort_values(by=["B"])
tm.assert_frame_equal(df1, df2)
df1 = df.sort_values(by="B")
df2 = df.sort_values(by=["C", "B"])
tm.assert_frame_equal(df1, df2)
def test_sort_values_frame_column_inplace_sort_exception(
    self, float_frame, using_copy_on_write
):
    """In-place sort of a column taken from a frame: allowed under CoW, else raises."""
    column = float_frame["A"]
    snapshot = float_frame.copy()

    if not using_copy_on_write:
        with pytest.raises(ValueError, match="This Series is a view"):
            column.sort_values(inplace=True)
    else:
        # INFO(CoW) Series is a new object, so can be changed inplace
        # without modifying original dataframe
        column.sort_values(inplace=True)
        tm.assert_series_equal(column, snapshot["A"].sort_values())
        # column in dataframe is not changed
        tm.assert_frame_equal(float_frame, snapshot)

    detached = column.copy()
    detached.sort_values()  # it works!
def test_sort_values_nat_values_in_int_column(self):
# GH#14922: "sorting with large float and multiple columns incorrect"
# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
int_values = (2, int(NaT._value))
float_values = (2.0, -1.797693e308)
df = DataFrame(
{"int": int_values, "float": float_values}, columns=["int", "float"]
)
df_reversed = DataFrame(
{"int": int_values[::-1], "float": float_values[::-1]},
columns=["int", "float"],
index=[1, 0],
)
# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
tm.assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["int", "float"], na_position="first")
tm.assert_frame_equal(df_sorted, df_reversed)
# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
tm.assert_frame_equal(df_sorted, df)
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(
{"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values},
columns=["datetime", "float"],
)
df_reversed = DataFrame(
{"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]},
columns=["datetime", "float"],
index=[1, 0],
)
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
tm.assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
tm.assert_frame_equal(df_sorted, df)
# Ascending should not affect the results.
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
tm.assert_frame_equal(df_sorted, df)
def test_sort_nat(self):
# GH 16836
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
d2 = [
Timestamp(x)
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
]
df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
d4 = [
Timestamp(x)
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
]
expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=["a", "b"])
tm.assert_frame_equal(sorted_df, expected)
def test_sort_values_na_position_with_categories(self):
# GH#22556
# Positioning missing value properly when column is Categorical.
categories = ["A", "B", "C"]
category_indices = [0, 2, 4]
list_of_nans = [np.nan, np.nan]
na_indices = [1, 3]
na_position_first = "first"
na_position_last = "last"
column_name = "c"
reversed_categories = sorted(categories, reverse=True)
reversed_category_indices = sorted(category_indices, reverse=True)
reversed_na_indices = sorted(na_indices)
df = DataFrame(
{
column_name: Categorical(
["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True
)
}
)
# sort ascending with na first
result = df.sort_values(
by=column_name, ascending=True, na_position=na_position_first
)
expected = DataFrame(
{
column_name: Categorical(
list_of_nans + categories, categories=categories, ordered=True
)
},
index=na_indices + category_indices,
)
tm.assert_frame_equal(result, expected)
# sort ascending with na last
result = df.sort_values(
by=column_name, ascending=True, na_position=na_position_last
)
expected = DataFrame(
{
column_name: Categorical(
categories + list_of_nans, categories=categories, ordered=True
)
},
index=category_indices + na_indices,
)
tm.assert_frame_equal(result, expected)
# sort descending with na first
result = df.sort_values(
by=column_name, ascending=False, na_position=na_position_first
)
expected = DataFrame(
{
column_name: Categorical(
list_of_nans + reversed_categories,
categories=categories,
ordered=True,
)
},
index=reversed_na_indices + reversed_category_indices,
)
tm.assert_frame_equal(result, expected)
# sort descending with na last
result = df.sort_values(
by=column_name, ascending=False, na_position=na_position_last
)
expected = DataFrame(
{
column_name: Categorical(
reversed_categories + list_of_nans,
categories=categories,
ordered=True,
)
},
index=reversed_category_indices + reversed_na_indices,
)
tm.assert_frame_equal(result, expected)
def test_sort_values_nat(self):
# GH#16836
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
d2 = [
Timestamp(x)
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
]
df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
d4 = [
Timestamp(x)
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
]
expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=["a", "b"])
tm.assert_frame_equal(sorted_df, expected)
def test_sort_values_na_position_with_categories_raises(self):
    """An unknown ``na_position`` raises ValueError for a Categorical column."""
    values = Categorical(
        ["A", np.nan, "B", np.nan, "C"],
        categories=["A", "B", "C"],
        ordered=True,
    )
    df = DataFrame({"c": values})

    expected_message = "invalid na_position: bad_position"
    with pytest.raises(ValueError, match=expected_message):
        df.sort_values(by="c", ascending=False, na_position="bad_position")
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
    "original_dict, sorted_dict, ignore_index, output_index",
    [
        ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]),
        ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]),
        (
            {"A": [1, 2, 3], "B": [2, 3, 4]},
            {"A": [3, 2, 1], "B": [4, 3, 2]},
            True,
            [0, 1, 2],
        ),
        (
            {"A": [1, 2, 3], "B": [2, 3, 4]},
            {"A": [3, 2, 1], "B": [4, 3, 2]},
            False,
            [2, 1, 0],
        ),
    ],
)
def test_sort_values_ignore_index(
    self, inplace, original_dict, sorted_dict, ignore_index, output_index
):
    """``ignore_index`` relabels the result and never touches the source (GH 30114)."""
    source = DataFrame(original_dict)
    expected = DataFrame(sorted_dict, index=output_index)

    # When sorting in place, operate on a copy so ``source`` can be re-checked.
    target = source.copy() if inplace else source
    returned = target.sort_values(
        "A", ascending=False, ignore_index=ignore_index, inplace=inplace
    )
    result_df = target if inplace else returned

    tm.assert_frame_equal(result_df, expected)
    tm.assert_frame_equal(source, DataFrame(original_dict))
def test_sort_values_nat_na_position_default(self):
# GH 13230
expected = DataFrame(
{
"A": [1, 2, 3, 4, 4],
"date": pd.DatetimeIndex(
[
"2010-01-01 09:00:00",
"2010-01-01 09:00:01",
"2010-01-01 09:00:02",
"2010-01-01 09:00:03",
"NaT",
]
),
}
)
result = expected.sort_values(["A", "date"])
tm.assert_frame_equal(result, expected)
def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write):
    """``sort_values`` must not leave a stale cached column behind.

    Previous behavior incorrectly retained an invalid _item_cache entry.
    """
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
    )
    frame["D"] = frame["A"] * 2
    column = frame["A"]
    if not using_array_manager:
        assert len(frame._mgr.blocks) == 2

    frame.sort_values(by="A")

    # Mutate the previously fetched column through the mode-appropriate path.
    if using_copy_on_write:
        column.iloc[0] = 99
    else:
        column.values[0] = 99

    # Positional and label-based access must agree in either mode.
    assert frame.iloc[0, 0] == frame["A"][0]
    if using_copy_on_write:
        assert frame.iloc[0, 0] != 99
    else:
        assert frame.iloc[0, 0] == 99
def test_sort_values_reshaping(self):
# GH 39426
values = list(range(21))
expected = DataFrame([values], columns=values)
df = expected.sort_values(expected.index[0], axis=1, ignore_index=True)
tm.assert_frame_equal(df, expected)
def test_sort_values_no_by_inplace(self):
# GH#50643
df = DataFrame({"a": [1, 2, 3]})
expected = df.copy()
result = df.sort_values(by=[], inplace=True)
tm.assert_frame_equal(df, expected)
assert result is None
def test_sort_values_no_op_reset_index(self):
# GH#52553
df = DataFrame({"A": [10, 20], "B": [1, 5]}, index=[2, 3])
result = df.sort_values(by="A", ignore_index=True)
expected = DataFrame({"A": [10, 20], "B": [1, 5]})
tm.assert_frame_equal(result, expected)
class TestDataFrameSortKey:  # test key sorting (issue 27237)
    """Tests for the ``key`` argument of ``DataFrame.sort_values``."""

    def test_sort_values_inplace_key(self, sort_by_key):
        """In-place sorting with ``key`` returns None and matches the copying call."""
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=[1, 2, 3, 4],
            columns=["A", "B", "C", "D"],
        )
        sort_kwargs = [
            {"by": "A"},
            {"by": 1, "axis": 1},
            {"by": "A", "ascending": False},
            {"by": ["A", "B"], "ascending": False},
        ]
        for kwargs in sort_kwargs:
            sorted_df = frame.copy()
            return_value = sorted_df.sort_values(
                inplace=True, key=sort_by_key, **kwargs
            )
            # Checked for every case, including the multi-column one.
            assert return_value is None
            expected = frame.sort_values(key=sort_by_key, **kwargs)
            tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_key(self):
        """Order-preserving keys leave the default sort result unchanged."""
        df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan]))
        expected = df.iloc[[0, 4, 3, 1, 2, 5]]

        result = df.sort_values(0)
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(0, key=lambda x: x + 5)
        tm.assert_frame_equal(result, expected)

        # Negating the key and sorting descending cancel each other out.
        result = df.sort_values(0, key=lambda x: -x, ascending=False)
        tm.assert_frame_equal(result, expected)

    def test_sort_values_by_key(self):
        """A negating key is applied to every column listed in ``by``."""
        df = DataFrame(
            {
                "a": np.array([0, 3, np.nan, 3, 2, np.nan]),
                "b": np.array([0, 2, np.nan, 5, 2, np.nan]),
            }
        )

        result = df.sort_values("a", key=lambda x: -x)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=lambda x: -x)
        expected = df.iloc[[3, 1, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False)
        expected = df.iloc[[0, 4, 1, 3, 2, 5]]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_by_key_by_name(self):
        """The key function can branch on the name of the column it receives."""
        df = DataFrame(
            {
                "a": np.array([0, 3, np.nan, 3, 2, np.nan]),
                "b": np.array([0, 2, np.nan, 5, 2, np.nan]),
            }
        )

        def key(col):
            # Only column "a" is reversed; every other column is left as is.
            if col.name == "a":
                return -col
            else:
                return col

        result = df.sort_values(by="a", key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a"], key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by="b", key=key)
        expected = df.iloc[[0, 1, 4, 3, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_string(self):
        """A lower-casing key gives a case-insensitive sort on string columns."""
        df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))

        result = df.sort_values(1)
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values([0, 1], key=lambda col: col.str.lower())
        tm.assert_frame_equal(result, df)

        result = df.sort_values(
            [0, 1], key=lambda col: col.str.lower(), ascending=False
        )
        expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False)
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_empty(self, sort_by_key):
        """Sorting an empty frame with a key must not raise."""
        df = DataFrame(np.array([]))

        df.sort_values(0, key=sort_by_key)
        df.sort_index(key=sort_by_key)

    def test_changes_length_raises(self):
        """A key that changes the length of the column is rejected."""
        df = DataFrame({"A": [1, 2, 3]})
        with pytest.raises(ValueError, match="change the shape"):
            df.sort_values("A", key=lambda x: x[:1])

    def test_sort_values_key_axes(self):
        """Keys work on both string and numeric columns when sorting rows."""
        df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]})

        result = df.sort_values(0, key=lambda col: col.str.lower())
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(1, key=lambda col: -col)
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_dict_axis(self):
        """Keys work when sorting columns (``axis=1``) by the values of a row."""
        df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]})

        result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1)
        expected = df.loc[:, ::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(1, key=lambda col: -col, axis=1)
        expected = df.loc[:, ::-1]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("ordered", [True, False])
    def test_sort_values_key_casts_to_categorical(self, ordered):
        """A key may return a Categorical; its category order drives the sort."""
        # https://github.com/pandas-dev/pandas/issues/36383
        categories = ["c", "b", "a"]
        df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})

        def sorter(key):
            # Only "y" is converted; "x" is passed through unchanged.
            if key.name == "y":
                return pd.Series(
                    Categorical(key, categories=categories, ordered=ordered)
                )
            return key

        result = df.sort_values(by=["x", "y"], key=sorter)
        expected = DataFrame(
            {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0])
        )

        tm.assert_frame_equal(result, expected)
@pytest.fixture
def df_none():
    """Six-row frame with plain columns "outer", "inner", "A" and ("B", 5)."""
    columns = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 2, 2, 1, 1],
        "A": np.arange(6, 0, -1),
        ("B", 5): ["one", "one", "two", "two", "one", "one"],
    }
    return DataFrame(columns)
@pytest.fixture(params=[["outer"], ["outer", "inner"]])
def df_idx(request, df_none):
    """``df_none`` with the parametrized column(s) moved into the index."""
    index_columns = request.param
    indexed = df_none.set_index(index_columns)
    return indexed
@pytest.fixture(
    params=[
        "inner",  # index level
        ["outer"],  # list of index level
        "A",  # column
        [("B", 5)],  # list of column
        ["inner", "outer"],  # two index levels
        [("B", 5), "outer"],  # index level and column
        ["A", ("B", 5)],  # Two columns
        # NOTE(review): labelled "two index levels and column" but the value
        # repeats the two-level case above and contains no column -- confirm
        # whether a column label was meant to be included.
        ["inner", "outer"],  # two index levels and column
    ]
)
def sort_names(request):
    """Parametrized ``by`` argument: a label or a list of labels to sort on."""
    chosen = request.param
    return chosen
@pytest.fixture(params=[True, False])
def ascending(request):
    """Parametrized sort direction: True, then False."""
    direction = request.param
    return direction
class TestSortValuesLevelAsStr:
    """Sorting by names that may refer to index levels, to columns, or to both."""

    def test_sort_index_level_and_column_label(
        self, df_none, df_idx, sort_names, ascending, request
    ):
        """Row sort by a mix of index levels and columns matches the flat frame."""
        # GH#14353
        if (
            Version(np.__version__) >= Version("1.25")
            and request.node.callspec.id == "df_idx0-inner-True"
        ):
            request.applymarker(
                pytest.mark.xfail(
                    # The trailing space keeps the two fragments from being
                    # glued together into "duplicatesissue".
                    reason=(
                        "pandas default unstable sorting of duplicates "
                        "issue with numpy>=1.25 with AVX instructions"
                    ),
                    strict=False,
                )
            )

        # Get index levels from df_idx
        levels = df_idx.index.names

        # Compute expected by sorting on columns and then setting index
        expected = df_none.sort_values(
            by=sort_names, ascending=ascending, axis=0
        ).set_index(levels)

        # Compute result sorting on mix of columns and index levels
        result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)

        tm.assert_frame_equal(result, expected)

    def test_sort_column_level_and_index_label(
        self, df_none, df_idx, sort_names, ascending, request
    ):
        """Column sort on the transposed frame matches the transposed row sort."""
        # GH#14353

        # Get levels from df_idx
        levels = df_idx.index.names

        # Compute expected by sorting on axis=0, setting index levels, and then
        # transposing. For some cases this will result in a frame with
        # multiple column levels
        expected = (
            df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
            .set_index(levels)
            .T
        )

        # Compute result by transposing and sorting on axis=1.
        result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)

        if Version(np.__version__) >= Version("1.25"):
            request.applymarker(
                pytest.mark.xfail(
                    reason=(
                        "pandas default unstable sorting of duplicates "
                        "issue with numpy>=1.25 with AVX instructions"
                    ),
                    strict=False,
                )
            )

        tm.assert_frame_equal(result, expected)

    def test_sort_values_validate_ascending_for_value_error(self):
        """A string passed as ``ascending`` is rejected with ValueError."""
        # GH41634
        df = DataFrame({"D": [23, 7, 21]})

        msg = 'For argument "ascending" expected type bool, received type str.'
        with pytest.raises(ValueError, match=msg):
            df.sort_values(by="D", ascending="False")

    @pytest.mark.parametrize("ascending", [False, 0, 1, True])
    def test_sort_values_validate_ascending_functional(self, ascending):
        """Bools and 0/1 integers are all accepted as ``ascending``."""
        df = DataFrame({"D": [23, 7, 21]})
        indexer = df["D"].argsort().values

        if not ascending:
            indexer = indexer[::-1]

        expected = df.loc[df.index[indexer]]
        result = df.sort_values(by="D", ascending=ascending)
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,37 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestSwapAxes:
def test_swapaxes(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
msg = "'DataFrame.swapaxes' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_frame_equal(df.T, df.swapaxes(0, 1))
tm.assert_frame_equal(df.T, df.swapaxes(1, 0))
def test_swapaxes_noop(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
msg = "'DataFrame.swapaxes' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_frame_equal(df, df.swapaxes(0, 0))
def test_swapaxes_invalid_axis(self):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
msg = "'DataFrame.swapaxes' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
msg = "No axis named 2 for object type DataFrame"
with pytest.raises(ValueError, match=msg):
df.swapaxes(2, 5)
def test_round_empty_not_input(self):
# GH#51032
df = DataFrame({"a": [1, 2]})
msg = "'DataFrame.swapaxes' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.swapaxes("index", "index")
tm.assert_frame_equal(df, result)
assert df is not result

View File

@ -0,0 +1,36 @@
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestSwaplevel:
    """Tests for ``swaplevel`` on Series and DataFrame with a MultiIndex."""

    def test_swaplevel(self, multiindex_dataframe_random_data):
        """All spellings of the level pair agree; swapping twice round-trips."""
        frame = multiindex_dataframe_random_data
        column = frame["A"]

        # Default arguments, then the same swap spelled three other ways.
        swapped = column.swaplevel()
        swapped_variants = [
            column.swaplevel(0),
            column.swaplevel(0, 1),
            column.swaplevel("first", "second"),
        ]
        assert not swapped.index.equals(frame.index)
        for variant in swapped_variants:
            tm.assert_series_equal(swapped, variant)

        # Swapping again must restore the original index.
        back = swapped.swaplevel()
        back_variants = [
            swapped.swaplevel(0),
            swapped.swaplevel(0, 1),
            swapped.swaplevel("second", "first"),
        ]
        assert back.index.equals(frame.index)
        for variant in back_variants:
            tm.assert_series_equal(back, variant)

        # Column-axis swap on the transpose equals the transposed row-axis swap.
        transposed = frame.T
        result = transposed.swaplevel("first", "second", axis=1)
        expected = frame.swaplevel("first", "second").T
        tm.assert_frame_equal(result, expected)

        flat_index_error = "Can only swap levels on a hierarchical axis."
        with pytest.raises(TypeError, match=flat_index_error):
            DataFrame(range(3)).swaplevel()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,535 @@
from collections import (
OrderedDict,
defaultdict,
)
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas import (
NA,
DataFrame,
Index,
Interval,
MultiIndex,
Period,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
class TestDataFrameToDict:
def test_to_dict_timestamp(self):
# GH#11247
# split/records producing np.datetime64 rather than Timestamps
# on datetime64[ns] dtypes only
tsmp = Timestamp("20130101")
test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
assert test_data.to_dict(orient="records") == expected_records
assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
expected_series = {
"A": Series([tsmp, tsmp], name="A"),
"B": Series([tsmp, tsmp], name="B"),
}
expected_series_mixed = {
"A": Series([tsmp, tsmp], name="A"),
"B": Series([1, 2], name="B"),
}
tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
tm.assert_dict_equal(
test_data_mixed.to_dict(orient="series"), expected_series_mixed
)
expected_split = {
"index": [0, 1],
"data": [[tsmp, tsmp], [tsmp, tsmp]],
"columns": ["A", "B"],
}
expected_split_mixed = {
"index": [0, 1],
"data": [[tsmp, 1], [tsmp, 2]],
"columns": ["A", "B"],
}
tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
tm.assert_dict_equal(
test_data_mixed.to_dict(orient="split"), expected_split_mixed
)
def test_to_dict_index_not_unique_with_index_orient(self):
    """``orient="index"`` raises on a duplicated index to avoid data loss (GH#22801)."""
    duplicated_index = ["A", "A"]
    df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=duplicated_index)
    expected_message = "DataFrame index must be unique for orient='index'"
    with pytest.raises(ValueError, match=expected_message):
        df.to_dict(orient="index")
def test_to_dict_invalid_orient(self):
    """An unknown ``orient`` value raises ValueError naming the value."""
    frame = DataFrame({"A": [0, 1]})
    expected_message = "orient 'xinvalid' not understood"
    with pytest.raises(ValueError, match=expected_message):
        frame.to_dict(orient="xinvalid")
@pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
def test_to_dict_short_orient_raises(self, orient):
    """Abbreviated ``orient`` spellings are rejected (GH#32515)."""
    frame = DataFrame({"A": [0, 1]})
    with pytest.raises(ValueError, match="not understood"):
        frame.to_dict(orient=orient)
@pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
def test_to_dict(self, mapping):
    """Round-trip a nested dict through every ``orient`` and ``into`` mapping."""
    # orient= should only take the listed options
    # see GH#32515
    test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}

    def cells(nested):
        # Flatten {column: {row: value}} into (column, row, value) triples.
        return [
            (column, row, value)
            for column, rows in nested.items()
            for row, value in rows.items()
        ]

    frame = DataFrame(test_data)

    # GH#16122
    recons_data = frame.to_dict(into=mapping)
    for column, row, value in cells(test_data):
        assert value == recons_data[column][row]

    # Row labels are "1".."3", so the list position is int(label) - 1.
    recons_data = frame.to_dict("list", into=mapping)
    for column, row, value in cells(test_data):
        assert value == recons_data[column][int(row) - 1]

    recons_data = frame.to_dict("series", into=mapping)
    for column, row, value in cells(test_data):
        assert value == recons_data[column][row]

    recons_data = frame.to_dict("split", into=mapping)
    expected_split = {
        "columns": ["A", "B"],
        "index": ["1", "2", "3"],
        "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
    }
    tm.assert_dict_equal(recons_data, expected_split)

    recons_data = frame.to_dict("records", into=mapping)
    expected_records = [
        {"A": 1.0, "B": "1"},
        {"A": 2.0, "B": "2"},
        {"A": np.nan, "B": "3"},
    ]
    assert isinstance(recons_data, list)
    assert len(recons_data) == 3
    for actual, wanted in zip(recons_data, expected_records):
        tm.assert_dict_equal(actual, wanted)

    # GH#10844
    recons_data = frame.to_dict("index")
    for column, row, value in cells(test_data):
        assert value == recons_data[row][column]

    # Same check with an extra column duplicating the first one.
    df = DataFrame(test_data)
    df["duped"] = df[df.columns[0]]
    recons_data = df.to_dict("index")
    comp_data = test_data.copy()
    comp_data["duped"] = comp_data[df.columns[0]]
    for column, row, value in cells(comp_data):
        assert value == recons_data[row][column]
@pytest.mark.parametrize("mapping", [list, defaultdict, []])
def test_to_dict_errors(self, mapping):
    """Unsupported ``into`` arguments raise TypeError (GH#16122)."""
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
    # Either message is acceptable, depending on which argument was passed.
    accepted_messages = [
        "unsupported type: <class 'list'>",
        r"to_dict\(\) only accepts initialized defaultdicts",
    ]
    pattern = "|".join(accepted_messages)
    with pytest.raises(TypeError, match=pattern):
        frame.to_dict(into=mapping)
def test_to_dict_not_unique_warning(self):
# GH#16927: When converting to a dict, if a column has a non-unique name
# it will be dropped, throwing a warning.
df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
with tm.assert_produces_warning(UserWarning):
df.to_dict()
@pytest.mark.filterwarnings("ignore::UserWarning")
@pytest.mark.parametrize(
    "orient,expected",
    [
        ("list", {"A": [2, 5], "B": [3, 6]}),
        ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}),
    ],
)
def test_to_dict_not_unique(self, orient, expected):
    """Duplicate column names resolve the same way for every orient.

    GH#54824: This is to make sure that dataframes with non-unique columns
    have uniform behavior throughout different orients.
    """
    rows = [[1, 2, 3], [4, 5, 6]]
    frame = DataFrame(rows, columns=["A", "A", "B"])
    converted = frame.to_dict(orient)
    assert converted == expected
# orient - the ``orient`` argument passed to ``to_dict``
# item_getter - function that extracts a single value from the resulting
#               dict given a column name and an index label
@pytest.mark.parametrize(
    "orient,item_getter",
    [
        ("dict", lambda d, col, idx: d[col][idx]),
        ("records", lambda d, col, idx: d[idx][col]),
        ("list", lambda d, col, idx: d[col][idx]),
        ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
        ("index", lambda d, col, idx: d[idx][col]),
    ],
)
def test_to_dict_box_scalars(self, orient, item_getter):
    """Values come back as Python ``int`` / ``float`` (GH#14216, GH#23753)."""
    frame = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
    converted = frame.to_dict(orient=orient)
    # make sure that we are boxing properly
    int_cell = item_getter(converted, "a", 0)
    float_cell = item_getter(converted, "b", 0)
    assert isinstance(int_cell, int)
    assert isinstance(float_cell, float)
def test_to_dict_tz(self):
    """tz-aware datetimes become tz-aware Timestamps with ``orient="records"``.

    GH#18372 When converting to dict with orient='records' columns of
    datetime that are tz-aware were not converted to required arrays.
    """
    rows = [
        (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
        (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
    ]
    frame = DataFrame(list(rows), columns=["d"])

    records = frame.to_dict(orient="records")
    wanted = [
        {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
        {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
    ]
    # Only the first two records are compared, one by one.
    for position in (0, 1):
        tm.assert_dict_equal(records[position], wanted[position])
@pytest.mark.parametrize(
    "into, expected",
    [
        (
            dict,
            {
                0: {"int_col": 1, "float_col": 1.0},
                1: {"int_col": 2, "float_col": 2.0},
                2: {"int_col": 3, "float_col": 3.0},
            },
        ),
        (
            OrderedDict,
            OrderedDict(
                [
                    (0, {"int_col": 1, "float_col": 1.0}),
                    (1, {"int_col": 2, "float_col": 2.0}),
                    (2, {"int_col": 3, "float_col": 3.0}),
                ]
            ),
        ),
        (
            defaultdict(dict),
            defaultdict(
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
        ),
    ],
)
def test_to_dict_index_dtypes(self, into, expected):
    """``orient="index"`` keeps int and float columns distinct.

    GH#18580: When using to_dict(orient='index') on a dataframe with int
    and float columns only the int columns were cast to float.
    """
    column_order = ["int_col", "float_col"]
    source = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

    as_dict = source.to_dict(orient="index", into=into)

    # Rebuild frames from both dicts so that dtypes are part of the comparison.
    rebuilt = DataFrame.from_dict(as_dict, orient="index")[column_order]
    wanted = DataFrame.from_dict(expected, orient="index")[column_order]
    tm.assert_frame_equal(rebuilt, wanted)
def test_to_dict_numeric_names(self):
# GH#24940
df = DataFrame({str(i): [i] for i in range(5)})
result = set(df.to_dict("records")[0].keys())
expected = set(df.columns)
assert result == expected
def test_to_dict_wide(self):
# GH#24939
df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
result = df.to_dict("records")[0]
expected = {f"A_{i:d}": i for i in range(256)}
assert result == expected
@pytest.mark.parametrize(
    "data,dtype",
    (
        ([True, True, False], bool),
        [
            [
                datetime(2018, 1, 1),
                datetime(2019, 2, 2),
                datetime(2020, 3, 3),
            ],
            Timestamp,
        ],
        [[1.0, 2.0, 3.0], float],
        [[1, 2, 3], int],
        [["X", "Y", "Z"], str],
    ),
)
def test_to_dict_orient_dtype(self, data, dtype):
    """Record values have exactly the expected Python type (GH22620 & GH21256)."""
    frame = DataFrame({"a": data})
    records = frame.to_dict(orient="records")
    # Exact type identity is required, not merely an isinstance match.
    value_types = [type(record["a"]) for record in records]
    assert all(value_type is dtype for value_type in value_types)
@pytest.mark.parametrize(
    "data,expected_dtype",
    (
        [np.uint64(2), int],
        [np.int64(-9), int],
        [np.float64(1.1), float],
        [np.bool_(True), bool],
        [np.datetime64("2005-02-25"), Timestamp],
    ),
)
def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
    """A frame built from one NumPy scalar yields ``expected_dtype`` in records.

    GH22620 & GH21256.
    """
    frame = DataFrame({"a": data}, index=[0])
    first_row = frame.to_dict(orient="records")[0]
    assert type(first_row["a"]) is expected_dtype
def test_to_dict_mixed_numeric_frame(self):
# GH 12859
df = DataFrame({"a": [1.0], "b": [9.0]})
result = df.reset_index().to_dict("records")
expected = [{"index": 0, "a": 1.0, "b": 9.0}]
assert result == expected
@pytest.mark.parametrize(
    "index",
    [
        None,
        Index(["aa", "bb"]),
        Index(["aa", "bb"], name="cc"),
        MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
        MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
    ],
)
@pytest.mark.parametrize(
    "columns",
    [
        ["x", "y"],
        Index(["x", "y"]),
        Index(["x", "y"], name="z"),
        MultiIndex.from_tuples([("x", 1), ("y", 2)]),
        MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
    ],
)
def test_to_dict_orient_tight(self, index, columns):
    """``orient="tight"`` round-trips through ``to_dict`` / ``from_dict``."""
    original = DataFrame.from_records(
        [[1, 3], [2, 4]],
        columns=columns,
        index=index,
    )
    tight = original.to_dict(orient="tight")
    rebuilt = DataFrame.from_dict(tight, orient="tight")
    tm.assert_frame_equal(original, rebuilt)
@pytest.mark.parametrize(
    "orient",
    ["dict", "list", "split", "records", "index", "tight"],
)
@pytest.mark.parametrize(
    "data,expected_types",
    (
        (
            {
                "a": [np.int64(1), 1, np.int64(3)],
                "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                "c": [np.float64(1.0), 2, np.int64(3)],
                "d": [np.float64(1.0), "a", np.int64(3)],
                "e": [np.float64(1.0), ["a"], np.int64(3)],
                "f": [np.float64(1.0), ("a",), np.int64(3)],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
                "c": [float, float, float],
                "d": [float, str, int],
                "e": [float, list, int],
                "f": [float, tuple, int],
            },
        ),
        (
            {"a": [1, 2, 3], "b": [1.1, 2.2, 3.3]},
            {"a": [int, int, int], "b": [float, float, float]},
        ),
        (  # Make sure we have one df which is all object type cols
            {"a": [1, "hello", 3], "b": [1.1, "world", 3.3]},
            {"a": [int, str, int], "b": [float, str, float]},
        ),
    ),
)
def test_to_dict_returns_native_types(self, orient, data, expected_types):
    """Every ``orient`` layout returns native Python value types (GH 46751).

    The layout-specific result is flattened into ``(row, column, value)``
    triples so one pair of assertions can cover all orientations.
    """
    converted = DataFrame(data).to_dict(orient)

    if orient == "dict":
        # {column: {row: value}}
        triples = [
            (row, col, val)
            for col, by_row in converted.items()
            for row, val in by_row.items()
        ]
    elif orient == "list":
        # {column: [value, ...]}
        triples = [
            (row, col, val)
            for col, col_values in converted.items()
            for row, val in enumerate(col_values)
        ]
    elif orient in {"split", "tight"}:
        # {"index": [...], "columns": [...], "data": [[...], ...]}
        triples = [
            (row, col, converted["data"][row][pos])
            for row in converted["index"]
            for pos, col in enumerate(converted["columns"])
        ]
    elif orient == "records":
        # [{column: value}, ...]
        triples = [
            (row, col, val)
            for row, rec in enumerate(converted)
            for col, val in rec.items()
        ]
    elif orient == "index":
        # {row: {column: value}}
        triples = [
            (row, col, val)
            for row, rec in converted.items()
            for col, val in rec.items()
        ]

    for row, col, val in triples:
        assert val == data[col][row]
        assert type(val) is expected_types[col][row]
@pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
def test_to_dict_index_false_error(self, orient):
    """``index=False`` raises ValueError for these orients (GH#46398)."""
    expected_message = (
        "'index=False' is only valid when 'orient' is 'split' or 'tight'"
    )
    frame = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
    with pytest.raises(ValueError, match=expected_message):
        frame.to_dict(orient=orient, index=False)
@pytest.mark.parametrize(
    "orient, expected",
    [
        ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
        (
            "tight",
            {
                "columns": ["col1", "col2"],
                "data": [[1, 3], [2, 4]],
                "column_names": [None],
            },
        ),
    ],
)
def test_to_dict_index_false(self, orient, expected):
    """With ``index=False`` the output matches ``expected``, which has no index keys.

    GH#46398.
    """
    frame = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
    without_index = frame.to_dict(orient=orient, index=False)
    tm.assert_dict_equal(without_index, expected)
@pytest.mark.parametrize(
    "orient, expected",
    [
        ("dict", {"a": {0: 1, 1: None}}),
        ("list", {"a": [1, None]}),
        ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
        (
            "tight",
            {
                "index": [0, 1],
                "columns": ["a"],
                "data": [[1], [None]],
                "index_names": [None],
                "column_names": [None],
            },
        ),
        ("records", [{"a": 1}, {"a": None}]),
        ("index", {0: {"a": 1}, 1: {"a": None}}),
    ],
)
def test_to_dict_na_to_none(self, orient, expected):
    """``NA`` in an ``Int64`` column is emitted as ``None`` (GH#50795)."""
    frame = DataFrame({"a": [1, NA]}, dtype="Int64")
    assert frame.to_dict(orient=orient) == expected
def test_to_dict_masked_native_python(self):
# GH#34665
df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
result = df.to_dict(orient="records")
assert isinstance(result[0]["a"], int)
df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
result = df.to_dict(orient="records")
assert isinstance(result[0]["a"], int)
def test_to_dict_pos_args_deprecation(self):
# GH-54229
df = DataFrame({"a": [1, 2, 3]})
msg = (
r"Starting with pandas version 3.0 all arguments of to_dict except for the "
r"argument 'orient' will be keyword-only."
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.to_dict("records", {})
@pytest.mark.parametrize(
    "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)]
)
def test_to_dict_list_pd_scalars(val):
    """pandas scalar objects compare equal after ``orient="list"`` (GH 54824)."""
    frame = DataFrame({"a": [val]})
    assert frame.to_dict(orient="list") == {"a": [val]}

View File

@ -0,0 +1,76 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
MultiIndex,
)
import pandas._testing as tm
from pandas.core.arrays import NumpyExtensionArray
# Module-level pytest marker applied to every test in this file.
# NOTE(review): going by its name, the marker skips tests that are not valid
# under the ArrayManager backend -- confirm in pandas.util._test_decorators.
pytestmark = td.skip_array_manager_invalid_test
class TestToDictOfBlocks:
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
def test_no_copy_blocks(self, float_frame, using_copy_on_write):
    """Writes to the frames from ``_to_dict_of_blocks`` reach the parent only without CoW.

    GH#9607.
    """
    frame = DataFrame(float_frame, copy=True)
    first_col = frame.columns[0]

    last_block = None
    # Bump the first column inside whichever per-dtype frame holds it.
    for block_frame in frame._to_dict_of_blocks().values():
        last_block = block_frame
        if first_col in block_frame:
            block_frame.loc[:, first_col] = block_frame[first_col] + 1

    assert last_block is not None
    in_sync = last_block[first_col].equals(frame[first_col])
    if using_copy_on_write:
        # The write must not have propagated to the parent frame.
        assert not in_sync
    else:
        # make sure we did change the original DataFrame
        assert in_sync
def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write):
    """Calling ``_to_dict_of_blocks`` should not poison the item cache."""
    frame = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    frame["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
    assert len(frame._mgr.blocks) == 3  # i.e. not consolidated

    cached = frame["b"]  # populates item_cache["b"]

    frame._to_dict_of_blocks()

    if using_copy_on_write:
        # Under copy-on-write the exposed values cannot be written to.
        with pytest.raises(ValueError, match="read-only"):
            cached.values[0] = "foo"
        return

    # Without copy-on-write, a write through the series reaches the frame.
    cached.values[0] = "foo"
    assert frame.loc[0, "b"] == "foo"
    if warn_copy_on_write:
        # with warning mode, the item cache is disabled
        assert frame["b"] is not cached
    else:
        # Check that _to_dict_of_blocks didn't break the link between the
        # cached series and the frame
        assert frame["b"] is cached
def test_set_change_dtype_slice():
    """Replacing an int column with floats regroups the blocks by dtype (GH#8850)."""
    columns = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")])
    frame = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=columns)
    # The "2nd" column starts as int64 and becomes float64 here.
    frame["2nd"] = frame["2nd"] * 2.0

    by_dtype = frame._to_dict_of_blocks()
    assert sorted(by_dtype) == ["float64", "int64"]

    expected_floats = DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=columns[:2])
    expected_ints = DataFrame([[3], [6]], columns=columns[2:])
    tm.assert_frame_equal(by_dtype["float64"], expected_floats)
    tm.assert_frame_equal(by_dtype["int64"], expected_ints)

View File

@ -0,0 +1,49 @@
import numpy as np
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Timestamp,
)
import pandas._testing as tm
class TestToNumpy:
def test_to_numpy(self):
df = DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4.5]])
result = df.to_numpy()
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_dtype(self):
df = DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4]], dtype="int64")
result = df.to_numpy(dtype="int64")
tm.assert_numpy_array_equal(result, expected)
@td.skip_array_manager_invalid_test
def test_to_numpy_copy(self, using_copy_on_write):
arr = np.random.default_rng(2).standard_normal((4, 3))
df = DataFrame(arr)
if using_copy_on_write:
assert df.values.base is not arr
assert df.to_numpy(copy=False).base is df.values.base
else:
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is not arr
# we still don't want a copy when na_value=np.nan is passed,
# and that can be respected because we are already numpy-float
if using_copy_on_write:
assert df.to_numpy(copy=False).base is df.values.base
else:
assert df.to_numpy(copy=False, na_value=np.nan).base is arr
def test_to_numpy_mixed_dtype_to_str(self):
# https://github.com/pandas-dev/pandas/issues/35455
df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])
result = df.to_numpy(dtype=str)
expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str)
tm.assert_numpy_array_equal(result, expected)

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
DatetimeIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
import pandas._testing as tm
class TestToPeriod:
def test_to_period(self, frame_or_series):
K = 5
dr = date_range("1/1/2000", "1/1/2001", freq="D")
obj = DataFrame(
np.random.default_rng(2).standard_normal((len(dr), K)),
index=dr,
columns=["A", "B", "C", "D", "E"],
)
obj["mix"] = "a"
obj = tm.get_obj(obj, frame_or_series)
pts = obj.to_period()
exp = obj.copy()
exp.index = period_range("1/1/2000", "1/1/2001")
tm.assert_equal(pts, exp)
pts = obj.to_period("M")
exp.index = exp.index.asfreq("M")
tm.assert_equal(pts, exp)
def test_to_period_without_freq(self, frame_or_series):
# GH#7606 without freq
idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"])
exp_idx = PeriodIndex(
["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D"
)
obj = DataFrame(
np.random.default_rng(2).standard_normal((4, 4)), index=idx, columns=idx
)
obj = tm.get_obj(obj, frame_or_series)
expected = obj.copy()
expected.index = exp_idx
tm.assert_equal(obj.to_period(), expected)
if frame_or_series is DataFrame:
expected = obj.copy()
expected.columns = exp_idx
tm.assert_frame_equal(obj.to_period(axis=1), expected)
def test_to_period_columns(self):
dr = date_range("1/1/2000", "1/1/2001")
df = DataFrame(np.random.default_rng(2).standard_normal((len(dr), 5)), index=dr)
df["mix"] = "a"
df = df.T
pts = df.to_period(axis=1)
exp = df.copy()
exp.columns = period_range("1/1/2000", "1/1/2001")
tm.assert_frame_equal(pts, exp)
pts = df.to_period("M", axis=1)
tm.assert_index_equal(pts.columns, exp.columns.asfreq("M"))
def test_to_period_invalid_axis(self):
    """``axis=2`` raises ValueError naming the missing axis."""
    days = date_range("1/1/2000", "1/1/2001")
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((len(days), 5)), index=days
    )
    frame["mix"] = "a"

    expected_message = "No axis named 2 for object type DataFrame"
    with pytest.raises(ValueError, match=expected_message):
        frame.to_period(axis=2)
def test_to_period_raises(self, index, frame_or_series):
    """``to_period`` raises TypeError when the index is not a DatetimeIndex.

    See https://github.com/pandas-dev/pandas/issues/33327
    """
    obj = Series(index=index, dtype=object)
    if frame_or_series is DataFrame:
        obj = obj.to_frame()

    if isinstance(index, DatetimeIndex):
        # Nothing is asserted for datetime indexes.
        return

    expected_message = f"unsupported Type {type(index).__name__}"
    with pytest.raises(TypeError, match=expected_message):
        obj.to_period()

View File

@ -0,0 +1,523 @@
from collections import abc
import email
from email.parser import Parser
import numpy as np
import pytest
from pandas import (
CategoricalDtype,
DataFrame,
MultiIndex,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestDataFrameToRecords:
def test_to_records_timeseries(self):
index = date_range("1/1/2000", periods=10)
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 3)),
index=index,
columns=["a", "b", "c"],
)
result = df.to_records()
assert result["index"].dtype == "M8[ns]"
result = df.to_records(index=False)
def test_to_records_dt64(self):
df = DataFrame(
[["one", "two", "three"], ["four", "five", "six"]],
index=date_range("2012-01-01", "2012-01-02"),
)
expected = df.index.values[0]
result = df.to_records()["index"][0]
assert expected == result
def test_to_records_dt64tz_column(self):
# GH#32535 dont less tz in to_records
df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})
result = df.to_records()
assert result.dtype["A"] == object
val = result[0][1]
assert isinstance(val, Timestamp)
assert val == df.loc[0, "A"]
def test_to_records_with_multindex(self):
# GH#3189
index = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
data = np.zeros((8, 4))
df = DataFrame(data, index=index)
r = df.to_records(index=True)["level_0"]
assert "bar" in r
assert "one" not in r
def test_to_records_with_Mapping_type(self):
abc.Mapping.register(email.message.Message)
headers = Parser().parsestr(
"From: <user@example.com>\n"
"To: <someone_else@example.com>\n"
"Subject: Test message\n"
"\n"
"Body would go here\n"
)
frame = DataFrame.from_records([headers])
all(x in frame for x in ["Type", "Subject", "From"])
def test_to_records_floats(self):
df = DataFrame(np.random.default_rng(2).random((10, 10)))
df.to_records()
def test_to_records_index_name(self):
df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
df.index.name = "X"
rs = df.to_records()
assert "X" in rs.dtype.fields
df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
rs = df.to_records()
assert "index" in rs.dtype.fields
df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
df.index.names = ["A", None]
result = df.to_records()
expected = np.rec.fromarrays(
[np.array(["a", "a", "b"]), np.array(["x", "y", "z"])]
+ [np.asarray(df.iloc[:, i]) for i in range(3)],
dtype={
"names": ["A", "level_1", "0", "1", "2"],
"formats": [
"O",
"O",
f"{tm.ENDIAN}f8",
f"{tm.ENDIAN}f8",
f"{tm.ENDIAN}f8",
],
},
)
tm.assert_numpy_array_equal(result, expected)
def test_to_records_with_unicode_index(self):
# GH#13172
# unicode_literals conflict with to_records
result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
tm.assert_almost_equal(result, expected)
def test_to_records_index_dtype(self):
# GH 47263: consistent data types for Index and MultiIndex
df = DataFrame(
{
1: date_range("2022-01-01", periods=2),
2: date_range("2022-01-01", periods=2),
3: date_range("2022-01-01", periods=2),
}
)
expected = np.rec.array(
[
("2022-01-01", "2022-01-01", "2022-01-01"),
("2022-01-02", "2022-01-02", "2022-01-02"),
],
dtype=[
("1", f"{tm.ENDIAN}M8[ns]"),
("2", f"{tm.ENDIAN}M8[ns]"),
("3", f"{tm.ENDIAN}M8[ns]"),
],
)
result = df.to_records(index=False)
tm.assert_almost_equal(result, expected)
result = df.set_index(1).to_records(index=True)
tm.assert_almost_equal(result, expected)
result = df.set_index([1, 2]).to_records(index=True)
tm.assert_almost_equal(result, expected)
def test_to_records_with_unicode_column_names(self):
# xref issue: https://github.com/numpy/numpy/issues/2407
# Issue GH#11879. to_records used to raise an exception when used
# with column names containing non-ascii characters in Python 2
result = DataFrame(data={"accented_name_é": [1.0]}).to_records()
# Note that numpy allows for unicode field names but dtypes need
# to be specified using dictionary instead of list of tuples.
expected = np.rec.array(
[(0, 1.0)],
dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
)
tm.assert_almost_equal(result, expected)
def test_to_records_with_categorical(self):
# GH#8626
# dict creation
df = DataFrame({"A": list("abc")}, dtype="category")
expected = Series(list("abc"), dtype="category", name="A")
tm.assert_series_equal(df["A"], expected)
# list-like creation
df = DataFrame(list("abc"), dtype="category")
expected = Series(list("abc"), dtype="category", name=0)
tm.assert_series_equal(df[0], expected)
# to record array
# this coerces
result = df.to_records()
expected = np.rec.array(
[(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # No dtypes --> default to array dtypes.
        (
            {},
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Should have no effect in this case.
        (
            {"index": True},
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Column dtype applied across the board. Index unaffected.
        (
            {"column_dtypes": f"{tm.ENDIAN}U4"},
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", f"{tm.ENDIAN}U4"),
                    ("B", f"{tm.ENDIAN}U4"),
                    ("C", f"{tm.ENDIAN}U4"),
                ],
            ),
        ),
        # Index dtype applied across the board. Columns unaffected.
        (
            {"index_dtypes": f"{tm.ENDIAN}U1"},
            np.rec.array(
                [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}U1"),
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Pass in a type instance.
        (
            {"column_dtypes": str},
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", f"{tm.ENDIAN}U"),
                    ("B", f"{tm.ENDIAN}U"),
                    ("C", f"{tm.ENDIAN}U"),
                ],
            ),
        ),
        # Pass in a dtype instance.
        (
            {"column_dtypes": np.dtype(np.str_)},
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", f"{tm.ENDIAN}U"),
                    ("B", f"{tm.ENDIAN}U"),
                    ("C", f"{tm.ENDIAN}U"),
                ],
            ),
        ),
        # Pass in a dictionary (name-only).
        (
            {
                "column_dtypes": {
                    "A": np.int8,
                    "B": np.float32,
                    "C": f"{tm.ENDIAN}U2",
                }
            },
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", "i1"),
                    ("B", f"{tm.ENDIAN}f4"),
                    ("C", f"{tm.ENDIAN}U2"),
                ],
            ),
        ),
        # Pass in a dictionary (indices-only).
        (
            {"index_dtypes": {0: "int16"}},
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[
                    ("index", "i2"),
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Ignore index mappings if index is not True.
        (
            {"index": False, "index_dtypes": f"{tm.ENDIAN}U2"},
            np.rec.array(
                [(1, 0.2, "a"), (2, 1.5, "bc")],
                dtype=[
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Non-existent names / indices in mapping should not error.
        (
            {"index_dtypes": {0: "int16", "not-there": "float32"}},
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[
                    ("index", "i2"),
                    ("A", f"{tm.ENDIAN}i8"),
                    ("B", f"{tm.ENDIAN}f8"),
                    ("C", "O"),
                ],
            ),
        ),
        # Names / indices not in mapping default to array dtype.
        (
            {"column_dtypes": {"A": np.int8, "B": np.float32}},
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", "i1"),
                    ("B", f"{tm.ENDIAN}f4"),
                    ("C", "O"),
                ],
            ),
        ),
        # Names / indices not in dtype mapping default to array dtype.
        (
            {"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}},
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}i8"),
                    ("A", "i1"),
                    ("B", f"{tm.ENDIAN}f4"),
                    ("C", "O"),
                ],
            ),
        ),
        # Mixture of everything.
        (
            {
                "column_dtypes": {"A": np.int8, "B": np.float32},
                "index_dtypes": f"{tm.ENDIAN}U2",
            },
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[
                    ("index", f"{tm.ENDIAN}U2"),
                    ("A", "i1"),
                    ("B", f"{tm.ENDIAN}f4"),
                    ("C", "O"),
                ],
            ),
        ),
        # Invalid dtype values.
        (
            {"index": False, "column_dtypes": []},
            (ValueError, "Invalid dtype \\[\\] specified for column A"),
        ),
        (
            {"index": False, "column_dtypes": {"A": "int32", "B": 5}},
            (ValueError, "Invalid dtype 5 specified for column B"),
        ),
        # Numpy can't handle EA types, so check error is raised
        (
            {
                "index": False,
                "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])},
            },
            (ValueError, "Invalid dtype category specified for column B"),
        ),
        # Check that bad types raise
        (
            {"index": False, "column_dtypes": {"A": "int32", "B": "foo"}},
            (TypeError, "data type [\"']foo[\"'] not understood"),
        ),
    ],
)
def test_to_records_dtype(self, kwargs, expected):
    """Exercise ``column_dtypes`` / ``index_dtypes`` handling (see GH#18146).

    ``expected`` is either the record array the call must produce, or an
    ``(exception_type, message_regex)`` pair the call must raise.
    """
    frame = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

    if isinstance(expected, np.rec.recarray):
        tm.assert_almost_equal(frame.to_records(**kwargs), expected)
        return

    error_type, error_pattern = expected[0], expected[1]
    with pytest.raises(error_type, match=error_pattern):
        frame.to_records(**kwargs)
@pytest.mark.parametrize(
    "df,kwargs,expected",
    [
        # MultiIndex in the index.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
            ).set_index(["a", "b"]),
            {"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}},
            np.rec.array(
                [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                dtype=[
                    ("a", f"{tm.ENDIAN}i4"),
                    ("b", "i1"),
                    ("c", f"{tm.ENDIAN}f8"),
                ],
            ),
        ),
        # MultiIndex in the columns.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                columns=MultiIndex.from_tuples(
                    [("a", "d"), ("b", "e"), ("c", "f")]
                ),
            ),
            {
                "column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"},
                "index_dtypes": "float32",
            },
            np.rec.array(
                [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
                dtype=[
                    ("index", f"{tm.ENDIAN}f4"),
                    ("('a', 'd')", f"{tm.ENDIAN}U1"),
                    ("('b', 'e')", f"{tm.ENDIAN}i8"),
                    ("('c', 'f')", f"{tm.ENDIAN}f4"),
                ],
            ),
        ),
        # MultiIndex in both the columns and index.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                columns=MultiIndex.from_tuples(
                    [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
                ),
                index=MultiIndex.from_tuples(
                    [("d", -4), ("d", -5), ("f", -6)], names=list("cd")
                ),
            ),
            {
                "column_dtypes": "float64",
                "index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"},
            },
            np.rec.array(
                [
                    ("d", -4, 1.0, 2.0, 3.0),
                    ("d", -5, 4.0, 5.0, 6.0),
                    ("f", -6, 7, 8, 9.0),
                ],
                dtype=[
                    ("c", f"{tm.ENDIAN}U2"),
                    ("d", "i1"),
                    ("('a', 'd')", f"{tm.ENDIAN}f8"),
                    ("('b', 'e')", f"{tm.ENDIAN}f8"),
                    ("('c', 'f')", f"{tm.ENDIAN}f8"),
                ],
            ),
        ),
    ],
)
def test_to_records_dtype_mi(self, df, kwargs, expected):
    """Dtype mappings keyed by position work with MultiIndex axes.

    See GH#18146.
    """
    produced = df.to_records(**kwargs)
    tm.assert_almost_equal(produced, expected)
def test_to_records_dict_like(self):
# see GH#18146
class DictLike:
def __init__(self, **kwargs) -> None:
self.d = kwargs.copy()
def __getitem__(self, key):
return self.d.__getitem__(key)
def __contains__(self, key) -> bool:
return key in self.d
def keys(self):
return self.d.keys()
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
dtype_mappings = {
"column_dtypes": DictLike(A=np.int8, B=np.float32),
"index_dtypes": f"{tm.ENDIAN}U2",
}
result = df.to_records(**dtype_mappings)
expected = np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[
("index", f"{tm.ENDIAN}U2"),
("A", "i1"),
("B", f"{tm.ENDIAN}f4"),
("C", "O"),
],
)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
def test_to_records_datetimeindex_with_tz(self, tz):
    """Records are identical before and after ``tz_convert("UTC")`` (GH#13937)."""
    stamps = date_range("2016-01-01", periods=10, freq="s", tz=tz)
    frame = DataFrame({"datetime": stamps}, index=stamps)

    in_original_tz = frame.to_records()
    in_utc = frame.tz_convert("UTC").to_records()

    # both converted to UTC, so they are equal
    tm.assert_numpy_array_equal(in_utc, in_original_tz)

View File

@ -0,0 +1,154 @@
from datetime import timedelta
import numpy as np
import pytest
from pandas import (
DataFrame,
DatetimeIndex,
PeriodIndex,
Series,
Timedelta,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
def _get_with_delta(delta, freq="YE-DEC"):
return date_range(
to_datetime("1/1/2001") + delta,
to_datetime("12/31/2009") + delta,
freq=freq,
)
class TestToTimestamp:
def test_to_timestamp(self, frame_or_series):
    """Check ``to_timestamp`` on a yearly PeriodIndex for several frequencies.

    Covers ``how="end"`` at day, hour, minute and second resolution, plus
    ``how="start"`` with and without an explicit frequency.

    Parameters
    ----------
    frame_or_series : type
        ``DataFrame`` or ``Series``; the object under test is built with
        ``tm.get_obj``.
    """
    K = 5
    index = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
    obj = DataFrame(
        np.random.default_rng(2).standard_normal((len(index), K)),
        index=index,
        columns=["A", "B", "C", "D", "E"],
    )
    obj["mix"] = "a"
    obj = tm.get_obj(obj, frame_or_series)

    one_ns = Timedelta(1, "ns")

    # End of period at daily resolution: last nanosecond of Dec 31.
    exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC")
    exp_index = exp_index + Timedelta(1, "D") - one_ns
    result = obj.to_timestamp("D", "end")
    tm.assert_index_equal(result.index, exp_index)
    tm.assert_numpy_array_equal(result.values, obj.values)
    if frame_or_series is Series:
        assert result.name == "A"

    # Start of period, with and without an explicit frequency.
    exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN")
    result = obj.to_timestamp("D", "start")
    tm.assert_index_equal(result.index, exp_index)

    result = obj.to_timestamp(how="start")
    tm.assert_index_equal(result.index, exp_index)

    # End of period at sub-daily resolutions. The lowercase aliases match
    # the spelling used by the other tests in this module; the uppercase
    # "H"/"T"/"S" forms are deprecated in pandas 2.2.
    sub_daily_cases = [
        ("h", timedelta(hours=23), Timedelta(1, "h")),
        ("min", timedelta(hours=23, minutes=59), Timedelta(1, "m")),
        ("s", timedelta(hours=23, minutes=59, seconds=59), Timedelta(1, "s")),
    ]
    for alias, delta, one_unit in sub_daily_cases:
        result = obj.to_timestamp(alias, "end")
        exp_index = _get_with_delta(delta) + one_unit - one_ns
        tm.assert_index_equal(result.index, exp_index)
def test_to_timestamp_columns(self):
    """Check ``to_timestamp(axis=1)`` on yearly period column labels.

    Mirrors the index-based test on a transposed frame, then checks that a
    start-of-period conversion yields a DatetimeIndex with an inferred
    ``YS-JAN`` frequency for both ``"5min"`` and ``"min"``.
    """
    K = 5
    index = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
    df = DataFrame(
        np.random.default_rng(2).standard_normal((len(index), K)),
        index=index,
        columns=["A", "B", "C", "D", "E"],
    )
    df["mix"] = "a"

    # Transpose so the periods become the column labels.
    df = df.T

    one_ns = Timedelta(1, "ns")

    exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC")
    exp_index = exp_index + Timedelta(1, "D") - one_ns
    result = df.to_timestamp("D", "end", axis=1)
    tm.assert_index_equal(result.columns, exp_index)
    tm.assert_numpy_array_equal(result.values, df.values)

    exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN")
    result = df.to_timestamp("D", "start", axis=1)
    tm.assert_index_equal(result.columns, exp_index)

    # End of period at sub-daily resolutions. All three aliases use the
    # lowercase spelling; the uppercase "H"/"S" forms are deprecated in
    # pandas 2.2.
    delta = timedelta(hours=23)
    result = df.to_timestamp("h", "end", axis=1)
    exp_index = _get_with_delta(delta)
    exp_index = exp_index + Timedelta(1, "h") - one_ns
    tm.assert_index_equal(result.columns, exp_index)

    delta = timedelta(hours=23, minutes=59)
    result = df.to_timestamp("min", "end", axis=1)
    exp_index = _get_with_delta(delta)
    exp_index = exp_index + Timedelta(1, "m") - one_ns
    tm.assert_index_equal(result.columns, exp_index)

    result = df.to_timestamp("s", "end", axis=1)
    delta = timedelta(hours=23, minutes=59, seconds=59)
    exp_index = _get_with_delta(delta)
    exp_index = exp_index + Timedelta(1, "s") - one_ns
    tm.assert_index_equal(result.columns, exp_index)

    result1 = df.to_timestamp("5min", axis=1)
    result2 = df.to_timestamp("min", axis=1)
    expected = date_range("2001-01-01", "2009-01-01", freq="YS")
    assert isinstance(result1.columns, DatetimeIndex)
    assert isinstance(result2.columns, DatetimeIndex)
    tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
    tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
    # PeriodIndex.to_timestamp always use 'infer'
    assert result1.columns.freqstr == "YS-JAN"
    assert result2.columns.freqstr == "YS-JAN"
def test_to_timestamp_invalid_axis(self):
    """``axis=2`` raises ValueError mentioning the axis."""
    periods = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
    values = np.random.default_rng(2).standard_normal((len(periods), 5))
    frame = DataFrame(values, index=periods)

    # invalid axis
    with pytest.raises(ValueError, match="axis"):
        frame.to_timestamp(axis=2)
def test_to_timestamp_hourly(self, frame_or_series):
index = period_range(freq="h", start="1/1/2001", end="1/2/2001")
obj = Series(1, index=index, name="foo")
if frame_or_series is not Series:
obj = obj.to_frame()
exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h")
result = obj.to_timestamp(how="end")
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
tm.assert_index_equal(result.index, exp_index)
if frame_or_series is Series:
assert result.name == "foo"
def test_to_timestamp_raises(self, index, frame_or_series):
    """``to_timestamp`` raises TypeError when the index is not a PeriodIndex.

    GH#33327.
    """
    obj = frame_or_series(index=index, dtype=object)

    if isinstance(index, PeriodIndex):
        # Nothing is asserted for period indexes.
        return

    expected_message = f"unsupported Type {type(index).__name__}"
    with pytest.raises(TypeError, match=expected_message):
        obj.to_timestamp()

View File

@ -0,0 +1,209 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
IntervalIndex,
Series,
Timestamp,
bdate_range,
date_range,
timedelta_range,
)
import pandas._testing as tm
class TestTranspose:
def test_transpose_td64_intervals(self):
# GH#44917
tdi = timedelta_range("0 Days", "3 Days")
ii = IntervalIndex.from_breaks(tdi)
ii = ii.insert(-1, np.nan)
df = DataFrame(ii)
result = df.T
expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))})
tm.assert_frame_equal(result, expected)
def test_transpose_empty_preserves_datetimeindex(self):
# GH#41382
dti = DatetimeIndex([], dtype="M8[ns]")
df = DataFrame(index=dti)
expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None)
result1 = df.T.sum().index
result2 = df.sum(axis=1).index
tm.assert_index_equal(result1, expected)
tm.assert_index_equal(result2, expected)
def test_transpose_tzaware_1col_single_tz(self):
# GH#26825
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
df = DataFrame(dti)
assert (df.dtypes == dti.dtype).all()
res = df.T
assert (res.dtypes == dti.dtype).all()
def test_transpose_tzaware_2col_single_tz(self):
# GH#26825
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
df3 = DataFrame({"A": dti, "B": dti})
assert (df3.dtypes == dti.dtype).all()
res3 = df3.T
assert (res3.dtypes == dti.dtype).all()
def test_transpose_tzaware_2col_mixed_tz(self):
# GH#26825
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
dti2 = dti.tz_convert("US/Pacific")
df4 = DataFrame({"A": dti, "B": dti2})
assert (df4.dtypes == [dti.dtype, dti2.dtype]).all()
assert (df4.T.dtypes == object).all()
tm.assert_frame_equal(df4.T.T, df4.astype(object))
@pytest.mark.parametrize("tz", [None, "America/New_York"])
def test_transpose_preserves_dtindex_equality_with_dst(self, tz):
    """A transposed frame compares equal to itself element-wise (GH#19970)."""
    stamps = date_range("20161101", "20161130", freq="4h", tz=tz)
    counts = range(len(stamps))
    frame = DataFrame({"a": counts, "b": counts}, index=stamps)

    comparison = frame.T == frame.T

    wanted = DataFrame(True, index=list("ab"), columns=stamps)
    tm.assert_frame_equal(comparison, wanted)
def test_transpose_object_to_tzaware_mixed_tz(self):
# GH#26825
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
dti2 = dti.tz_convert("US/Pacific")
# mixed all-tzaware dtypes
df2 = DataFrame([dti, dti2])
assert (df2.dtypes == object).all()
res2 = df2.T
assert (res2.dtypes == object).all()
def test_transpose_uint64(self):
df = DataFrame(
{"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]},
dtype=np.uint64,
)
result = df.T
expected = DataFrame(df.values.T)
expected.index = ["A", "B"]
tm.assert_frame_equal(result, expected)
def test_transpose_float(self, float_frame):
frame = float_frame
dft = frame.T
for idx, series in dft.items():
for col, value in series.items():
if np.isnan(value):
assert np.isnan(frame[col][idx])
else:
assert value == frame[col][idx]
def test_transpose_mixed(self):
# mixed type
mixed = DataFrame(
{
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
"D": bdate_range("1/1/2009", periods=5),
},
index=Index(["a", "b", "c", "d", "e"], dtype=object),
)
mixed_T = mixed.T
for col, s in mixed_T.items():
assert s.dtype == np.object_
@td.skip_array_manager_invalid_test
def test_transpose_get_view(self, float_frame, using_copy_on_write):
    """Writes to the transpose reach the original only without copy-on-write."""
    transposed = float_frame.T
    transposed.iloc[:, 5:10] = 5

    # Columns 5..9 of the transpose are rows 5..9 of the original.
    touched_rows = float_frame.values[5:10]
    if using_copy_on_write:
        assert (touched_rows != 5).all()
    else:
        assert (touched_rows == 5).all()
@td.skip_array_manager_invalid_test
def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
    """Transposing a single-block tz-aware frame shares memory with its source."""
    stamps = date_range("2016-01-01", periods=6, tz="US/Pacific")
    two_dim = stamps._data.reshape(3, 2)
    frame = DataFrame(two_dim)
    assert frame._mgr.nblocks == 1

    transposed = frame.T
    assert transposed._mgr.nblocks == 1

    transposed_values = transposed._mgr.blocks[0].values
    if using_copy_on_write:
        # Compare against the frame's own block values in this mode.
        source_values = frame._mgr.blocks[0].values
    else:
        # Compare against the array handed to the constructor.
        source_values = two_dim
    assert np.shares_memory(source_values._ndarray, transposed_values._ndarray)
def test_transpose_not_inferring_dt(self):
# GH#51546
df = DataFrame(
{
"a": [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
},
dtype=object,
)
result = df.T
expected = DataFrame(
[[Timestamp("2019-12-31"), Timestamp("2019-12-31")]],
columns=[0, 1],
index=["a"],
dtype=object,
)
tm.assert_frame_equal(result, expected)
def test_transpose_not_inferring_dt_mixed_blocks(self):
# GH#51546
df = DataFrame(
{
"a": Series(
[Timestamp("2019-12-31"), Timestamp("2019-12-31")], dtype=object
),
"b": [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
}
)
result = df.T
expected = DataFrame(
[
[Timestamp("2019-12-31"), Timestamp("2019-12-31")],
[Timestamp("2019-12-31"), Timestamp("2019-12-31")],
],
columns=[0, 1],
index=["a", "b"],
dtype=object,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype1", ["Int64", "Float64"])
@pytest.mark.parametrize("dtype2", ["Int64", "Float64"])
def test_transpose(self, dtype1, dtype2):
    """Blocks of a transposed masked-dtype frame are F-contiguous."""
    # GH#57315 - transpose should have F contiguous blocks
    frame = DataFrame(
        {
            "a": pd.array([1, 1, 2], dtype=dtype1),
            "b": pd.array([3, 4, 5], dtype=dtype2),
        }
    )
    transposed = frame.T
    same_dtype = dtype1 == dtype2
    for block in transposed._mgr.blocks:
        if same_dtype:
            buffer = block.values._data
        else:
            # When dtypes are unequal, we get NumPy object array
            buffer = block.values
        assert buffer.flags["F_CONTIGUOUS"]

View File

@ -0,0 +1,154 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameTruncate:
    """Tests for ``truncate`` on DataFrame and, via ``frame_or_series``, Series."""

    def test_truncate(self, datetime_frame, frame_or_series):
        """Truncate with no bounds, both bounds, one bound, and inverted bounds."""
        obj = tm.get_obj(datetime_frame[::3], frame_or_series)
        full_index = datetime_frame.index

        # ``obj`` keeps every third row, so labels 3 and 6 are present in it
        # while labels 2 and 7 are not.
        start, end = full_index[3], full_index[6]
        start_missing, end_missing = full_index[2], full_index[7]

        # neither specified
        tm.assert_equal(obj.truncate(), obj)

        # both specified
        expected = obj[1:3]
        for before, after in ((start, end), (start_missing, end_missing)):
            tm.assert_equal(obj.truncate(before, after), expected)

        # start specified
        expected = obj[1:]
        for before in (start, start_missing):
            tm.assert_equal(obj.truncate(before=before), expected)

        # end specified
        expected = obj[:3]
        for after in (end, end_missing):
            tm.assert_equal(obj.truncate(after=after), expected)

        # corner case, empty series/frame returned
        assert len(obj.truncate(after=obj.index[0] - obj.index.freq)) == 0
        assert len(obj.truncate(before=obj.index[-1] + obj.index.freq)) == 0

        msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00"
        with pytest.raises(ValueError, match=msg):
            obj.truncate(
                before=obj.index[-1] - obj.index.freq,
                after=obj.index[0] + obj.index.freq,
            )

    def test_truncate_nonsortedindex(self, frame_or_series):
        """An unsorted integer index makes ``truncate`` raise ``ValueError``."""
        # GH#17935
        unsorted = DataFrame(
            {"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]
        )
        unsorted = tm.get_obj(unsorted, frame_or_series)

        with pytest.raises(ValueError, match="truncate requires a sorted index"):
            unsorted.truncate(before=3, after=9)

    def test_sort_values_nonsortedindex(self):
        """A frame reordered by ``sort_values`` makes ``truncate`` raise."""
        weekly = date_range("2011-01-01", "2012-01-01", freq="W")
        n_rows = len(weekly)
        frame = DataFrame(
            {
                name: np.random.default_rng(2).standard_normal(n_rows)
                for name in ("A", "B")
            },
            index=weekly,
        )
        by_value = frame.sort_values("A", ascending=False)

        with pytest.raises(ValueError, match="truncate requires a sorted index"):
            by_value.truncate(before="2011-11", after="2011-12")

    def test_truncate_nonsortedindex_axis1(self):
        """Unsorted column labels make ``truncate(axis=1)`` raise ``ValueError``."""
        # GH#17935
        labels = [3, 20, 2, 0]
        frame = DataFrame(
            {label: np.random.default_rng(2).standard_normal(5) for label in labels},
            columns=labels,
        )

        with pytest.raises(ValueError, match="truncate requires a sorted index"):
            frame.truncate(before=2, after=20, axis=1)

    @pytest.mark.parametrize(
        "before, after, indices",
        [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])],
    )
    @pytest.mark.parametrize("dtyp", [*tm.ALL_REAL_NUMPY_DTYPES, "datetime64[ns]"])
    def test_truncate_decreasing_index(
        self, before, after, indices, dtyp, frame_or_series
    ):
        """``truncate`` on a decreasing index selects the same rows as ``.loc``."""
        # https://github.com/pandas-dev/pandas/issues/33756
        descending = Index([3, 2, 1, 0], dtype=dtyp)
        if isinstance(descending, DatetimeIndex):
            # Bounds and expected labels must be Timestamps for a datetime index.
            if before is not None:
                before = pd.Timestamp(before)
            if after is not None:
                after = pd.Timestamp(after)
            indices = [pd.Timestamp(label) for label in indices]

        obj = frame_or_series(range(len(descending)), index=descending)
        result = obj.truncate(before=before, after=after)
        tm.assert_equal(result, obj.loc[indices])

    def test_truncate_multiindex(self, frame_or_series):
        """``truncate`` on a MultiIndex keeps rows whose first level is in range."""
        # GH 34564
        mi = pd.MultiIndex.from_product(
            [[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]
        )
        obj = DataFrame(range(mi.shape[0]), index=mi, columns=["col"])
        obj = tm.get_obj(obj, frame_or_series)
        result = obj.truncate(before=2, after=3)

        flat = DataFrame.from_dict(
            {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
        )
        expected = tm.get_obj(flat.set_index(["L1", "L2"]), frame_or_series)
        tm.assert_equal(result, expected)

    def test_truncate_index_only_one_unique_value(self, frame_or_series):
        """An index repeating a single date is returned unchanged by ``truncate``."""
        # GH 42365
        single_day = date_range("2021-06-30", "2021-06-30")
        obj = Series(0, index=single_day).repeat(5)
        if frame_or_series is DataFrame:
            obj = obj.to_frame(name="a")

        tm.assert_equal(obj.truncate("2021-06-28", "2021-07-01"), obj)

View File

@ -0,0 +1,131 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
date_range,
)
import pandas._testing as tm
class TestTZConvert:
    """Tests for ``tz_convert`` (and the code paths it shares with ``tz_localize``)."""

    def test_tz_convert(self, frame_or_series):
        """Converting a US/Eastern index to Europe/Berlin matches ``Index.tz_convert``."""
        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")

        obj = DataFrame({"a": 1}, index=rng)
        obj = tm.get_obj(obj, frame_or_series)

        result = obj.tz_convert("Europe/Berlin")
        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
        expected = tm.get_obj(expected, frame_or_series)

        # ``str(tz)`` yields the zone name for both pytz and zoneinfo objects;
        # the ``.zone`` attribute exists only on pytz timezones.
        assert str(result.index.tz) == "Europe/Berlin"
        tm.assert_equal(result, expected)

    def test_tz_convert_axis1(self):
        """``tz_convert(axis=1)`` converts the column labels of a transposed frame."""
        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")

        obj = DataFrame({"a": 1}, index=rng)
        obj = obj.T

        result = obj.tz_convert("Europe/Berlin", axis=1)
        # See test_tz_convert for why str() is used instead of ``.zone``.
        assert str(result.columns.tz) == "Europe/Berlin"

        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
        tm.assert_equal(result, expected.T)

    def test_tz_convert_naive(self, frame_or_series):
        """Converting a tz-naive index raises ``TypeError``."""
        # can't convert tz-naive
        rng = date_range("1/1/2011", periods=200, freq="D")
        ts = Series(1, index=rng)
        ts = frame_or_series(ts)

        with pytest.raises(TypeError, match="Cannot convert tz-naive"):
            ts.tz_convert("US/Eastern")

    @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"])
    def test_tz_convert_and_localize(self, fn):
        """Run ``fn`` on flat and MultiIndex frames, then on invalid inputs.

        ``fn`` is the method name looked up on both the index and the frame.
        For ``tz_convert`` the inputs are first localized to UTC so that the
        conversion is legal.
        """
        l0 = date_range("20140701", periods=5, freq="D")
        l1 = date_range("20140701", periods=5, freq="D")

        int_idx = Index(range(5))

        if fn == "tz_convert":
            l0 = l0.tz_localize("UTC")
            l1 = l1.tz_localize("UTC")

        for idx in [l0, l1]:
            l0_expected = getattr(idx, fn)("US/Pacific")
            l1_expected = getattr(idx, fn)("US/Pacific")

            df1 = DataFrame(np.ones(5), index=l0)
            df1 = getattr(df1, fn)("US/Pacific")
            tm.assert_index_equal(df1.index, l0_expected)

            # MultiIndex
            # GH7846
            df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))

            # freq is not preserved in MultiIndex construction
            l1_expected = l1_expected._with_freq(None)
            l0_expected = l0_expected._with_freq(None)
            l1 = l1._with_freq(None)
            l0 = l0._with_freq(None)

            # Only the requested level changes; the other level is untouched.
            df3 = getattr(df2, fn)("US/Pacific", level=0)
            assert not df3.index.levels[0].equals(l0)
            tm.assert_index_equal(df3.index.levels[0], l0_expected)
            tm.assert_index_equal(df3.index.levels[1], l1)
            assert not df3.index.levels[1].equals(l1_expected)

            df3 = getattr(df2, fn)("US/Pacific", level=1)
            tm.assert_index_equal(df3.index.levels[0], l0)
            assert not df3.index.levels[0].equals(l0_expected)
            tm.assert_index_equal(df3.index.levels[1], l1_expected)
            assert not df3.index.levels[1].equals(l1)

            # Integer level next to a datetime level. The result is captured
            # and checked here; previously it was discarded and the df3
            # assertions above were simply repeated.
            df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
            df5 = getattr(df4, fn)("US/Pacific", level=1)
            tm.assert_index_equal(df5.index.levels[0], df4.index.levels[0])
            tm.assert_index_equal(df5.index.levels[1], l0_expected)
            assert not df5.index.levels[1].equals(l0)

        # Bad Inputs
        # Frames are built outside ``pytest.raises`` so that only the call
        # under test can satisfy the expected exception.

        # Not DatetimeIndex / PeriodIndex
        df = DataFrame(index=int_idx)
        with pytest.raises(TypeError, match="DatetimeIndex"):
            getattr(df, fn)("US/Pacific")

        # Not DatetimeIndex / PeriodIndex
        df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
        with pytest.raises(TypeError, match="DatetimeIndex"):
            getattr(df, fn)("US/Pacific", level=0)

        # Invalid level
        df = DataFrame(index=l0)
        with pytest.raises(ValueError, match="not valid"):
            getattr(df, fn)("US/Pacific", level=1)

    @pytest.mark.parametrize("copy", [True, False])
    def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series):
        """``tz_convert`` returns a new object and leaves the caller unchanged."""
        # GH#6326
        obj = frame_or_series(
            np.arange(0, 5),
            index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"),
        )
        orig = obj.copy()
        result = obj.tz_convert("UTC", copy=copy)
        expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC"))
        tm.assert_equal(result, expected)
        tm.assert_equal(obj, orig)
        assert result.index is not obj.index
        assert result is not obj

View File

@ -0,0 +1,68 @@
from datetime import timezone
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
class TestTZLocalize:
    """Tests for ``tz_localize`` on DataFrame and Series."""

    # See also:
    # test_tz_convert_and_localize in test_tz_convert

    def test_tz_localize(self, frame_or_series):
        """Localizing a naive hourly index to UTC matches ``Index.tz_localize``."""
        naive = date_range("1/1/2011", periods=100, freq="h")
        obj = tm.get_obj(DataFrame({"a": 1}, index=naive), frame_or_series)

        result = obj.tz_localize("utc")

        expected = tm.get_obj(
            DataFrame({"a": 1}, naive.tz_localize("UTC")), frame_or_series
        )
        assert result.index.tz is timezone.utc
        tm.assert_equal(result, expected)

    def test_tz_localize_axis1(self):
        """``tz_localize(axis=1)`` localizes the column labels."""
        naive = date_range("1/1/2011", periods=100, freq="h")
        wide = DataFrame({"a": 1}, index=naive).T

        result = wide.tz_localize("utc", axis=1)
        assert result.columns.tz is timezone.utc

        expected = DataFrame({"a": 1}, naive.tz_localize("UTC"))
        tm.assert_frame_equal(result, expected.T)

    def test_tz_localize_naive(self, frame_or_series):
        """Localizing an index that already has a timezone raises ``TypeError``."""
        # Can't localize if already tz-aware
        aware = date_range("1/1/2011", periods=100, freq="h", tz="utc")
        obj = frame_or_series(Series(1, index=aware))

        with pytest.raises(TypeError, match="Already tz-aware"):
            obj.tz_localize("US/Eastern")

    @pytest.mark.parametrize("copy", [True, False])
    def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series):
        """``tz_localize`` returns a new object and leaves the caller unchanged."""
        # GH#6326
        naive_index = date_range("20131027", periods=5, freq="1h", tz=None)
        obj = frame_or_series(np.arange(0, 5), index=naive_index)
        snapshot = obj.copy()

        result = obj.tz_localize("UTC", copy=copy)

        utc_index = date_range("20131027", periods=5, freq="1h", tz="UTC")
        expected = frame_or_series(np.arange(0, 5), index=utc_index)
        tm.assert_equal(result, expected)
        tm.assert_equal(obj, snapshot)
        assert result.index is not obj.index
        assert result is not obj

View File

@ -0,0 +1,204 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameUpdate:
    """Tests for the in-place ``DataFrame.update`` method."""

    def test_update_nan(self):
        """With ``overwrite=False`` only missing values in the target are filled."""
        # #15593 #15617
        # test 1
        target = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
        other = DataFrame({"A": [None, 2, 3]})
        expected = target.copy()
        target.update(other, overwrite=False)
        tm.assert_frame_equal(target, expected)

        # test 2
        target = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
        other = DataFrame({"A": [None, 2, 3]})
        expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
        target.update(other, overwrite=False)
        tm.assert_frame_equal(target, expected)

    def test_update(self):
        """Non-NaN values of ``other`` replace aligned values in the target."""
        target = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])

        target.update(other)

        expected = DataFrame(
            [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
        )
        tm.assert_frame_equal(target, expected)

    def test_update_dtypes(self):
        """Float, int and bool column dtypes are as expected after an update."""
        # gh 3016
        all_columns = ["A", "B", "int", "bool1", "bool2"]
        target = DataFrame(
            [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]],
            columns=all_columns,
        )
        other = DataFrame(
            [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"]
        )

        target.update(other)

        expected = DataFrame(
            [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]],
            columns=all_columns,
        )
        tm.assert_frame_equal(target, expected)

    def test_update_nooverwrite(self):
        """``overwrite=False`` leaves existing non-NaN target values alone."""
        target = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])

        target.update(other, overwrite=False)

        expected = DataFrame(
            [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
        )
        tm.assert_frame_equal(target, expected)

    def test_update_filtered(self):
        """``filter_func`` restricts which existing target values may be replaced."""
        target = DataFrame(
            [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])

        target.update(other, filter_func=lambda x: x > 2)

        expected = DataFrame(
            [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
        )
        tm.assert_frame_equal(target, expected)

    @pytest.mark.parametrize(
        "bad_kwarg, exception, msg",
        [
            # errors must be 'ignore' or 'raise'
            ({"errors": "something"}, ValueError, "The parameter errors must.*"),
            ({"join": "inner"}, NotImplementedError, "Only left join is supported"),
        ],
    )
    def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
        """Unsupported ``errors``/``join`` arguments raise the listed exception."""
        frame = DataFrame([[1.5, 1, 3.0]])
        with pytest.raises(exception, match=msg):
            frame.update(frame, **bad_kwarg)

    def test_update_raise_on_overlap(self):
        """``errors="raise"`` raises when both frames hold a value at the same spot."""
        target = DataFrame(
            [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
        )
        other = DataFrame(
            [[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]
        )
        with pytest.raises(ValueError, match="Data overlaps"):
            target.update(other, errors="raise")

    def test_update_from_non_df(self):
        """``update`` accepts a plain dict of Series or of lists."""
        # First pass wraps the values in Series, second pass uses raw lists.
        for wrap in (Series, list):
            data = {"a": wrap([1, 2, 3, 4]), "b": wrap([5, 6, 7, 8])}
            frame = DataFrame(data)

            data["a"] = wrap([5, 6, 7, 8])
            frame.update(data)

            tm.assert_frame_equal(frame, DataFrame(data))

    def test_update_datetime_tz(self):
        """Updating a tz-aware frame with itself emits no warning."""
        # GH 25807
        result = DataFrame([pd.Timestamp("2019", tz="UTC")])
        with tm.assert_produces_warning(None):
            result.update(result)

        expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
        tm.assert_frame_equal(result, expected)

    def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write):
        """A slice view follows the update only when copy-on-write is off."""
        # https://github.com/pandas-dev/pandas/issues/56227
        result = DataFrame([pd.Timestamp("2019", tz="UTC")])
        snapshot = result.copy()
        view = result[:]

        expected_warning = FutureWarning if warn_copy_on_write else None
        with tm.assert_produces_warning(expected_warning, match="Setting a value"):
            result.update(result + pd.Timedelta(days=1))

        expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")])
        tm.assert_frame_equal(result, expected)

        if using_copy_on_write:
            tm.assert_frame_equal(view, snapshot)
        else:
            tm.assert_frame_equal(view, expected)

    def test_update_with_different_dtype(self, using_copy_on_write):
        """Writing a string into an all-NaN float column warns and gives object dtype."""
        # GH#3217
        frame = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
        frame["c"] = np.nan

        replacement = {"c": Series(["foo"], index=[0])}
        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
            frame.update(replacement)

        expected = DataFrame(
            {
                "a": [1, 3],
                "b": [np.nan, 2],
                "c": Series(["foo", np.nan], dtype="object"),
            }
        )
        tm.assert_frame_equal(frame, expected)

    @td.skip_array_manager_invalid_test
    def test_update_modify_view(
        self, using_copy_on_write, warn_copy_on_write, using_infer_string
    ):
        """A slice view tracks the update unless CoW or string inference is active."""
        # GH#47188
        source = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
        target = DataFrame({"A": ["a", "x"], "B": ["100", "200"]})
        target_before = target.copy()
        view = target[:]

        # TODO(CoW-warn) better warning message
        with tm.assert_cow_warning(warn_copy_on_write):
            target.update(source)

        expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]})
        tm.assert_frame_equal(target, expected)

        view_is_detached = using_copy_on_write or using_infer_string
        if view_is_detached:
            tm.assert_frame_equal(view, target_before)
        else:
            tm.assert_frame_equal(view, expected)

    def test_update_dt_column_with_NaT_create_column(self):
        """A datetime column containing ``NaT`` is untouched by an update of another column."""
        # GH#16713
        target = DataFrame(
            {"A": [1, None], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}
        )
        other = DataFrame({"A": [2, 3]})

        target.update(other, overwrite=False)

        expected = DataFrame(
            {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}
        )
        tm.assert_frame_equal(target, expected)

View File

@ -0,0 +1,205 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_data_frame_value_counts_unsorted():
    """``value_counts(sort=False)`` returns counts 1, 2, 1 for the three row combinations."""
    animals = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = animals.value_counts(sort=False)

    combos = pd.MultiIndex.from_arrays(
        [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 2, 1], index=combos, name="count")
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_ascending():
    """``value_counts(ascending=True)`` lists the least frequent combinations first."""
    animals = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = animals.value_counts(ascending=True)

    combos = pd.MultiIndex.from_arrays(
        [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 1, 2], index=combos, name="count")
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_default():
    """Default ``value_counts`` lists the most frequent combination first."""
    animals = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = animals.value_counts()

    combos = pd.MultiIndex.from_arrays(
        [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[2, 1, 1], index=combos, name="count")
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_normalize():
    """``normalize=True`` returns fractions in a Series named ``proportion``."""
    animals = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = animals.value_counts(normalize=True)

    combos = pd.MultiIndex.from_arrays(
        [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[0.5, 0.25, 0.25], index=combos, name="proportion")
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_single_col_default():
    """A one-column frame still produces a (single-level) MultiIndex result."""
    legs = pd.DataFrame({"num_legs": [2, 4, 4, 6]})

    result = legs.value_counts()

    single_level = pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"])
    expected = pd.Series(data=[2, 1, 1], index=single_level, name="count")
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_empty():
    """An empty frame gives an empty int64 ``count`` Series with an intp index."""
    empty_frame = pd.DataFrame()

    result = empty_frame.value_counts()

    empty_index = np.array([], dtype=np.intp)
    expected = pd.Series([], dtype=np.int64, name="count", index=empty_index)
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_empty_normalize():
    """An empty frame with ``normalize=True`` gives an empty float64 ``proportion`` Series."""
    empty_frame = pd.DataFrame()

    result = empty_frame.value_counts(normalize=True)

    empty_index = np.array([], dtype=np.intp)
    expected = pd.Series(
        [], dtype=np.float64, name="proportion", index=empty_index
    )
    tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_dropna_true(nulls_fixture):
# GH 41334
df = pd.DataFrame(
{
"first_name": ["John", "Anne", "John", "Beth"],
"middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
},
)
result = df.value_counts()
expected = pd.Series(
data=[1, 1],
index=pd.MultiIndex.from_arrays(
[("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
),
name="count",
)
tm.assert_series_equal(result, expected)
def test_data_frame_value_counts_dropna_false(nulls_fixture):
# GH 41334
df = pd.DataFrame(
{
"first_name": ["John", "Anne", "John", "Beth"],
"middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
},
)
result = df.value_counts(dropna=False)
expected = pd.Series(
data=[1, 1, 1, 1],
index=pd.MultiIndex(
levels=[
pd.Index(["Anne", "Beth", "John"]),
pd.Index(["Louise", "Smith", np.nan]),
],
codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
names=["first_name", "middle_name"],
),
name="count",
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1]))
def test_data_frame_value_counts_subset(nulls_fixture, columns):
    """Passing one column label as ``subset`` counts that column only."""
    # GH 50829
    first_col, second_col = columns[0], columns[1]
    names = pd.DataFrame(
        {
            first_col: ["John", "Anne", "John", "Beth"],
            second_col: ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )

    result = names.value_counts(first_col)

    expected_index = pd.Index(["John", "Anne", "Beth"], name=first_col)
    expected = pd.Series(data=[2, 1, 1], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
def test_value_counts_categorical_future_warning():
    """Counting a categorical column keeps the categorical dtype in the result index."""
    # GH#54775
    frame = pd.DataFrame({"a": [1, 2, 3]}, dtype="category")

    result = frame.value_counts()

    categorical_level = pd.Index([1, 2, 3], name="a", dtype="category")
    expected = pd.Series(
        1,
        index=pd.MultiIndex.from_arrays([categorical_level]),
        name="count",
    )
    tm.assert_series_equal(result, expected)
def test_value_counts_with_missing_category():
    """A declared category that never occurs is reported with a count of zero."""
    # GH-54836
    values = pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])
    frame = pd.DataFrame({"a": values})

    result = frame.value_counts()

    # Category 3 is unused, so it comes last with a zero count.
    level = pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")
    expected = pd.Series(
        [1, 1, 1, 0],
        index=pd.MultiIndex.from_arrays([level]),
        name="count",
    )
    tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,280 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
NaT,
Series,
Timestamp,
date_range,
period_range,
)
import pandas._testing as tm
class TestDataFrameValues:
    """Tests for the public ``DataFrame.values`` accessor."""

    @td.skip_array_manager_invalid_test
    def test_values(self, float_frame, using_copy_on_write):
        """``.values`` is writable without copy-on-write and read-only with it."""
        if not using_copy_on_write:
            float_frame.values[:, 0] = 5.0
            assert (float_frame.values[:, 0] == 5).all()
        else:
            with pytest.raises(ValueError, match="read-only"):
                float_frame.values[:, 0] = 5.0
            assert (float_frame.values[:, 0] != 5).all()

    def test_more_values(self, float_string_frame):
        """The second dimension of ``.values`` equals the number of columns."""
        n_columns = len(float_string_frame.columns)
        assert float_string_frame.values.shape[1] == n_columns

    def test_values_mixed_dtypes(self, float_frame, float_string_frame):
        """Cell values of ``.values`` agree with positional access on the frame."""
        labels = float_frame.columns
        for (i, j), value in np.ndenumerate(float_frame.values):
            cell = float_frame[labels[j]].iloc[i]
            if not np.isnan(value):
                assert value == cell
            else:
                assert np.isnan(cell)

        # mixed type
        mixed = float_string_frame[["foo", "A"]].values
        assert mixed[0, 0] == "bar"

        complex_frame = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
        assert complex_frame.values[0, 0] == 1j

    def test_values_duplicates(self):
        """Duplicate column labels do not change the object array returned."""
        rows = [[1, 2, "a", "b"], [1, 2, "a", "b"]]
        frame = DataFrame(rows, columns=["one", "one", "two", "two"])

        expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
        tm.assert_numpy_array_equal(frame.values, expected)

    def test_values_with_duplicate_columns(self):
        """Two columns sharing a label are both present in ``.values``."""
        frame = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        expected = np.array([[1, 2.5], [3, 4.5]])

        matches = frame.values == expected
        assert matches.all().all()

    @pytest.mark.parametrize("constructor", [date_range, period_range])
    def test_values_casts_datetimelike_to_object(self, constructor):
        """A datetime-like column appears as objects once mixed with another dtype."""
        series = Series(constructor("2000-01-01", periods=10, freq="D"))
        expected = series.astype("object")
        n_rows = len(series)

        # First a float companion column, then a string one.
        companions = (
            np.random.default_rng(2).standard_normal(n_rows),
            ["foo"] * n_rows,
        )
        for companion in companions:
            frame = DataFrame({"a": series, "b": companion})
            result = frame.values.squeeze()
            assert (result[:, 0] == expected.values).all()

    def test_frame_values_with_tz(self):
        """tz-aware columns come back as object arrays of ``Timestamp``."""
        tz = "US/Central"
        frame = DataFrame({"A": date_range("2000", periods=4, tz=tz)})

        central = [
            Timestamp("2000-01-01", tz=tz),
            Timestamp("2000-01-02", tz=tz),
            Timestamp("2000-01-03", tz=tz),
            Timestamp("2000-01-04", tz=tz),
        ]
        expected = np.array([[stamp] for stamp in central])
        tm.assert_numpy_array_equal(frame.values, expected)

        # two columns, homogeneous
        frame["B"] = frame["A"]
        expected = np.concatenate([expected, expected], axis=1)
        tm.assert_numpy_array_equal(frame.values, expected)

        # three columns, heterogeneous
        est = "US/Eastern"
        frame["C"] = frame["A"].dt.tz_convert(est)

        eastern = [
            Timestamp("2000-01-01T01:00:00", tz=est),
            Timestamp("2000-01-02T01:00:00", tz=est),
            Timestamp("2000-01-03T01:00:00", tz=est),
            Timestamp("2000-01-04T01:00:00", tz=est),
        ]
        new = np.array([[stamp] for stamp in eastern])
        expected = np.concatenate([expected, new], axis=1)
        tm.assert_numpy_array_equal(frame.values, expected)

    def test_interleave_with_tzaware(self, timezone_frame):
        """Naive and tz-aware columns interleave into one object array."""
        naive_col = [
            Timestamp("2013-01-01 00:00:00"),
            Timestamp("2013-01-02 00:00:00"),
            Timestamp("2013-01-03 00:00:00"),
        ]
        eastern_col = [
            Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
            NaT,
            Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
        ]
        cet_col = [
            Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
            NaT,
            Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
        ]

        # interleave with object
        result = timezone_frame.assign(D="foo").values
        expected = np.array(
            [naive_col, eastern_col, cet_col, ["foo", "foo", "foo"]],
            dtype=object,
        ).T
        tm.assert_numpy_array_equal(result, expected)

        # interleave with only datetime64[ns]
        result = timezone_frame.values
        expected = np.array([naive_col, eastern_col, cet_col], dtype=object).T
        tm.assert_numpy_array_equal(result, expected)

    def test_values_interleave_non_unique_cols(self):
        """Duplicate labels give the same ``.values`` as unique labels would."""
        duplicated = DataFrame(
            [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
            columns=["x", "x"],
            index=[1, 2],
        )

        unique = duplicated.copy()
        unique.columns = ["x", "y"]

        assert unique.values.shape == duplicated.values.shape
        for row in (0, 1):
            tm.assert_numpy_array_equal(unique.values[row], duplicated.values[row])

    def test_values_numeric_cols(self, float_frame):
        """Selecting only float columns yields float64 even if a string column exists."""
        float_frame["foo"] = "bar"

        numeric = float_frame[["A", "B", "C", "D"]].values
        assert numeric.dtype == np.float64

    def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
        """``.values`` uses the expected common dtype for each column subset."""
        # mixed lcd
        float_cases = [
            (["A", "B", "C", "D"], np.float64),
            (["A", "B", "C"], np.float32),
            (["C"], np.float16),
        ]
        for columns, dtype in float_cases:
            assert mixed_float_frame[columns].values.dtype == dtype

        # GH#10364
        int_cases = [
            # B uint64 forces float because there are other signed int types
            (["A", "B", "C", "D"], np.float64),
            (["A", "D"], np.int64),
            # B uint64 forces float because there are other signed int types
            (["A", "B", "C"], np.float64),
            # as B and C are both unsigned, no forcing to float is needed
            (["B", "C"], np.uint64),
            (["A", "C"], np.int32),
            (["C", "D"], np.int64),
            (["A"], np.int32),
            (["C"], np.uint8),
        ]
        for columns, dtype in int_cases:
            assert mixed_int_frame[columns].values.dtype == dtype
class TestPrivateValues:
    """Tests for the private ``DataFrame._values`` accessor."""

    @td.skip_array_manager_invalid_test
    def test_private_values_dt64tz(self, using_copy_on_write):
        """Single-column tz-aware frame: ``_values`` equals the source array."""
        source = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)

        frame = DataFrame(source, columns=["A"])
        tm.assert_equal(frame._values, source)

        if not using_copy_on_write:
            # we have a view
            assert np.shares_memory(frame._values._ndarray, source._ndarray)
        else:
            assert not np.shares_memory(frame._values._ndarray, source._ndarray)

        # TimedeltaArray
        expected_delta = source - source
        delta_frame = frame - frame
        tm.assert_equal(delta_frame._values, expected_delta)

    @td.skip_array_manager_invalid_test
    def test_private_values_dt64tz_multicol(self, using_copy_on_write):
        """Two-column tz-aware frame: ``_values`` equals the source array."""
        source = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)

        frame = DataFrame(source, columns=["A", "B"])
        tm.assert_equal(frame._values, source)

        if not using_copy_on_write:
            # we have a view
            assert np.shares_memory(frame._values._ndarray, source._ndarray)
        else:
            assert not np.shares_memory(frame._values._ndarray, source._ndarray)

        # TimedeltaArray
        expected_delta = source - source
        delta_frame = frame - frame
        tm.assert_equal(delta_frame._values, expected_delta)

    def test_private_values_dt64_multiblock(self):
        """Two separately added datetime columns are combined by ``_values``."""
        source = date_range("2000", periods=8)._data

        frame = DataFrame({"A": source[:4]}, copy=False)
        frame["B"] = source[4:]
        # The second column was added afterwards, so two arrays are held.
        assert len(frame._mgr.arrays) == 2

        expected = source.reshape(2, 4).T
        tm.assert_equal(frame._values, expected)