Updated script that can be controlled by a Node.js web app
This commit is contained in:
@ -0,0 +1,7 @@
|
||||
"""
|
||||
Test files dedicated to individual (stand-alone) DataFrame methods
|
||||
|
||||
Ideally these files/tests should correspond 1-to-1 with tests.series.methods
|
||||
|
||||
These may also present opportunities for sharing/de-duplicating test code.
|
||||
"""
|
@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
|
||||
from pandas import Index
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_add_prefix_suffix(float_frame):
|
||||
with_prefix = float_frame.add_prefix("foo#")
|
||||
expected = Index([f"foo#{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_suffix = float_frame.add_suffix("#foo")
|
||||
expected = Index([f"{c}#foo" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_suffix.columns, expected)
|
||||
|
||||
with_pct_prefix = float_frame.add_prefix("%")
|
||||
expected = Index([f"%{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("%")
|
||||
expected = Index([f"{c}%" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_axis(float_frame):
|
||||
# GH 47819
|
||||
with_prefix = float_frame.add_prefix("foo#", axis=0)
|
||||
expected = Index([f"foo#{c}" for c in float_frame.index])
|
||||
tm.assert_index_equal(with_prefix.index, expected)
|
||||
|
||||
with_prefix = float_frame.add_prefix("foo#", axis=1)
|
||||
expected = Index([f"foo#{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("#foo", axis=0)
|
||||
expected = Index([f"{c}#foo" for c in float_frame.index])
|
||||
tm.assert_index_equal(with_pct_suffix.index, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("#foo", axis=1)
|
||||
expected = Index([f"{c}#foo" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_invalid_axis(float_frame):
    """Both add_prefix and add_suffix must reject ``axis=2`` with a ValueError."""
    msg = "No axis named 2 for object type DataFrame"

    with pytest.raises(ValueError, match=msg):
        float_frame.add_prefix("foo#", axis=2)

    with pytest.raises(ValueError, match=msg):
        float_frame.add_suffix("foo#", axis=2)
|
@ -0,0 +1,484 @@
|
||||
from datetime import timezone
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameAlign:
|
||||
def test_align_asfreq_method_raises(self):
    """``method="asfreq"`` is not a valid fill method for DataFrame.align.

    The call is expected to emit the keyword-deprecation FutureWarning and
    raise ValueError.
    """
    frame = DataFrame({"A": [1, np.nan, 2]})
    reversed_frame = frame.iloc[::-1]
    error_msg = "Invalid fill method"
    warning_msg = "The 'method', 'limit', and 'fill_axis' keywords"

    with pytest.raises(ValueError, match=error_msg):
        with tm.assert_produces_warning(FutureWarning, match=warning_msg):
            frame.align(reversed_frame, method="asfreq")
|
||||
|
||||
def test_frame_align_aware(self):
|
||||
idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern")
|
||||
idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern")
|
||||
df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1)
|
||||
df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2)
|
||||
new1, new2 = df1.align(df2)
|
||||
assert df1.index.tz == new1.index.tz
|
||||
assert df2.index.tz == new2.index.tz
|
||||
|
||||
# different timezones convert to UTC
|
||||
|
||||
# frame with frame
|
||||
df1_central = df1.tz_convert("US/Central")
|
||||
new1, new2 = df1.align(df1_central)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
# frame with Series
|
||||
new1, new2 = df1.align(df1_central[0], axis=0)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
df1[0].align(df1_central, axis=0)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
def test_align_float(self, float_frame, using_copy_on_write):
    """Exercise DataFrame.align on a float frame along both axes.

    Covers manager identity with/without ``copy=False``, index and column
    alignment, ``fill_value`` handling, the deprecated fill keywords and the
    error raised for an invalid axis.
    """
    af, bf = float_frame.align(float_frame)
    assert af._mgr is not float_frame._mgr

    af, bf = float_frame.align(float_frame, copy=False)
    if not using_copy_on_write:
        assert af._mgr is float_frame._mgr
    else:
        assert af._mgr is not float_frame._mgr

    # axis = 0
    other = float_frame.iloc[:-5, :3]
    af, bf = float_frame.align(other, axis=0, fill_value=-1)

    tm.assert_index_equal(bf.columns, other.columns)

    # test fill value: rows that exist only in float_frame must appear in the
    # aligned ``other`` filled with -1.  (The previous check compared against
    # a left join of the index, whose difference is always empty, so it could
    # never fail.)
    missing_rows = float_frame.index.difference(other.index)
    assert len(missing_rows) > 0
    assert (bf.loc[missing_rows].values == -1).all()

    af, bf = float_frame.align(other, join="right", axis=0)
    tm.assert_index_equal(bf.columns, other.columns)
    tm.assert_index_equal(bf.index, other.index)
    tm.assert_index_equal(af.index, other.index)

    # axis = 1
    other = float_frame.iloc[:-5, :3].copy()
    af, bf = float_frame.align(other, axis=1)
    tm.assert_index_equal(bf.columns, float_frame.columns)
    tm.assert_index_equal(bf.index, other.index)

    # test fill value: no fill_value was passed, so columns that exist only
    # in float_frame are introduced into the aligned ``other`` as all-NaN.
    missing_cols = float_frame.columns.difference(other.columns)
    assert len(missing_cols) > 0
    assert bf[missing_cols].isna().all().all()

    af, bf = float_frame.align(other, join="inner", axis=1)
    tm.assert_index_equal(bf.columns, other.columns)

    msg = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
    tm.assert_index_equal(bf.columns, other.columns)

    # Aligning against a single column (a Series) along axis=1 with an inner
    # join is expected to leave an empty index on the Series side, whichever
    # fill_value is used.
    for fill_value in (None, 0):
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = float_frame.align(
                other.iloc[:, 0],
                join="inner",
                axis=1,
                method=None,
                fill_value=fill_value,
            )
        tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

    # Try to align DataFrame to Series along bad axis
    msg = "No axis named 2 for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        float_frame.align(af.iloc[0, :3], join="inner", axis=2)
|
||||
|
||||
def test_align_frame_with_series(self, float_frame):
    """Align a frame with a Series, with and without the deprecated broadcast."""
    # align dataframe to series with broadcast or not
    index = float_frame.index
    ser = Series(range(len(index)), index=index)

    left, right = float_frame.align(ser, axis=0)
    tm.assert_index_equal(left.index, index)
    tm.assert_index_equal(right.index, index)
    assert isinstance(right, Series)

    msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        left, right = float_frame.align(ser, broadcast_axis=1)
    tm.assert_index_equal(left.index, index)

    # With broadcasting the Series is repeated once per column.
    expected = DataFrame(
        {col: ser for col in float_frame.columns},
        index=float_frame.index,
        columns=float_frame.columns,
    )
    tm.assert_frame_equal(right, expected)
|
||||
|
||||
def test_align_series_condition(self):
|
||||
# see gh-9558
|
||||
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
result = df[df["a"] == 2]
|
||||
expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.where(df["a"] == 2, 0)
|
||||
expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_align_int(self, int_frame):
    """Inner-align an integer frame on columns using the deprecated ``method``."""
    # test other non-float types
    other = DataFrame(index=range(5), columns=["A", "B", "C"])
    deprecation = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )

    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        _, aligned_other = int_frame.align(
            other, join="inner", axis=1, method="pad"
        )

    tm.assert_index_equal(aligned_other.columns, other.columns)
|
||||
|
||||
def test_align_mixed_type(self, float_string_frame):
    """Self-alignment of a mixed float/string frame keeps its columns."""
    deprecation = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )

    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        _, aligned_other = float_string_frame.align(
            float_string_frame, join="inner", axis=1, method="pad"
        )

    tm.assert_index_equal(aligned_other.columns, float_string_frame.columns)
|
||||
|
||||
def test_align_mixed_float(self, mixed_float_frame):
    """Inner-align a mixed-float frame with a Series along the columns."""
    # mixed floats/ints
    other = DataFrame(index=range(5), columns=["A", "B", "C"])
    first_column = other.iloc[:, 0]
    deprecation = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )

    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        _, aligned_ser = mixed_float_frame.align(
            first_column, join="inner", axis=1, method=None, fill_value=0
        )

    # The Series side is expected to come back with an empty index.
    tm.assert_index_equal(aligned_ser.index, Index([]))
|
||||
|
||||
def test_align_mixed_int(self, mixed_int_frame):
    """Inner-align a mixed-int frame with a Series along the columns."""
    other = DataFrame(index=range(5), columns=["A", "B", "C"])
    first_column = other.iloc[:, 0]
    deprecation = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )

    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        _, aligned_ser = mixed_int_frame.align(
            first_column, join="inner", axis=1, method=None, fill_value=0
        )

    # The Series side is expected to come back with an empty index.
    tm.assert_index_equal(aligned_ser.index, Index([]))
|
||||
|
||||
@pytest.mark.parametrize(
    "l_ordered,r_ordered,expected",
    [
        [True, True, pd.CategoricalIndex],
        [True, False, Index],
        [False, True, Index],
        [False, False, pd.CategoricalIndex],
    ],
)
def test_align_categorical(self, l_ordered, r_ordered, expected):
    """Aligning on categorical indexes yields the parametrized index type.

    ``expected`` is CategoricalIndex when both sides share the same
    orderedness and plain Index when they differ.
    """
    # GH-28397

    def _indexed_by_category(letters, ordered):
        # Frame with an int64 "A" column, indexed by a categorical "B".
        cat_dtype = pd.CategoricalDtype(list("cab"), ordered=ordered)
        frame = DataFrame(
            {
                "A": np.arange(len(letters), dtype="int64"),
                "B": Series(list(letters)).astype(cat_dtype),
            }
        )
        return frame.set_index("B")

    df_1 = _indexed_by_category("aabbca", l_ordered)
    df_2 = _indexed_by_category("babca", r_ordered)

    aligned_1, aligned_2 = df_1.align(df_2)

    assert isinstance(aligned_1.index, expected)
    assert isinstance(aligned_2.index, expected)
    tm.assert_index_equal(aligned_1.index, aligned_2.index)
|
||||
|
||||
def test_align_multiindex(self):
|
||||
# GH#10665
|
||||
# same test cases as test_align_multiindex in test_series.py
|
||||
|
||||
midx = pd.MultiIndex.from_product(
|
||||
[range(2), range(3), range(2)], names=("a", "b", "c")
|
||||
)
|
||||
idx = Index(range(2), name="b")
|
||||
df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
|
||||
df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)
|
||||
|
||||
# these must be the same results (but flipped)
|
||||
res1l, res1r = df1.align(df2, join="left")
|
||||
res2l, res2r = df2.align(df1, join="right")
|
||||
|
||||
expl = df1
|
||||
tm.assert_frame_equal(expl, res1l)
|
||||
tm.assert_frame_equal(expl, res2r)
|
||||
expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
|
||||
tm.assert_frame_equal(expr, res1r)
|
||||
tm.assert_frame_equal(expr, res2l)
|
||||
|
||||
res1l, res1r = df1.align(df2, join="right")
|
||||
res2l, res2r = df2.align(df1, join="left")
|
||||
|
||||
exp_idx = pd.MultiIndex.from_product(
|
||||
[range(2), range(2), range(2)], names=("a", "b", "c")
|
||||
)
|
||||
expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
|
||||
tm.assert_frame_equal(expl, res1l)
|
||||
tm.assert_frame_equal(expl, res2r)
|
||||
expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
|
||||
tm.assert_frame_equal(expr, res1r)
|
||||
tm.assert_frame_equal(expr, res2l)
|
||||
|
||||
def test_align_series_combinations(self):
|
||||
df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
|
||||
s = Series([1, 2, 4], index=list("ABD"), name="x")
|
||||
|
||||
# frame + series
|
||||
res1, res2 = df.align(s, axis=0)
|
||||
exp1 = DataFrame(
|
||||
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
|
||||
index=list("ABCDE"),
|
||||
)
|
||||
exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
|
||||
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
tm.assert_series_equal(res2, exp2)
|
||||
|
||||
# series + frame
|
||||
res1, res2 = s.align(df)
|
||||
tm.assert_series_equal(res1, exp2)
|
||||
tm.assert_frame_equal(res2, exp1)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series([1, 2], index=bar_index, name="foo_series")
|
||||
df = DataFrame(
|
||||
{"col": np.arange(6)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series(
|
||||
[1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
|
||||
)
|
||||
df = DataFrame(
|
||||
{"col": np.arange(6)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2, 3, 4], name="bar")
|
||||
|
||||
series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
|
||||
df = DataFrame(
|
||||
{"col": np.arange(12)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series(
|
||||
[1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
|
||||
)
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 3, 4], name="bar")
|
||||
|
||||
series = Series(
|
||||
[1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
|
||||
)
|
||||
df = DataFrame(
|
||||
{"col": np.arange(9)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series([1, 2], index=bar_index, name="foo_series")
|
||||
df = DataFrame(
|
||||
np.arange(18).reshape(6, 3),
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
df.columns = ["cfoo", "cbar", "cfoo"]
|
||||
|
||||
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_left, result_right = df.align(series, axis=0)
|
||||
|
||||
tm.assert_series_equal(result_right, expected)
|
||||
tm.assert_index_equal(result_left.columns, df.columns)
|
||||
|
||||
def test_missing_axis_specification_exception(self):
    """Aligning a frame with a Series without ``axis`` raises ValueError."""
    frame = DataFrame(np.arange(50).reshape((10, 5)))
    ser = Series(np.arange(5))

    with pytest.raises(ValueError, match=r"axis=0 or 1"):
        frame.align(ser)
|
||||
|
||||
@pytest.mark.parametrize("method", ["pad", "bfill"])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("fill_axis", [0, 1])
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize(
    "left_slice",
    [
        [slice(4), slice(10)],
        [slice(0), slice(0)],
    ],
)
@pytest.mark.parametrize(
    "right_slice",
    [
        [slice(2, None), slice(6, None)],
        [slice(0), slice(0)],
    ],
)
@pytest.mark.parametrize("limit", [1, None])
def test_align_fill_method(
    self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
):
    """Compare align with the deprecated fill keywords against reindex + fillna.

    The expected frames are built by reindexing both operands onto the joined
    index and/or columns (depending on ``axis``) and then filling with the
    same ``method``, ``limit`` and ``fill_axis``.
    """
    left_rows, left_cols = left_slice
    right_rows, right_cols = right_slice
    left = float_frame.iloc[left_rows, left_cols]
    right = float_frame.iloc[right_rows, right_cols]

    align_msg = (
        "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
        "are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=align_msg):
        aa, ab = left.align(
            right,
            axis=axis,
            join=how,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
        )

    # Build the expectation by hand: join, reindex, then fill.
    ea, eb = left, right
    if axis in (None, 0):
        join_index = left.index.join(right.index, how=how)
        ea = ea.reindex(index=join_index)
        eb = eb.reindex(index=join_index)

    if axis in (None, 1):
        join_columns = left.columns.join(right.columns, how=how)
        ea = ea.reindex(columns=join_columns)
        eb = eb.reindex(columns=join_columns)

    fillna_msg = "DataFrame.fillna with 'method' is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=fillna_msg):
        ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
        eb = eb.fillna(axis=fill_axis, method=method, limit=limit)

    tm.assert_frame_equal(aa, ea)
    tm.assert_frame_equal(ab, eb)
|
||||
|
||||
def test_align_series_check_copy(self):
|
||||
# GH#
|
||||
df = DataFrame({0: [1, 2]})
|
||||
ser = Series([1], name=0)
|
||||
expected = ser.copy()
|
||||
result, other = df.align(ser, axis=1)
|
||||
ser.iloc[0] = 100
|
||||
tm.assert_series_equal(other, expected)
|
||||
|
||||
def test_align_identical_different_object(self):
|
||||
# GH#51032
|
||||
df = DataFrame({"a": [1, 2]})
|
||||
ser = Series([3, 4])
|
||||
result, result2 = df.align(ser, axis=0)
|
||||
tm.assert_frame_equal(result, df)
|
||||
tm.assert_series_equal(result2, ser)
|
||||
assert df is not result
|
||||
assert ser is not result2
|
||||
|
||||
def test_align_identical_different_object_columns(self):
|
||||
# GH#51032
|
||||
df = DataFrame({"a": [1, 2]})
|
||||
ser = Series([1], index=["a"])
|
||||
result, result2 = df.align(ser, axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
tm.assert_series_equal(result2, ser)
|
||||
assert df is not result
|
||||
assert ser is not result2
|
@ -0,0 +1,263 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs.offsets import MonthEnd
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.tseries import offsets
|
||||
|
||||
|
||||
class TestAsFreq:
|
||||
@pytest.fixture(params=["s", "ms", "us", "ns"])
def unit(self, request):
    """Return the current parametrized unit string ("s", "ms", "us" or "ns")."""
    return request.param
|
||||
|
||||
def test_asfreq2(self, frame_or_series):
|
||||
ts = frame_or_series(
|
||||
[0.0, 1.0, 2.0],
|
||||
index=DatetimeIndex(
|
||||
[
|
||||
datetime(2009, 10, 30),
|
||||
datetime(2009, 11, 30),
|
||||
datetime(2009, 12, 31),
|
||||
],
|
||||
dtype="M8[ns]",
|
||||
freq="BME",
|
||||
),
|
||||
)
|
||||
|
||||
daily_ts = ts.asfreq("B")
|
||||
monthly_ts = daily_ts.asfreq("BME")
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
daily_ts = ts.asfreq("B", method="pad")
|
||||
monthly_ts = daily_ts.asfreq("BME")
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
daily_ts = ts.asfreq(offsets.BDay())
|
||||
monthly_ts = daily_ts.asfreq(offsets.BMonthEnd())
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
result = ts[:0].asfreq("ME")
|
||||
assert len(result) == 0
|
||||
assert result is not ts
|
||||
|
||||
if frame_or_series is Series:
|
||||
daily_ts = ts.asfreq("D", fill_value=-1)
|
||||
result = daily_ts.value_counts().sort_index()
|
||||
expected = Series(
|
||||
[60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count"
|
||||
).sort_index()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_asfreq_datetimeindex_empty(self, frame_or_series):
|
||||
# GH#14320
|
||||
index = DatetimeIndex(["2016-09-29 11:00"])
|
||||
expected = frame_or_series(index=index, dtype=object).asfreq("h")
|
||||
result = frame_or_series([3], index=index.copy()).asfreq("h")
|
||||
tm.assert_index_equal(expected.index, result.index)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_tz_aware_asfreq_smoke(self, tz, frame_or_series):
    """Smoke test: asfreq on a tz-aware daily index must not raise."""
    daily = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz)
    values = np.random.default_rng(2).standard_normal(len(daily))
    obj = frame_or_series(values, index=daily)

    # it works!
    obj.asfreq("min")
|
||||
|
||||
def test_asfreq_normalize(self, frame_or_series):
|
||||
rng = date_range("1/1/2000 09:30", periods=20)
|
||||
norm = date_range("1/1/2000", periods=20)
|
||||
|
||||
vals = np.random.default_rng(2).standard_normal((20, 3))
|
||||
|
||||
obj = DataFrame(vals, index=rng)
|
||||
expected = DataFrame(vals, index=norm)
|
||||
if frame_or_series is Series:
|
||||
obj = obj[0]
|
||||
expected = expected[0]
|
||||
|
||||
result = obj.asfreq("D", normalize=True)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_keep_index_name(self, frame_or_series):
|
||||
# GH#9854
|
||||
index_name = "bar"
|
||||
index = date_range("20130101", periods=20, name=index_name)
|
||||
obj = DataFrame(list(range(20)), columns=["foo"], index=index)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
assert index_name == obj.index.name
|
||||
assert index_name == obj.asfreq("10D").index.name
|
||||
|
||||
def test_asfreq_ts(self, frame_or_series):
|
||||
index = period_range(freq="Y", start="1/1/2001", end="12/31/2010")
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 3)), index=index
|
||||
)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
result = obj.asfreq("D", how="end")
|
||||
exp_index = index.asfreq("D", how="end")
|
||||
assert len(result) == len(obj)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = obj.asfreq("D", how="start")
|
||||
exp_index = index.asfreq("D", how="start")
|
||||
assert len(result) == len(obj)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
def test_asfreq_resample_set_correct_freq(self, frame_or_series):
|
||||
# GH#5613
|
||||
# we test if .asfreq() and .resample() set the correct value for .freq
|
||||
dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"])
|
||||
obj = DataFrame({"col": [1, 2, 3]}, index=dti)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
# testing the settings before calling .asfreq() and .resample()
|
||||
assert obj.index.freq is None
|
||||
assert obj.index.inferred_freq == "D"
|
||||
|
||||
# does .asfreq() set .freq correctly?
|
||||
assert obj.asfreq("D").index.freq == "D"
|
||||
|
||||
# does .resample() set .freq correctly?
|
||||
assert obj.resample("D").asfreq().index.freq == "D"
|
||||
|
||||
def test_asfreq_empty(self, datetime_frame):
|
||||
# test does not blow up on length-0 DataFrame
|
||||
zero_length = datetime_frame.reindex([])
|
||||
result = zero_length.asfreq("BME")
|
||||
assert result is not zero_length
|
||||
|
||||
def test_asfreq(self, datetime_frame):
|
||||
offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd())
|
||||
rule_monthly = datetime_frame.asfreq("BME")
|
||||
|
||||
tm.assert_frame_equal(offset_monthly, rule_monthly)
|
||||
|
||||
rule_monthly.asfreq("B", method="pad")
|
||||
# TODO: actually check that this worked.
|
||||
|
||||
# don't forget!
|
||||
rule_monthly.asfreq("B", method="pad")
|
||||
|
||||
def test_asfreq_datetimeindex(self):
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 3]},
|
||||
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
|
||||
)
|
||||
df = df.asfreq("B")
|
||||
assert isinstance(df.index, DatetimeIndex)
|
||||
|
||||
ts = df["A"].asfreq("B")
|
||||
assert isinstance(ts.index, DatetimeIndex)
|
||||
|
||||
def test_asfreq_fillvalue(self):
|
||||
# test for fill value during upsampling, related to issue 3715
|
||||
|
||||
# setup
|
||||
rng = date_range("1/1/2016", periods=10, freq="2s")
|
||||
# Explicit cast to 'float' to avoid implicit cast when setting None
|
||||
ts = Series(np.arange(len(rng)), index=rng, dtype="float")
|
||||
df = DataFrame({"one": ts})
|
||||
|
||||
# insert pre-existing missing value
|
||||
df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
|
||||
actual_df = df.asfreq(freq="1s", fill_value=9.0)
|
||||
expected_df = df.asfreq(freq="1s").fillna(9.0)
|
||||
expected_df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
tm.assert_frame_equal(expected_df, actual_df)
|
||||
|
||||
expected_series = ts.asfreq(freq="1s").fillna(9.0)
|
||||
actual_series = ts.asfreq(freq="1s", fill_value=9.0)
|
||||
tm.assert_series_equal(expected_series, actual_series)
|
||||
|
||||
def test_asfreq_with_date_object_index(self, frame_or_series):
|
||||
rng = date_range("1/1/2000", periods=20)
|
||||
ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng)
|
||||
|
||||
ts2 = ts.copy()
|
||||
ts2.index = [x.date() for x in ts2.index]
|
||||
|
||||
result = ts2.asfreq("4h", method="ffill")
|
||||
expected = ts.asfreq("4h", method="ffill")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_with_unsorted_index(self, frame_or_series):
|
||||
# GH#39805
|
||||
# Test that rows are not dropped when the datetime index is out of order
|
||||
index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"])
|
||||
result = frame_or_series(range(4), index=index)
|
||||
|
||||
expected = result.reindex(sorted(index))
|
||||
expected.index = expected.index._with_freq("infer")
|
||||
|
||||
result = result.asfreq("D")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_after_normalize(self, unit):
|
||||
# https://github.com/pandas-dev/pandas/issues/50727
|
||||
result = DatetimeIndex(
|
||||
date_range("2000", periods=2).as_unit(unit).normalize(), freq="D"
|
||||
)
|
||||
expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit)
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "freq, freq_half",
    [
        ("2ME", "ME"),
        (MonthEnd(2), MonthEnd(1)),
    ],
)
def test_asfreq_2ME(self, freq, freq_half):
    """Converting monthly data to a two-month frequency keeps every other row."""
    monthly_index = date_range("1/1/2000", periods=6, freq=freq_half)
    monthly = DataFrame(
        {"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=monthly_index)}
    )
    expected = monthly.asfreq(freq=freq)

    bimonthly_index = date_range("1/1/2000", periods=3, freq=freq)
    result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=bimonthly_index)})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "freq, freq_depr",
    [
        ("2ME", "2M"),
        ("2QE", "2Q"),
        ("2QE-SEP", "2Q-SEP"),
        ("1BQE", "1BQ"),
        ("2BQE-SEP", "2BQ-SEP"),
        ("1YE", "1Y"),
        ("2YE-MAR", "2Y-MAR"),
        ("1YE", "1A"),
        ("2YE-MAR", "2A-MAR"),
        ("2BYE-MAR", "2BA-MAR"),
    ],
)
def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr):
    """Deprecated frequency aliases warn and behave like their replacements.

    ``freq`` is the current alias and ``freq_depr`` the deprecated spelling;
    the leading character of each (the multiple) is stripped when building
    the expected warning text.
    """
    # GH#9586, #55978
    # Parenthesized so both halves form one string; without the parentheses
    # the second f-string is a separate, discarded expression and only the
    # first half of the message is matched.
    depr_msg = (
        f"'{freq_depr[1:]}' is deprecated and will be removed "
        f"in a future version, please use '{freq[1:]}' instead."
    )

    index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}")
    df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
    expected = df.asfreq(freq=freq)
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        result = df.asfreq(freq=freq_depr)
    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,198 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import IncompatibleFrequency
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Period,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def date_range_frame():
    """
    Fixture for DataFrame of ints with date_range index

    Columns are ['A', 'B'].
    """
    n_rows = 50
    index = date_range("1/1/1990", periods=n_rows, freq="53s")
    data = {"A": np.arange(n_rows), "B": np.arange(n_rows)}
    return DataFrame(data, index=index)
|
||||
|
||||
|
||||
class TestFrameAsof:
    """Tests for ``DataFrame.asof`` (and one ``Series.asof`` dtype check)."""

    def test_basic(self, date_range_frame):
        """Query points inside a NaN gap resolve to the last complete row."""
        n_rows = 50
        # Cast first so that writing np.nan below does not upcast implicitly.
        frame = date_range_frame.astype({"A": "float"})
        frame.loc[frame.index[15:30], "A"] = np.nan
        query = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        from_index = frame.asof(query)
        assert from_index.notna().all(axis=1).all()

        # Row 14 is the last one before the NaN block, row 30 the first after.
        lower, upper = frame.index[14], frame.index[30]

        # The same query supplied as a plain list of timestamps.
        from_list = frame.asof(list(query))
        assert from_list.notna().all(axis=1).all()

        in_gap = (from_list.index >= lower) & (from_list.index < upper)
        assert (from_list[in_gap] == 14).all(axis=1).all()

    def test_subset(self, date_range_frame):
        """``subset`` restricts which columns are inspected for NaN."""
        n_rows = 10
        # Cast first so that writing np.nan below does not upcast implicitly.
        frame = date_range_frame.iloc[:n_rows].copy().astype({"A": "float"})
        frame.loc[frame.index[4:8], "A"] = np.nan
        query = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        # Checking only "A", or both columns, matches the default behaviour.
        for columns in ("A", ["A", "B"]):
            tm.assert_frame_equal(
                frame.asof(query, subset=columns), frame.asof(query)
            )

        # "B" holds no NaN, so every row is eligible: compare with a forward fill.
        result = frame.asof(query, subset="B")
        expected = frame.resample("25s", closed="right").ffill().reindex(query)
        expected.iloc[20:] = 9
        # no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent)
        expected["B"] = expected["B"].astype(frame["B"].dtype)
        tm.assert_frame_equal(result, expected)

    def test_missing(self, date_range_frame):
        """A ``where`` earlier than the first index entry yields all-NaN output."""
        # GH 15118
        n_rows = 10
        # float64 from the start, so the NaN produced by asof causes no upcast
        frame = date_range_frame.iloc[:n_rows].copy().astype("float64")

        expected_row = Series(
            index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64
        )
        tm.assert_series_equal(frame.asof("1989-12-31"), expected_row)

        expected_frame = DataFrame(
            index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
        )
        tm.assert_frame_equal(
            frame.asof(to_datetime(["1989-12-31"])), expected_frame
        )

        # With a PeriodIndex the resulting name must be a Period, not the
        # period's ordinal.
        by_period = frame.to_period("D")
        assert isinstance(by_period.asof("1989-12-31").name, Period)

    def test_asof_all_nans(self, frame_or_series):
        """asof on an all-NaN DataFrame/Series returns NaN."""
        # GH 15713
        observed = frame_or_series([np.nan]).asof([0])
        tm.assert_equal(observed, frame_or_series([np.nan]))

    def test_all_nans(self, date_range_frame):
        """All-NaN frames give all-NaN results for list-like and scalar input."""
        # GH 15713
        n_queries = 150
        index = date_range_frame.index

        # non-default index with many query points: one column, then several
        for columns in (["A"], ["A", "B", "C"]):
            query = date_range("1/1/1990", periods=n_queries, freq="25s")
            result = DataFrame(np.nan, index=index, columns=columns).asof(query)
            expected = DataFrame(np.nan, index=query, columns=columns)
            tm.assert_frame_equal(result, expected)

        # one-element list input keeps the DataFrame shape
        result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
        expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
        tm.assert_frame_equal(result, expected)

        # scalar input collapses to a Series named after the query value
        result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
        expected = Series(np.nan, index=["A", "B"], name=3)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "stamp,expected",
        [
            (
                Timestamp("2018-01-01 23:22:43.325+00:00"),
                Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
            ),
            (
                Timestamp("2018-01-01 22:33:20.682+01:00"),
                Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
            ),
        ],
    )
    def test_time_zone_aware_index(self, stamp, expected):
        """Lookup stamps with a UTC offset are compared against a UTC index."""
        # GH21194
        utc_index = [
            Timestamp("2018-01-01 21:00:05.001+00:00"),
            Timestamp("2018-01-01 22:35:10.550+00:00"),
        ]
        frame = DataFrame(data=[1, 2], index=utc_index)

        tm.assert_series_equal(frame.asof(stamp), expected)

    def test_is_copy(self, date_range_frame):
        """Adding a column to the asof result must not emit any warning."""
        # GH-27357, GH-30784: the result should be a real copy that does not
        # track the parent frame / trigger SettingWithCopy warnings
        n_rows = 50
        frame = date_range_frame.astype({"A": "float"})
        frame.loc[frame.index[15:30], "A"] = np.nan
        query = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        result = frame.asof(query)

        with tm.assert_produces_warning(None):
            result["C"] = 1

    def test_asof_periodindex_mismatched_freq(self):
        """Querying an hourly PeriodIndex with daily periods raises."""
        n_rows = 50
        hourly = period_range("1/1/1990", periods=n_rows, freq="h")
        frame = DataFrame(
            np.random.default_rng(2).standard_normal(n_rows), index=hourly
        )

        # Mismatched freq
        msg = "Input has different freq"
        with pytest.raises(IncompatibleFrequency, match=msg):
            frame.asof(hourly.asfreq("D"))

    def test_asof_preserves_bool_dtype(self):
        """Series.asof keeps bool values as bool."""
        # GH#16063 was casting bools to floats
        months = date_range("2017-01-01", freq="MS", periods=4)
        flags = Series([True, False, True], index=months[:-1])

        last = months[-1]
        observed = flags.asof([last])

        tm.assert_series_equal(observed, Series([True], index=[last]))
|
@ -0,0 +1,84 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAssign:
|
||||
def test_assign(self):
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
original = df.copy()
|
||||
result = df.assign(C=df.B / df.A)
|
||||
expected = df.copy()
|
||||
expected["C"] = [4, 2.5, 2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# lambda syntax
|
||||
result = df.assign(C=lambda x: x.B / x.A)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# original is unmodified
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# Non-Series array-like
|
||||
result = df.assign(C=[4, 2.5, 2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# original is unmodified
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
result = df.assign(B=df.B / df.A)
|
||||
expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# overwrite
|
||||
result = df.assign(A=df.A + df.B)
|
||||
expected = df.copy()
|
||||
expected["A"] = [5, 7, 9]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# lambda
|
||||
result = df.assign(A=lambda x: x.A + x.B)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_multiple(self):
|
||||
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
|
||||
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
|
||||
expected = DataFrame(
|
||||
[[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_order(self):
|
||||
# GH 9818
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
|
||||
result = df.assign(D=df.A + df.B, C=df.A - df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.assign(C=df.A - df.B, D=df.A + df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_bad(self):
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
|
||||
# non-keyword argument
|
||||
msg = r"assign\(\) takes 1 positional argument but 2 were given"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.assign(lambda x: x.A)
|
||||
msg = "'DataFrame' object has no attribute 'C'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
df.assign(C=df.A, D=df.A + df.C)
|
||||
|
||||
def test_assign_dependent(self):
|
||||
df = DataFrame({"A": [1, 2], "B": [3, 4]})
|
||||
|
||||
result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,911 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
DatetimeTZDtype,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalDtype,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
concat,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def _check_cast(df, v):
|
||||
"""
|
||||
Check if all dtypes of df are equal to v
|
||||
"""
|
||||
assert all(s.dtype.name == v for _, s in df.items())
|
||||
|
||||
|
||||
class TestAstype:
|
||||
def test_astype_float(self, float_frame):
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
casted = float_frame.astype(np.int32)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(np.int32),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
float_frame["foo"] = "5"
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_mixed_float(self, mixed_float_frame):
    """Casting columns "A" and "B" leaves both with the requested float dtype."""
    # mixed casting
    for target in ("float32", "float16"):
        narrowed = mixed_float_frame.reindex(columns=["A", "B"]).astype(target)
        _check_cast(narrowed, target)
|
||||
|
||||
def test_astype_mixed_type(self):
    """Casting a frame of mixed numeric dtypes yields one uniform dtype."""
    # mixed casting
    frame = DataFrame(
        {
            "a": 1.0,
            "b": 2,
            "c": "foo",
            "float32": np.array([1.0] * 10, dtype="float32"),
            "int32": np.array([1] * 10, dtype="int32"),
        },
        index=np.arange(10),
    )
    numeric = frame._get_numeric_data().copy()
    numeric["little_float"] = np.array(12345.0, dtype="float16")
    numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

    for target in ("float64", "int64"):
        _check_cast(numeric.astype(target), target)

    # float16 is only requested for the column that already holds float16
    only_little = numeric.reindex(columns=["little_float"])
    _check_cast(only_little.astype("float16"), "float16")

    for target in ("float32", "int32"):
        _check_cast(numeric.astype(target), target)

    # to object
    _check_cast(numeric.astype("O"), "object")
|
||||
|
||||
def test_astype_with_exclude_string(self, float_frame):
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(int)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(int, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(np.int32)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(np.int32, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_with_view_float(self, float_frame):
|
||||
# this is the only real reason to do it this way
|
||||
tf = np.round(float_frame).astype(np.int32)
|
||||
tf.astype(np.float32, copy=False)
|
||||
|
||||
# TODO(wesm): verification?
|
||||
tf = float_frame.astype(np.float64)
|
||||
tf.astype(np.int64, copy=False)
|
||||
|
||||
def test_astype_with_view_mixed_float(self, mixed_float_frame):
|
||||
tf = mixed_float_frame.reindex(columns=["A", "B", "C"])
|
||||
|
||||
tf.astype(np.int64)
|
||||
tf.astype(np.float32)
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
|
||||
@pytest.mark.parametrize("val", [np.nan, np.inf])
|
||||
def test_astype_cast_nan_inf_int(self, val, dtype):
    """Casting NaN or inf to an integer dtype raises ValueError."""
    # see GH#14265
    frame = DataFrame([val])
    msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"

    with pytest.raises(ValueError, match=msg):
        frame.astype(dtype)
|
||||
|
||||
def test_astype_str(self):
|
||||
# see GH#9757
|
||||
a = Series(date_range("2010-01-04", periods=5))
|
||||
b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
|
||||
c = Series([Timedelta(x, unit="d") for x in range(5)])
|
||||
d = Series(range(5))
|
||||
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
|
||||
df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
|
||||
|
||||
# Datetime-like
|
||||
result = df.astype(str)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": list(map(str, (Timestamp(x)._date_repr for x in a._values))),
|
||||
"b": list(map(str, map(Timestamp, b._values))),
|
||||
"c": [Timedelta(x)._repr_base() for x in c._values],
|
||||
"d": list(map(str, d._values)),
|
||||
"e": list(map(str, e._values)),
|
||||
},
|
||||
dtype="object",
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_str_float(self):
|
||||
# see GH#11302
|
||||
result = DataFrame([np.nan]).astype(str)
|
||||
expected = DataFrame(["nan"], dtype="object")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = DataFrame([1.12345678901234567890]).astype(str)
|
||||
|
||||
val = "1.1234567890123457"
|
||||
expected = DataFrame([val], dtype="object")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_class", [dict, Series])
|
||||
def test_astype_dict_like(self, dtype_class):
    """Per-column casting driven by a mapping built with ``dtype_class``."""
    # GH7271 & GH16717
    stamps = Series(date_range("2010-01-04", periods=5))
    ints = Series(range(5))
    floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])
    texts = Series(["1.0", "2", "3.14", "4", "5.4"])
    df = DataFrame({"a": stamps, "b": ints, "c": floats, "d": texts})
    original = df.copy(deep=True)

    # change type of a subset of columns
    some_columns = dtype_class({"b": "str", "d": "float32"})
    expected = DataFrame(
        {
            "a": stamps,
            "b": Series(["0", "1", "2", "3", "4"], dtype="object"),
            "c": floats,
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        }
    )
    tm.assert_frame_equal(df.astype(some_columns), expected)
    tm.assert_frame_equal(df, original)

    float_columns = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
    expected = DataFrame(
        {
            "a": stamps,
            "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        }
    )
    tm.assert_frame_equal(df.astype(float_columns), expected)
    tm.assert_frame_equal(df, original)

    # change all columns: same as a frame-wide cast
    every_column = dtype_class({"a": str, "b": str, "c": str, "d": str})
    tm.assert_frame_equal(df.astype(every_column), df.astype(str))
    tm.assert_frame_equal(df, original)

    # keys that are not column labels must raise
    msg_frame = (
        "Only a column name can be used for the key in a dtype mappings argument. "
        "'{}' not found in columns."
    )
    bad_mappings = [
        (dtype_class({"b": str, 2: str}), 2),
        (dtype_class({"e": str}), "e"),
    ]
    for mapping, missing in bad_mappings:
        with pytest.raises(KeyError, match=msg_frame.format(missing)):
            df.astype(mapping)
    tm.assert_frame_equal(df, original)

    # passing the dtypes the columns already have gives an equal frame
    unchanged = dtype_class({col: df[col].dtype for col in df.columns})
    tm.assert_frame_equal(df, df.astype(unchanged))
    tm.assert_frame_equal(df, original)

    # GH#16717
    # an empty mapping also gives an equal frame
    if dtype_class is dict:
        nothing = dtype_class({})
    else:
        nothing = dtype_class({}, dtype=object)
    tm.assert_frame_equal(df, df.astype(nothing))
    tm.assert_frame_equal(df, original)
|
||||
|
||||
def test_astype_duplicate_col(self):
|
||||
a1 = Series([1, 2, 3, 4, 5], name="a")
|
||||
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
|
||||
a2 = Series([0, 1, 2, 3, 4], name="a")
|
||||
df = concat([a1, b, a2], axis=1)
|
||||
|
||||
result = df.astype(str)
|
||||
a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
|
||||
b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
|
||||
a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
|
||||
expected = concat([a1_str, b_str, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.astype({"a": "str"})
|
||||
expected = concat([a1_str, b, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_duplicate_col_series_arg(self):
|
||||
# GH#44417
|
||||
vals = np.random.default_rng(2).standard_normal((3, 4))
|
||||
df = DataFrame(vals, columns=["A", "B", "C", "A"])
|
||||
dtypes = df.dtypes
|
||||
dtypes.iloc[0] = str
|
||||
dtypes.iloc[2] = "Float64"
|
||||
|
||||
result = df.astype(dtypes)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series(vals[:, 0].astype(str), dtype=object),
|
||||
1: vals[:, 1],
|
||||
2: pd.array(vals[:, 2], dtype="Float64"),
|
||||
3: vals[:, 3],
|
||||
}
|
||||
)
|
||||
expected.columns = df.columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"category",
|
||||
CategoricalDtype(),
|
||||
CategoricalDtype(ordered=True),
|
||||
CategoricalDtype(ordered=False),
|
||||
CategoricalDtype(categories=list("abcdef")),
|
||||
CategoricalDtype(categories=list("edba"), ordered=False),
|
||||
CategoricalDtype(categories=list("edcb"), ordered=True),
|
||||
],
|
||||
ids=repr,
|
||||
)
|
||||
def test_astype_categorical(self, dtype):
|
||||
# GH#18099
|
||||
d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
|
||||
df = DataFrame(d)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame({k: Categorical(v, dtype=dtype) for k, v in d.items()})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
|
||||
def test_astype_categoricaldtype_class_raises(self, cls):
    """Passing a dtype class rather than an instance raises TypeError."""
    frame = DataFrame({"A": ["a", "a", "b", "c"]})
    xpr = f"Expected an instance of {cls.__name__}"

    # Same failure through the DataFrame mapping path and the Series path.
    attempts = (
        lambda: frame.astype({"A": cls}),
        lambda: frame["A"].astype(cls),
    )
    for attempt in attempts:
        with pytest.raises(TypeError, match=xpr):
            attempt()
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
|
||||
def test_astype_extension_dtypes(self, dtype):
|
||||
# GH#22578
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
|
||||
|
||||
expected1 = DataFrame(
|
||||
{
|
||||
"a": pd.array([1, 3, 5], dtype=dtype),
|
||||
"b": pd.array([2, 4, 6], dtype=dtype),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)
|
||||
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
|
||||
df["b"] = df["b"].astype(dtype)
|
||||
expected2 = DataFrame(
|
||||
{"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected2)
|
||||
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
|
||||
def test_astype_extension_dtypes_1d(self, dtype):
|
||||
# GH#22578
|
||||
df = DataFrame({"a": [1.0, 2.0, 3.0]})
|
||||
|
||||
expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
df = DataFrame({"a": [1.0, 2.0, 3.0]})
|
||||
df["a"] = df["a"].astype(dtype)
|
||||
expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
|
||||
tm.assert_frame_equal(df, expected2)
|
||||
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["category", "Int64"])
|
||||
def test_astype_extension_dtypes_duplicate_col(self, dtype):
|
||||
# GH#24704
|
||||
a1 = Series([0, np.nan, 4], name="a")
|
||||
a2 = Series([np.nan, 3, 5], name="a")
|
||||
df = concat([a1, a2], axis=1)
|
||||
|
||||
result = df.astype(dtype)
|
||||
expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
|
||||
)
|
||||
def test_astype_column_metadata(self, dtype):
|
||||
# GH#19920
|
||||
columns = Index([100, 200, 300], dtype=np.uint64, name="foo")
|
||||
df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
|
||||
df = df.astype(dtype)
|
||||
tm.assert_index_equal(df.columns, columns)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
|
||||
def test_astype_from_object_to_datetime_unit(self, unit):
    """Object -> datetime64 with one of the parametrized units raises ValueError."""
    rows = [
        ["2015-01-01", "2015-01-02", "2015-01-03"],
        ["2017-01-01", "2017-01-02", "2017-02-03"],
    ]
    frame = DataFrame(rows, dtype=object)
    target = f"M8[{unit}]"

    msg = (
        rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
        r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
        r"'datetime64\[ns\]' or DatetimeTZDtype"
    )
    with pytest.raises(ValueError, match=msg):
        frame.astype(target)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
|
||||
def test_astype_from_object_to_timedelta_unit(self, unit):
    """Object -> timedelta64 with one of the parametrized units raises ValueError."""
    rows = [
        ["1 Day", "2 Days", "3 Days"],
        ["4 Days", "5 Days", "6 Days"],
    ]
    frame = DataFrame(rows, dtype=object)
    target = f"m8[{unit}]"

    msg = (
        r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. "
        "Supported resolutions are 's', 'ms', 'us', 'ns'"
    )
    # TODO: this is ValueError while for DatetimeArray it is TypeError;
    #  get these consistent
    with pytest.raises(ValueError, match=msg):
        frame.astype(target)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_from_datetimelike_to_object(self, dtype, unit):
|
||||
# tests astype to object dtype
|
||||
# GH#19223 / GH#12425
|
||||
dtype = f"{dtype}[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(object)
|
||||
assert (result.dtypes == object).all()
|
||||
|
||||
if dtype.startswith("M8"):
|
||||
assert result.iloc[0, 0] == Timestamp(1, unit=unit)
|
||||
else:
|
||||
assert result.iloc[0, 0] == Timedelta(1, unit=unit)
|
||||
|
||||
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
|
||||
# tests all units from numeric origination
|
||||
# GH#19223 / GH#12425
|
||||
dtype = f"{dtype}[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=arr_dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_datetime_unit(self, unit):
|
||||
# tests all units from datetime origination
|
||||
# GH#19223
|
||||
dtype = f"M8[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
ser = df.iloc[:, 0]
|
||||
idx = Index(ser)
|
||||
dta = ser._values
|
||||
|
||||
if unit in ["ns", "us", "ms", "s"]:
|
||||
# GH#48928
|
||||
result = df.astype(dtype)
|
||||
else:
|
||||
# we use the nearest supported dtype (i.e. M8[s])
|
||||
msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ser.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg.replace("Array", "Index")):
|
||||
idx.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
dta.astype(dtype)
|
||||
|
||||
return
|
||||
|
||||
exp_df = DataFrame(arr.astype(dtype))
|
||||
assert (exp_df.dtypes == dtype).all()
|
||||
tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
res_ser = ser.astype(dtype)
|
||||
exp_ser = exp_df.iloc[:, 0]
|
||||
assert exp_ser.dtype == dtype
|
||||
tm.assert_series_equal(res_ser, exp_ser)
|
||||
|
||||
exp_dta = exp_ser._values
|
||||
|
||||
res_index = idx.astype(dtype)
|
||||
exp_index = Index(exp_ser)
|
||||
assert exp_index.dtype == dtype
|
||||
tm.assert_index_equal(res_index, exp_index)
|
||||
|
||||
res_dta = dta.astype(dtype)
|
||||
assert exp_dta.dtype == dtype
|
||||
tm.assert_extension_array_equal(res_dta, exp_dta)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns"])
|
||||
def test_astype_to_timedelta_unit_ns(self, unit):
|
||||
# preserver the timedelta conversion
|
||||
# GH#19223
|
||||
dtype = f"m8[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_timedelta_unit(self, unit):
|
||||
# coerce to float
|
||||
# GH#19223 until 2.0 used to coerce to float
|
||||
dtype = f"m8[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
ser = df.iloc[:, 0]
|
||||
tdi = Index(ser)
|
||||
tda = tdi._values
|
||||
|
||||
if unit in ["us", "ms", "s"]:
|
||||
assert (df.dtypes == dtype).all()
|
||||
result = df.astype(dtype)
|
||||
else:
|
||||
# We get the nearest supported unit, i.e. "s"
|
||||
assert (df.dtypes == "m8[s]").all()
|
||||
|
||||
msg = (
|
||||
rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
|
||||
"Supported resolutions are 's', 'ms', 'us', 'ns'"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ser.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
tdi.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
tda.astype(dtype)
|
||||
|
||||
return
|
||||
|
||||
result = df.astype(dtype)
|
||||
# The conversion is a no-op, so we just get a copy
|
||||
expected = df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_incorrect_datetimelike(self, unit):
    """Casting datetime64 <-> timedelta64 in either direction raises TypeError."""
    # GH#19224
    dt_dtype = f"M8[{unit}]"
    td_dtype = f"m8[{unit}]"

    # datetime64 -> timedelta64
    dt_to_td = "|".join(
        [
            # BlockManager path
            rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]",
            # ArrayManager path
            "cannot astype a datetimelike from "
            rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]",
        ]
    )
    datetimes = DataFrame(np.array([[1, 2, 3]], dtype=dt_dtype))
    with pytest.raises(TypeError, match=dt_to_td):
        datetimes.astype(td_dtype)

    # timedelta64 -> datetime64
    td_to_dt = "|".join(
        [
            # BlockManager path
            rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]",
            # ArrayManager path
            "cannot astype a timedelta from "
            rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]",
        ]
    )
    timedeltas = DataFrame(np.array([[1, 2, 3]], dtype=td_dtype))
    with pytest.raises(TypeError, match=td_to_dt):
        timedeltas.astype(dt_dtype)
|
||||
|
||||
def test_astype_arg_for_errors(self):
    """``errors`` must be 'raise' or 'ignore'; anything else raises ValueError."""
    # GH#14878
    frame = DataFrame([1, 2, 3])

    msg = (
        "Expected value of kwarg 'errors' to be one of "
        "['raise', 'ignore']. Supplied value is 'True'"
    )
    # The message contains regex metacharacters, hence re.escape.
    pattern = re.escape(msg)
    with pytest.raises(ValueError, match=pattern):
        frame.astype(np.float64, errors=True)

    # A valid value must be accepted without raising.
    frame.astype(np.int8, errors="ignore")
|
||||
|
||||
def test_astype_invalid_conversion(self):
    """The error for a failed cast names the offending column."""
    # GH#47571
    frame = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})

    msg = (
        "invalid literal for int() with base 10: 'text': "
        "Error while type casting for column 'a'"
    )
    pattern = re.escape(msg)

    with pytest.raises(ValueError, match=pattern):
        frame.astype({"a": int})
|
||||
|
||||
def test_astype_arg_for_errors_dictlist(self):
|
||||
# GH#25905
|
||||
df = DataFrame(
|
||||
[
|
||||
{"a": "1", "b": "16.5%", "c": "test"},
|
||||
{"a": "2.2", "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
{"a": 1.0, "b": "16.5%", "c": "test"},
|
||||
{"a": 2.2, "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected["c"] = expected["c"].astype("object")
|
||||
type_dict = {"a": "float64", "b": "float64", "c": "object"}
|
||||
|
||||
result = df.astype(dtype=type_dict, errors="ignore")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_dt64tz(self, timezone_frame):
    """astype(object) yields Timestamp/NaT objects; astype("datetime64[ns]") raises."""
    # astype
    naive = [
        Timestamp("2013-01-01 00:00:00"),
        Timestamp("2013-01-02 00:00:00"),
        Timestamp("2013-01-03 00:00:00"),
    ]
    eastern = [
        Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
        NaT,
        Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
    ]
    cet = [
        Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
        NaT,
        Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
    ]
    # Each list above is one column, so transpose into row-major layout.
    values = np.array([naive, eastern, cet], dtype=object).T
    expected = DataFrame(
        values,
        index=timezone_frame.index,
        columns=timezone_frame.columns,
        dtype=object,
    )
    tm.assert_frame_equal(timezone_frame.astype(object), expected)

    msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
    with pytest.raises(TypeError, match=msg):
        # dt64tz->dt64 deprecated
        timezone_frame.astype("datetime64[ns]")
|
||||
|
||||
def test_astype_dt64tz_to_str(self, timezone_frame):
|
||||
# str formatting
|
||||
result = timezone_frame.astype(str)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[
|
||||
"2013-01-01",
|
||||
"2013-01-01 00:00:00-05:00",
|
||||
"2013-01-01 00:00:00+01:00",
|
||||
],
|
||||
["2013-01-02", "NaT", "NaT"],
|
||||
[
|
||||
"2013-01-03",
|
||||
"2013-01-03 00:00:00-05:00",
|
||||
"2013-01-03 00:00:00+01:00",
|
||||
],
|
||||
],
|
||||
columns=timezone_frame.columns,
|
||||
dtype="object",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with option_context("display.max_columns", 20):
|
||||
result = str(timezone_frame)
|
||||
assert (
|
||||
"0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
|
||||
) in result
|
||||
assert (
|
||||
"1 2013-01-02 NaT NaT"
|
||||
) in result
|
||||
assert (
|
||||
"2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
|
||||
) in result
|
||||
|
||||
def test_astype_empty_dtype_dict(self):
|
||||
# issue mentioned further down in the following issue's thread
|
||||
# https://github.com/pandas-dev/pandas/issues/33113
|
||||
df = DataFrame()
|
||||
result = df.astype({})
|
||||
tm.assert_frame_equal(result, df)
|
||||
assert result is not df
|
||||
|
||||
@pytest.mark.parametrize(
    "data, dtype",
    [
        (["x", "y", "z"], "string[python]"),
        pytest.param(
            ["x", "y", "z"],
            "string[pyarrow]",
            marks=td.skip_if_no("pyarrow"),
        ),
        (["x", "y", "z"], "category"),
        (3 * [Timestamp("2020-01-01", tz="UTC")], None),
        (3 * [Interval(0, 1)], None),
    ],
)
@pytest.mark.parametrize("errors", ["raise", "ignore"])
def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
    """``errors="ignore"`` returns the frame unchanged; ``"raise"`` raises."""
    # https://github.com/pandas-dev/pandas/issues/35471
    frame = DataFrame(Series(data, dtype=dtype))

    if errors != "ignore":
        pattern = "(Cannot cast)|(could not convert)"
        with pytest.raises((ValueError, TypeError), match=pattern):
            frame.astype(float, errors=errors)
        return

    # A failed cast that is ignored must hand back the original data
    converted = frame.astype(float, errors=errors)
    tm.assert_frame_equal(converted, frame)
|
||||
|
||||
def test_astype_tz_conversion(self):
|
||||
# GH 35973
|
||||
val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
|
||||
df = DataFrame(val)
|
||||
result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})
|
||||
|
||||
expected = df
|
||||
expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
def test_astype_tz_object_conversion(self, tz):
    """Object-dtype timestamps cast to the construction tz round-trip exactly."""
    # GH 35973
    london = date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")
    expected = DataFrame({"tz": london})

    # convert expected to object dtype from other tz str (independently tested)
    in_other_tz = expected.astype({"tz": f"datetime64[ns, {tz}]"})
    as_object = in_other_tz.astype({"tz": "object"})

    # do real test: object dtype to a specified tz, different from construction tz.
    round_tripped = as_object.astype({"tz": "datetime64[ns, Europe/London]"})
    tm.assert_frame_equal(round_tripped, expected)
|
||||
|
||||
def test_astype_dt64_to_string(
|
||||
self, frame_or_series, tz_naive_fixture, using_infer_string
|
||||
):
|
||||
# GH#41409
|
||||
tz = tz_naive_fixture
|
||||
|
||||
dti = date_range("2016-01-01", periods=3, tz=tz)
|
||||
dta = dti._data
|
||||
dta[0] = NaT
|
||||
|
||||
obj = frame_or_series(dta)
|
||||
result = obj.astype("string")
|
||||
|
||||
# Check that Series/DataFrame.astype matches DatetimeArray.astype
|
||||
expected = frame_or_series(dta.astype("string"))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
item = result.iloc[0]
|
||||
if frame_or_series is DataFrame:
|
||||
item = item.iloc[0]
|
||||
if using_infer_string:
|
||||
assert item is np.nan
|
||||
else:
|
||||
assert item is pd.NA
|
||||
|
||||
# For non-NA values, we should match what we get for non-EA str
|
||||
alt = obj.astype(str)
|
||||
assert np.all(alt.iloc[1:] == result.iloc[1:])
|
||||
|
||||
def test_astype_td64_to_string(self, frame_or_series):
|
||||
# GH#41409
|
||||
tdi = pd.timedelta_range("1 Day", periods=3)
|
||||
obj = frame_or_series(tdi)
|
||||
|
||||
expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
|
||||
result = obj.astype("string")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_astype_bytes(self):
|
||||
# GH#39474
|
||||
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
|
||||
assert result.dtypes[0] == np.dtype("S3")
|
||||
|
||||
@pytest.mark.parametrize(
    "index_slice",
    [
        np.s_[:2, :2],
        np.s_[:1, :2],
        np.s_[:2, :1],
        np.s_[::2, ::2],
        np.s_[::1, ::2],
        np.s_[::2, ::1],
    ],
)
def test_astype_noncontiguous(self, index_slice):
    """``astype`` on an ``iloc`` slice of a frame preserves the sliced values."""
    # GH#42396
    frame = DataFrame(np.arange(16).reshape(4, 4))
    subset = frame.iloc[index_slice]

    converted = subset.astype("int16")
    # Only the values matter here; the dtype intentionally differs
    tm.assert_frame_equal(converted, subset, check_dtype=False)
|
||||
|
||||
def test_astype_retain_attrs(self, any_numpy_dtype):
|
||||
# GH#44414
|
||||
df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
|
||||
df.attrs["Location"] = "Michigan"
|
||||
|
||||
result = df.astype({"a": any_numpy_dtype}).attrs
|
||||
expected = df.attrs
|
||||
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
|
||||
class TestAstypeCategorical:
|
||||
def test_astype_from_categorical3(self):
|
||||
df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
|
||||
cats = Categorical([1, 2, 3, 4, 5, 6])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_astype_from_categorical4(self):
|
||||
df = DataFrame(
|
||||
{"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
|
||||
)
|
||||
cats = Categorical(["a", "b", "b", "a", "a", "d"])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_categorical_astype_to_int(self, any_int_dtype):
|
||||
# GH#39402
|
||||
|
||||
df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
|
||||
df.col1 = df.col1.astype("category")
|
||||
df.col1 = df.col1.astype(any_int_dtype)
|
||||
expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_astype_categorical_to_string_missing(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/41797
|
||||
df = DataFrame(["a", "b", np.nan])
|
||||
expected = df.astype(str)
|
||||
cat = df.astype("category")
|
||||
result = cat.astype(str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """``IntegerArray`` subclass whose ``copy`` always fails.

    Any call to ``copy`` raises, so a test using this array type fails as
    soon as the code under test copies the data.
    """

    # GH 42501

    def copy(self):
        """Raise ``AssertionError`` unconditionally."""
        # Raise explicitly rather than ``assert False``: assert statements are
        # removed under ``python -O``, which would make this silently return None.
        raise AssertionError("IntegerArrayNoCopy.copy must not be called")
|
||||
|
||||
|
||||
class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` variant backed by ``IntegerArrayNoCopy``."""

    # GH 42501

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        array_type = IntegerArrayNoCopy
        return array_type
|
||||
|
||||
|
||||
def test_frame_astype_no_copy():
    """``astype(..., copy=False)`` converts one column and shares the other."""
    # GH 42501
    source = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)
    converted = source.astype({"a": Int16DtypeNoCopy()}, copy=False)

    assert converted["a"].dtype == pd.Int16Dtype()
    # Column "b" was not part of the mapping, so its buffer must be reused
    assert np.shares_memory(source["b"].values, converted["b"].values)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    """With ``copy=True`` the pyarrow result is unaffected by later writes."""
    # GH#50984
    pytest.importorskip("pyarrow")
    source = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    converted = source.astype("int64[pyarrow]", copy=True)

    # Mutate the source after the cast; the converted frame must not change
    source.iloc[0, 0] = 100

    untouched = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(converted, untouched)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_not_modifying_input(string_storage, val):
    """``astype("string", copy=False)`` leaves the source frame untouched."""
    # GH#51073
    source = DataFrame({"a": ["a", "b", val]})
    snapshot = source.copy()

    with option_context("mode.string_storage", string_storage):
        # Result is deliberately discarded; only the side effect is checked
        source.astype("string", copy=False)

    tm.assert_frame_equal(source, snapshot)
|
@ -0,0 +1,132 @@
|
||||
from datetime import time
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAtTime:
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_localized_at_time(self, tzstr, frame_or_series):
    """``at_time`` on a localized object equals localizing the naive result."""
    hourly = date_range("4/16/2012", "5/1/2012", freq="h")
    values = np.random.default_rng(2).standard_normal(len(hourly))
    naive = frame_or_series(values, index=hourly)
    localized = naive.tz_localize(tzstr)

    ten_am = time(10, 0)
    result = localized.at_time(ten_am)
    expected = naive.at_time(ten_am).tz_localize(tzstr)
    tm.assert_equal(result, expected)

    # The selection must keep the timezone of the localized index
    expected_tz = timezones.maybe_get_tz(tzstr)
    assert timezones.tz_compare(result.index.tz, expected_tz)
|
||||
|
||||
def test_at_time(self, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time("9:30")
|
||||
expected = ts.at_time(time(9, 30))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_at_time_midnight(self, frame_or_series):
|
||||
# midnight, everything
|
||||
rng = date_range("1/1/2000", "1/31/2000")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
tm.assert_equal(result, ts)
|
||||
|
||||
def test_at_time_nonexistent(self, frame_or_series):
|
||||
# time doesn't exist
|
||||
rng = date_range("1/1/2012", freq="23Min", periods=384)
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal(len(rng)), rng)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time("16:00")
|
||||
assert len(rs) == 0
|
||||
|
||||
@pytest.mark.parametrize(
    "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
)
def test_at_time_errors(self, hour):
    """A tz-aware ``time`` is rejected on a naive index; naive inputs select."""
    # GH#24043
    naive_index = date_range("2018", periods=3, freq="h")
    frame = DataFrame(list(range(len(naive_index))), index=naive_index)

    has_tz = getattr(hour, "tzinfo", None) is not None
    if has_tz:
        with pytest.raises(ValueError, match="Index must be timezone"):
            frame.at_time(hour)
        return

    # Naive 01:00 in any spelling picks the second row only
    tm.assert_frame_equal(frame.at_time(hour), frame.iloc[1:2])
|
||||
|
||||
def test_at_time_tz(self):
    """A tz-aware ``time`` is compared in the timezone of the index."""
    # GH#24043
    pacific = date_range("2018", periods=3, freq="h", tz="US/Pacific")
    frame = DataFrame(list(range(len(pacific))), index=pacific)

    eastern_four = time(4, tzinfo=pytz.timezone("US/Eastern"))
    selected = frame.at_time(eastern_four)

    tm.assert_frame_equal(selected, frame.iloc[1:2])
|
||||
|
||||
def test_at_time_raises(self, frame_or_series):
    """``at_time`` raises ``TypeError`` when the index is not a DatetimeIndex."""
    # GH#20725
    frame = DataFrame([[1, 2, 3], [4, 5, 6]])
    obj = tm.get_obj(frame, frame_or_series)

    with pytest.raises(TypeError, match="Index must be DatetimeIndex"):
        # index is not a DatetimeIndex
        obj.at_time("00:00")
|
||||
|
||||
@pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
def test_at_time_axis(self, axis):
    """``at_time`` honours ``axis`` given either by name or by number."""
    # issue 8839
    stamps = date_range("1/1/2000", "1/5/2000", freq="5min")
    size = len(stamps)
    frame = DataFrame(np.random.default_rng(2).standard_normal((size, size)))
    frame.index = stamps
    frame.columns = stamps

    at_930 = stamps[
        (stamps.hour == 9) & (stamps.minute == 30) & (stamps.second == 0)
    ]

    if axis in ["index", 0]:
        expected = frame.loc[at_930, :]
    elif axis in ["columns", 1]:
        expected = frame.loc[:, at_930]

    result = frame.at_time("9:30", axis=axis)

    # Without clearing freq, result has freq 1440T and expected 5T
    for obj in (result, expected):
        obj.index = obj.index._with_freq(None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
akey = time(12, 0, 0)
|
||||
ainds = [24, 72, 120, 168]
|
||||
|
||||
result = df.at_time(akey)
|
||||
expected = df.loc[akey]
|
||||
expected2 = df.iloc[ainds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 4
|
@ -0,0 +1,227 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
time,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestBetweenTime:
|
||||
@td.skip_if_not_us_locale
|
||||
def test_between_time_formats(self, frame_or_series):
|
||||
# GH#11818
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
strings = [
|
||||
("2:00", "2:30"),
|
||||
("0200", "0230"),
|
||||
("2:00am", "2:30am"),
|
||||
("0200am", "0230am"),
|
||||
("2:00:00", "2:30:00"),
|
||||
("020000", "023000"),
|
||||
("2:00:00am", "2:30:00am"),
|
||||
("020000am", "023000am"),
|
||||
]
|
||||
expected_length = 28
|
||||
|
||||
for time_string in strings:
|
||||
assert len(ts.between_time(*time_string)) == expected_length
|
||||
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_localized_between_time(self, tzstr, frame_or_series):
    """``between_time`` on a localized object equals localizing the naive one."""
    hourly = date_range("4/16/2012", "5/1/2012", freq="h")
    naive = Series(np.random.default_rng(2).standard_normal(len(hourly)), index=hourly)
    if frame_or_series is DataFrame:
        naive = naive.to_frame()
    localized = naive.tz_localize(tzstr)

    start, end = time(10, 0), time(11, 0)
    result = localized.between_time(start, end)
    expected = naive.between_time(start, end).tz_localize(tzstr)
    tm.assert_equal(result, expected)

    # The selection must keep the timezone of the localized index
    expected_tz = timezones.maybe_get_tz(tzstr)
    assert timezones.tz_compare(result.index.tz, expected_tz)
|
||||
|
||||
def test_between_time_types(self, frame_or_series):
    """Passing ``datetime`` objects instead of times raises ``ValueError``."""
    # GH11818
    index = date_range("1/1/2000", "1/5/2000", freq="5min")
    obj = tm.get_obj(DataFrame({"A": 0}, index=index), frame_or_series)

    start = datetime(2010, 1, 2, 1)
    end = datetime(2010, 1, 2, 5)
    msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time"
    with pytest.raises(ValueError, match=msg):
        obj.between_time(start, end)
|
||||
|
||||
def test_between_time(self, inclusive_endpoints_fixture, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inclusive = inclusive_endpoints_fixture
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = 13 * 4 + 1
|
||||
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 5
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time("00:00", "01:00")
|
||||
expected = ts.between_time(stime, etime)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 4
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self, frame_or_series):
    """``between_time`` raises ``TypeError`` without a DatetimeIndex."""
    # GH#20725
    frame = DataFrame([[1, 2, 3], [4, 5, 6]])
    obj = tm.get_obj(frame, frame_or_series)

    with pytest.raises(TypeError, match="Index must be DatetimeIndex"):
        # index is not a DatetimeIndex
        obj.between_time(start_time="00:00", end_time="12:00")
|
||||
|
||||
def test_between_time_axis(self, frame_or_series):
    """``axis=0`` matches the default; an out-of-range axis raises."""
    # GH#8839
    index = date_range("1/1/2000", periods=100, freq="10min")
    obj = Series(np.random.default_rng(2).standard_normal(len(index)), index=index)
    if frame_or_series is DataFrame:
        obj = obj.to_frame()

    start, end = "08:00:00", "09:00:00"
    expected_length = 7

    for kwargs in ({}, {"axis": 0}):
        assert len(obj.between_time(start, end, **kwargs)) == expected_length

    # ``ndim`` is one past the last valid axis number
    bad_axis = obj.ndim
    msg = f"No axis named {obj.ndim} for object type {type(obj).__name__}"
    with pytest.raises(ValueError, match=msg):
        obj.between_time(start, end, axis=bad_axis)
|
||||
|
||||
def test_between_time_axis_aliases(self, axis):
|
||||
# GH#8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
exp_len = 7
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.index = rng
|
||||
assert len(ts.between_time(stime, etime)) == exp_len
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
|
||||
|
||||
if axis in ["columns", 1]:
|
||||
ts.columns = rng
|
||||
selected = ts.between_time(stime, etime, axis=1).columns
|
||||
assert len(selected) == exp_len
|
||||
|
||||
def test_between_time_axis_raises(self, axis):
    """Filtering along an axis whose labels are not datetimes raises."""
    # issue 8839
    stamps = date_range("1/1/2000", periods=100, freq="10min")
    positions = np.arange(0, len(stamps))
    values = np.random.default_rng(2).standard_normal((len(stamps), len(stamps)))
    frame = DataFrame(values, index=stamps, columns=stamps)
    start, end = "08:00:00", "09:00:00"

    msg = "Index must be DatetimeIndex"
    if axis in ["columns", 1]:
        # Replace the row labels with integers, then filter by rows
        frame.index = positions
        for kwargs in ({}, {"axis": 0}):
            with pytest.raises(TypeError, match=msg):
                frame.between_time(start, end, **kwargs)

    if axis in ["index", 0]:
        # Replace the column labels with integers, then filter by columns
        frame.columns = positions
        with pytest.raises(TypeError, match=msg):
            frame.between_time(start, end, axis=1)
|
||||
|
||||
def test_between_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
bkey = slice(time(13, 0, 0), time(14, 0, 0))
|
||||
binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
|
||||
|
||||
result = df.between_time(bkey.start, bkey.stop)
|
||||
expected = df.loc[bkey]
|
||||
expected2 = df.iloc[binds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 12
|
||||
|
||||
def test_between_time_incorrect_arg_inclusive(self):
    """An unrecognised ``inclusive`` value raises ``ValueError``."""
    # GH40245
    index = date_range("1/1/2000", "1/5/2000", freq="5min")
    values = np.random.default_rng(2).standard_normal((len(index), 2))
    frame = DataFrame(values, index=index)

    start, end = time(0, 0), time(1, 0)
    msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
    with pytest.raises(ValueError, match=msg):
        frame.between_time(start, end, inclusive="bad_string")
|
@ -0,0 +1,199 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameClip:
|
||||
def test_clip(self, float_frame):
|
||||
median = float_frame.median().median()
|
||||
original = float_frame.copy()
|
||||
|
||||
double = float_frame.clip(upper=median, lower=median)
|
||||
assert not (double.values != median).any()
|
||||
|
||||
# Verify that float_frame was not changed inplace
|
||||
assert (float_frame.values == original.values).all()
|
||||
|
||||
def test_inplace_clip(self, float_frame):
|
||||
# GH#15388
|
||||
median = float_frame.median().median()
|
||||
frame_copy = float_frame.copy()
|
||||
|
||||
return_value = frame_copy.clip(upper=median, lower=median, inplace=True)
|
||||
assert return_value is None
|
||||
assert not (frame_copy.values != median).any()
|
||||
|
||||
def test_dataframe_clip(self):
|
||||
# GH#2747
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
|
||||
for lb, ub in [(-1, 1), (1, -1)]:
|
||||
clipped_df = df.clip(lb, ub)
|
||||
|
||||
lb, ub = min(lb, ub), max(ub, lb)
|
||||
lb_mask = df.values <= lb
|
||||
ub_mask = df.values >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
assert (clipped_df.values[lb_mask] == lb).all()
|
||||
assert (clipped_df.values[ub_mask] == ub).all()
|
||||
assert (clipped_df.values[mask] == df.values[mask]).all()
|
||||
|
||||
def test_clip_mixed_numeric(self):
|
||||
# clip on mixed integer or floats
|
||||
# GH#24162, clipping now preserves numeric types per column
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
|
||||
result = df.clip(1, 2)
|
||||
expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
|
||||
expected = df.dtypes
|
||||
result = df.clip(upper=3).dtypes
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
def test_clip_against_series(self, inplace):
    """Clip a frame row-wise against Series bounds, in place or not.

    For each column: values at or below the lower bound become the lower
    bound, values at or above the upper bound become the upper bound, and
    everything else is left as it was before clipping.
    """
    # GH#6966

    df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
    lb = Series(np.random.default_rng(2).standard_normal(1000))
    ub = lb + 1

    original = df.copy()
    clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

    if inplace:
        # clip returned None; the clipped data lives in ``df`` itself
        clipped_df = df

    for i in range(2):
        lb_mask = original.iloc[:, i] <= lb
        ub_mask = original.iloc[:, i] >= ub
        mask = ~lb_mask & ~ub_mask

        result = clipped_df.loc[lb_mask, i]
        tm.assert_series_equal(result, lb[lb_mask], check_names=False)
        assert result.name == i

        result = clipped_df.loc[ub_mask, i]
        tm.assert_series_equal(result, ub[ub_mask], check_names=False)
        assert result.name == i

        # Compare with the pre-clip snapshot: when ``inplace`` is True,
        # ``df`` is the clipped frame and comparing against it proves nothing.
        tm.assert_series_equal(clipped_df.loc[mask, i], original.loc[mask, i])
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
@pytest.mark.parametrize(
    "axis,res",
    [
        (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
        (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
    ],
)
def test_clip_against_list_like(self, inplace, lower, axis, res):
    """List and ndarray bounds are applied along the requested axis."""
    # GH#15390
    grid = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    frame = DataFrame(grid, columns=["one", "two", "three"], index=["a", "b", "c"])

    returned = frame.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)
    # In-place clipping returns None, so inspect the frame itself instead
    clipped = frame if inplace else returned

    expected = DataFrame(res, columns=frame.columns, index=frame.index)
    tm.assert_frame_equal(clipped, expected, check_exact=True)
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1, None])
def test_clip_against_frame(self, axis):
    """Clip a frame element-wise against DataFrame lower and upper bounds."""
    # Draw the data and the lower bound from ONE generator. Creating a fresh
    # ``default_rng(2)`` for each would yield identical arrays, making every
    # value equal to its lower bound and leaving the upper-bound and
    # in-range branches with nothing to check.
    rng = np.random.default_rng(2)
    df = DataFrame(rng.standard_normal((1000, 2)))
    lb = DataFrame(rng.standard_normal((1000, 2)))
    ub = lb + 1

    clipped_df = df.clip(lb, ub, axis=axis)

    lb_mask = df <= lb
    ub_mask = df >= ub
    mask = ~lb_mask & ~ub_mask

    tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
    tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
    tm.assert_frame_equal(clipped_df[mask], df[mask])
|
||||
|
||||
def test_clip_against_unordered_columns(self):
|
||||
# GH#20911
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["D", "A", "B", "C"],
|
||||
)
|
||||
df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
|
||||
result_upper = df1.clip(lower=0, upper=df2)
|
||||
expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
|
||||
result_lower = df1.clip(lower=df3, upper=3)
|
||||
expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
|
||||
result_lower_upper = df1.clip(lower=df3, upper=df2)
|
||||
expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
|
||||
tm.assert_frame_equal(result_upper, expected_upper)
|
||||
tm.assert_frame_equal(result_lower, expected_lower)
|
||||
tm.assert_frame_equal(result_lower_upper, expected_lower_upper)
|
||||
|
||||
def test_clip_with_na_args(self, float_frame):
|
||||
"""Should process np.nan argument as None"""
|
||||
# GH#17276
|
||||
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
|
||||
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)
|
||||
|
||||
# GH#19992 and adjusted in GH#40420
|
||||
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
|
||||
|
||||
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
|
||||
# TODO: avoid this warning here? seems like we should never be upcasting
|
||||
# in the first place?
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=0)
|
||||
expected = DataFrame(
|
||||
{"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=1)
|
||||
expected = DataFrame(
|
||||
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH#40420
|
||||
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
|
||||
df = DataFrame(data)
|
||||
t = Series([2, -4, np.nan, 6, 3])
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.clip(lower=t, axis=0)
|
||||
expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_int_data_with_float_bound(self):
|
||||
# GH51472
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
result = df.clip(lower=1.5)
|
||||
expected = DataFrame({"a": [1.5, 2.0, 3.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_with_list_bound(self):
|
||||
# GH#54817
|
||||
df = DataFrame([1, 5])
|
||||
expected = DataFrame([3, 5])
|
||||
result = df.clip([3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([1, 3])
|
||||
result = df.clip(upper=[3])
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,47 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCombine:
|
||||
@pytest.mark.parametrize(
    "data",
    [
        pd.date_range("2000", periods=4),
        pd.date_range("2000", periods=4, tz="US/Central"),
        pd.period_range("2000", periods=4),
        pd.timedelta_range(0, periods=4),
    ],
)
def test_combine_datetlike_udf(self, data):
    """``combine`` with a take-the-other function returns the other frame."""
    # GH#23079
    holed = pd.DataFrame({"A": data})
    complete = holed.copy()
    holed.iloc[1, 0] = None

    # The combiner ignores the calling frame's column entirely
    result = holed.combine(complete, lambda a, b: b)
    tm.assert_frame_equal(result, complete)
|
||||
|
||||
def test_combine_generic(self, float_frame):
|
||||
df1 = float_frame
|
||||
df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
combined = df1.combine(df2, np.add)
|
||||
combined2 = df2.combine(df1, np.add)
|
||||
assert combined["D"].isna().all()
|
||||
assert combined2["D"].isna().all()
|
||||
|
||||
chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
|
||||
chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
exp = (
|
||||
float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
|
||||
* 2
|
||||
)
|
||||
tm.assert_frame_equal(chunk, exp)
|
||||
tm.assert_frame_equal(chunk2, exp)
|
@ -0,0 +1,556 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.cast import find_common_type
|
||||
from pandas.core.dtypes.common import is_dtype_equal
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst:
|
||||
def test_combine_first_mixed(self):
|
||||
a = Series(["a", "b"], index=range(2))
|
||||
b = Series(range(2), index=range(2))
|
||||
f = DataFrame({"A": a, "B": b})
|
||||
|
||||
a = Series(["a", "b"], index=range(5, 7))
|
||||
b = Series(range(2), index=range(5, 7))
|
||||
g = DataFrame({"A": a, "B": b})
|
||||
|
||||
exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_frame_equal(combined, exp)
|
||||
|
||||
def test_combine_first(self, float_frame, using_infer_string):
|
||||
# disjoint
|
||||
head, tail = float_frame[:5], float_frame[5:]
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
reordered_frame = float_frame.reindex(combined.index)
|
||||
tm.assert_frame_equal(combined, reordered_frame)
|
||||
tm.assert_index_equal(combined.columns, float_frame.columns)
|
||||
tm.assert_series_equal(combined["A"], reordered_frame["A"])
|
||||
|
||||
# same index
|
||||
fcopy = float_frame.copy()
|
||||
fcopy["A"] = 1
|
||||
del fcopy["C"]
|
||||
|
||||
fcopy2 = float_frame.copy()
|
||||
fcopy2["B"] = 0
|
||||
del fcopy2["D"]
|
||||
|
||||
combined = fcopy.combine_first(fcopy2)
|
||||
|
||||
assert (combined["A"] == 1).all()
|
||||
tm.assert_series_equal(combined["B"], fcopy["B"])
|
||||
tm.assert_series_equal(combined["C"], fcopy2["C"])
|
||||
tm.assert_series_equal(combined["D"], fcopy["D"])
|
||||
|
||||
# overlap
|
||||
head, tail = reordered_frame[:10].copy(), reordered_frame
|
||||
head["A"] = 1
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
assert (combined["A"][:10] == 1).all()
|
||||
|
||||
# reverse overlap
|
||||
tail.iloc[:10, tail.columns.get_loc("A")] = 0
|
||||
combined = tail.combine_first(head)
|
||||
assert (combined["A"][:10] == 0).all()
|
||||
|
||||
# no overlap
|
||||
f = float_frame[:10]
|
||||
g = float_frame[10:]
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
|
||||
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
|
||||
|
||||
# corner cases
|
||||
warning = FutureWarning if using_infer_string else None
|
||||
with tm.assert_produces_warning(warning, match="empty entries"):
|
||||
comb = float_frame.combine_first(DataFrame())
|
||||
tm.assert_frame_equal(comb, float_frame)
|
||||
|
||||
comb = DataFrame().combine_first(float_frame)
|
||||
tm.assert_frame_equal(comb, float_frame.sort_index())
|
||||
|
||||
comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
|
||||
assert "faz" in comb.index
|
||||
|
||||
# #2525
|
||||
df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
|
||||
df2 = DataFrame(columns=["b"])
|
||||
result = df.combine_first(df2)
|
||||
assert "b" in result
|
||||
|
||||
def test_combine_first_mixed_bug(self):
|
||||
idx = Index(["a", "b", "c", "e"])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
|
||||
ser2 = Series(["a", "b", "c", "e"], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
|
||||
|
||||
idx = Index(["a", "b", "c", "f"])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
|
||||
ser2 = Series(["a", "b", "c", "f"], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
|
||||
|
||||
combined = frame1.combine_first(frame2)
|
||||
assert len(combined.columns) == 5
|
||||
|
||||
def test_combine_first_same_as_in_update(self):
|
||||
# gh 3016 (same as in update)
|
||||
df = DataFrame(
|
||||
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
|
||||
columns=["A", "B", "bool1", "bool2"],
|
||||
)
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
|
||||
result = df.combine_first(other)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
df.loc[0, "A"] = np.nan
|
||||
result = df.combine_first(other)
|
||||
df.loc[0, "A"] = 45
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_combine_first_doc_example(self):
|
||||
# doc example
|
||||
df1 = DataFrame(
|
||||
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
|
||||
"B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
|
||||
}
|
||||
)
|
||||
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_combine_first_return_obj_type_with_bools(self):
|
||||
# GH3552
|
||||
|
||||
df1 = DataFrame(
|
||||
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
|
||||
)
|
||||
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
|
||||
|
||||
expected = Series([True, True, False], name=2, dtype=bool)
|
||||
|
||||
result_12 = df1.combine_first(df2)[2]
|
||||
tm.assert_series_equal(result_12, expected)
|
||||
|
||||
result_21 = df2.combine_first(df1)[2]
|
||||
tm.assert_series_equal(result_21, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "data1, data2, data_expected",
    (
        # caller complete, other all-NaT
        (
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            [pd.NaT, pd.NaT, pd.NaT],
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
        ),
        # caller all-NaT, other complete
        (
            [pd.NaT, pd.NaT, pd.NaT],
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
        ),
        # caller partially NaT, other complete
        (
            [datetime(2000, 1, 2), pd.NaT, pd.NaT],
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
        ),
        # caller complete, other partially NaT
        (
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            [datetime(2000, 1, 2), pd.NaT, pd.NaT],
            [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
        ),
    ),
)
def test_combine_first_convert_datatime_correctly(
    self, data1, data2, data_expected
):
    """GH 3593: datetime/NaT columns combine into the expected datetime column."""
    caller = DataFrame({"a": data1})
    other = DataFrame({"a": data2})
    wanted = DataFrame({"a": data_expected})

    tm.assert_frame_equal(caller.combine_first(other), wanted)
|
||||
|
||||
def test_combine_first_align_nan(self):
|
||||
# GH 7509 (not fixed)
|
||||
dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
|
||||
dfb = DataFrame([[4], [5]], columns=["b"])
|
||||
assert dfa["a"].dtype == "datetime64[ns]"
|
||||
assert dfa["b"].dtype == "int64"
|
||||
|
||||
res = dfa.combine_first(dfb)
|
||||
exp = DataFrame(
|
||||
{"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]},
|
||||
columns=["a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["a"].dtype == "datetime64[ns]"
|
||||
# TODO: this must be int64
|
||||
assert res["b"].dtype == "int64"
|
||||
|
||||
res = dfa.iloc[:0].combine_first(dfb)
|
||||
exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
# TODO: this must be datetime64
|
||||
assert res["a"].dtype == "float64"
|
||||
# TODO: this must be int64
|
||||
assert res["b"].dtype == "int64"
|
||||
|
||||
def test_combine_first_timezone(self, unit):
|
||||
# see gh-7630
|
||||
data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit)
|
||||
df1 = DataFrame(
|
||||
columns=["UTCdatetime", "abc"],
|
||||
data=data1,
|
||||
index=pd.date_range("20140627", periods=1),
|
||||
)
|
||||
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit)
|
||||
df2 = DataFrame(
|
||||
columns=["UTCdatetime", "xyz"],
|
||||
data=data2,
|
||||
index=pd.date_range("20140628", periods=1),
|
||||
)
|
||||
res = df2[["UTCdatetime"]].combine_first(df1)
|
||||
exp = DataFrame(
|
||||
{
|
||||
"UTCdatetime": [
|
||||
pd.Timestamp("2010-01-01 01:01", tz="UTC"),
|
||||
pd.Timestamp("2012-12-12 12:12", tz="UTC"),
|
||||
],
|
||||
"abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
|
||||
},
|
||||
columns=["UTCdatetime", "abc"],
|
||||
index=pd.date_range("20140627", periods=2, freq="D"),
|
||||
dtype=f"datetime64[{unit}, UTC]",
|
||||
)
|
||||
assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]"
|
||||
assert res["abc"].dtype == f"datetime64[{unit}, UTC]"
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_combine_first_timezone2(self, unit):
|
||||
# see gh-10567
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit)
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit)
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res["DATE"].dtype == f"datetime64[{unit}, UTC]"
|
||||
|
||||
def test_combine_first_timezone3(self, unit):
|
||||
dts1 = pd.DatetimeIndex(
|
||||
["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
|
||||
).as_unit(unit)
|
||||
df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
|
||||
dts2 = pd.DatetimeIndex(
|
||||
["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
|
||||
).as_unit(unit)
|
||||
df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.DatetimeIndex(
|
||||
[
|
||||
"2011-01-01",
|
||||
"2012-01-01",
|
||||
"NaT",
|
||||
"2012-01-02",
|
||||
"2011-01-03",
|
||||
"2011-01-04",
|
||||
],
|
||||
tz="US/Eastern",
|
||||
).as_unit(unit)
|
||||
exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# FIXME: parametrizing over unit breaks on non-nano
|
||||
def test_combine_first_timezone4(self):
|
||||
# different tz
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-03", "2015-01-05")
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
# if df1 doesn't have NaN, keep its dtype
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"
|
||||
|
||||
def test_combine_first_timezone5(self, unit):
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit)
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit)
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [
|
||||
pd.Timestamp("2015-01-01", tz="US/Eastern"),
|
||||
pd.Timestamp("2015-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2015-01-03"),
|
||||
]
|
||||
exp = DataFrame({"DATE": exp_dts})
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["DATE"].dtype == "object"
|
||||
|
||||
def test_combine_first_timedelta(self):
|
||||
data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
|
||||
df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
|
||||
df2 = DataFrame({"TD": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.TimedeltaIndex(
|
||||
["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
|
||||
)
|
||||
exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["TD"].dtype == "timedelta64[ns]"
|
||||
|
||||
def test_combine_first_period(self):
|
||||
data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
|
||||
df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
|
||||
df2 = DataFrame({"P": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.PeriodIndex(
|
||||
["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
|
||||
)
|
||||
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == data1.dtype
|
||||
|
||||
# different freq
|
||||
dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
|
||||
df2 = DataFrame({"P": dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [
|
||||
pd.Period("2011-01", freq="M"),
|
||||
pd.Period("2012-01-01", freq="D"),
|
||||
pd.NaT,
|
||||
pd.Period("2012-01-02", freq="D"),
|
||||
pd.Period("2011-03", freq="M"),
|
||||
pd.Period("2011-04", freq="M"),
|
||||
]
|
||||
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == "object"
|
||||
|
||||
def test_combine_first_int(self):
|
||||
# GH14687 - integer series that do no align exactly
|
||||
|
||||
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
|
||||
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
|
||||
|
||||
result_12 = df1.combine_first(df2)
|
||||
expected_12 = DataFrame({"a": [0, 1, 3, 5]})
|
||||
tm.assert_frame_equal(result_12, expected_12)
|
||||
|
||||
result_21 = df2.combine_first(df1)
|
||||
expected_21 = DataFrame({"a": [1, 4, 3, 5]})
|
||||
tm.assert_frame_equal(result_21, expected_21)
|
||||
|
||||
@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
    """see gh-20699: frames with disjoint columns keep both columns' values."""
    numeric = DataFrame({"isNum": [val]})
    boolean = DataFrame({"isBool": [True]})

    combined = numeric.combine_first(boolean)

    wanted = DataFrame({"isBool": [True], "isNum": [val]})
    tm.assert_frame_equal(combined, wanted)
|
||||
|
||||
def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
|
||||
# GH: 37519
|
||||
df = DataFrame(
|
||||
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
|
||||
)
|
||||
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
|
||||
df.set_index(["a", "b"], inplace=True)
|
||||
df2.set_index(["a", "b"], inplace=True)
|
||||
result = df.combine_first(df2)
|
||||
expected = DataFrame(
|
||||
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
|
||||
).set_index(["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "scalar1, scalar2",
    [
        (datetime(2020, 1, 1), datetime(2020, 1, 2)),
        (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
        (pd.Timedelta("89 days"), pd.Timedelta("60 min")),
        (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
    ],
)
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
    """GH28481: an all-null caller combined with a frame of typed scalars."""
    na_value = nulls_fixture

    frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
    other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])

    caller_dtype = frame.dtypes["b"]
    other_dtype = other.dtypes["b"]
    common_dtype = find_common_type([caller_dtype, other_dtype])

    # The test expects the scalar in "b" only when the common dtype is object
    # or both frames already agree on the dtype; otherwise the null is expected.
    takes_scalar = is_dtype_equal(common_dtype, "object") or caller_dtype == other_dtype
    val = scalar1 if takes_scalar else na_value

    result = frame.combine_first(other)

    expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
    expected["b"] = expected["b"].astype(common_dtype)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_timestamp_bug_NaT():
    """GH28481: an all-NaT caller is filled from a frame holding datetimes."""
    first_day = datetime(2020, 1, 1)
    second_day = datetime(2020, 1, 2)

    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame([[first_day, second_day]], columns=["b", "c"])

    combined = frame.combine_first(other)

    wanted = DataFrame(
        [[pd.NaT, first_day, second_day]], columns=["a", "b", "c"]
    )
    tm.assert_frame_equal(combined, wanted)
|
||||
|
||||
|
||||
def test_combine_first_with_nan_multiindex():
    """gh-36562: combine_first over MultiIndexes where one level holds NaN."""
    level_names = ["a", "b"]

    caller_index = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=level_names
    )
    caller = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=caller_index)

    other_index = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=level_names
    )
    other = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=other_index)})

    combined = caller.combine_first(other)

    wanted_index = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    wanted = DataFrame(
        {
            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
        },
        index=wanted_index,
    )
    tm.assert_frame_equal(combined, wanted)
|
||||
|
||||
|
||||
def test_combine_preserve_dtypes():
    """GH7509: combine frames with disjoint row labels and one shared column."""
    top_rows = range(2)
    bottom_rows = range(5, 7)

    top = DataFrame(
        {
            "A": Series(["a", "b"], index=top_rows),
            "B": Series(range(2), index=top_rows),
        }
    )
    # Build "C" before "B", as the original did, but place "B" first in the frame.
    bottom_c = Series(["a", "b"], index=bottom_rows)
    bottom_b = Series(range(-1, 1), index=bottom_rows)
    bottom = DataFrame({"B": bottom_b, "C": bottom_c})

    wanted = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )
    tm.assert_frame_equal(top.combine_first(bottom), wanted)
|
||||
|
||||
|
||||
def test_combine_first_duplicates_rows_for_nan_index_values():
|
||||
# GH39881
|
||||
df1 = DataFrame(
|
||||
{"x": [9, 10, 11]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{"y": [12, 13, 14]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"x": [9.0, 10.0, 11.0, np.nan],
|
||||
"y": [12.0, 13.0, np.nan, 14.0],
|
||||
},
|
||||
index=MultiIndex.from_arrays(
|
||||
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
|
||||
),
|
||||
)
|
||||
combined = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(combined, expected)
|
||||
|
||||
|
||||
def test_combine_first_int64_not_cast_to_float64():
    """GH 28613: a column present only in ``other`` comes through as integers."""
    narrow = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    wide = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})

    combined = narrow.combine_first(wide)

    wanted = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(combined, wanted)
|
||||
|
||||
|
||||
def test_midx_losing_dtype():
    """GH#49830: combine frames whose MultiIndex second level is all-NaN."""
    nan_pair = [np.nan, np.nan]
    caller_index = MultiIndex.from_arrays([[0, 0], nan_pair])
    other_index = MultiIndex.from_arrays([[1, 1], nan_pair])
    caller = DataFrame({"a": [None, 4]}, index=caller_index)
    other = DataFrame({"a": [3, 3]}, index=other_index)

    combined = caller.combine_first(other)

    wanted_index = MultiIndex.from_arrays(
        [[0, 0, 1, 1], [np.nan, np.nan, np.nan, np.nan]]
    )
    wanted = DataFrame({"a": [np.nan, 4, 3, 3]}, index=wanted_index)
    tm.assert_frame_equal(combined, wanted)
|
||||
|
||||
|
||||
def test_combine_first_empty_columns():
|
||||
left = DataFrame(columns=["a", "b"])
|
||||
right = DataFrame(columns=["a", "c"])
|
||||
result = left.combine_first(right)
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,305 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat.numpy import np_version_gte1p25
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """GH#30429: ``compare`` stacks self/other along the requested axis."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = 4.0

    result = left.compare(right, align_axis=align_axis)

    if align_axis not in (1, "columns"):
        # self/other become an extra row level
        row_labels = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
        column_labels = pd.Index(["col1", "col3"])
        cells = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]
    else:
        # self/other become an extra column level
        row_labels = pd.Index([0, 2])
        column_labels = pd.MultiIndex.from_product(
            [["col1", "col3"], ["self", "other"]]
        )
        cells = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]

    expected = pd.DataFrame(cells, index=row_labels, columns=column_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Check ``compare`` output for the keep_shape / keep_equal combinations."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = 4.0

    result = left.compare(right, keep_shape=keep_shape, keep_equal=keep_equal)

    if not keep_shape:
        # Only differing rows/columns remain; equal cells are shown.
        row_labels = pd.Index([0, 2])
        column_labels = pd.MultiIndex.from_product(
            [["col1", "col3"], ["self", "other"]]
        )
        cells = [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]]
    else:
        row_labels = pd.Index([0, 1, 2])
        column_labels = pd.MultiIndex.from_product(
            [["col1", "col2", "col3"], ["self", "other"]]
        )
        if keep_equal:
            cells = [
                ["a", "c", 1.0, 1.0, 1.0, 1.0],
                ["b", "b", 2.0, 2.0, 2.0, 2.0],
                ["c", "c", np.nan, np.nan, 3.0, 4.0],
            ]
        else:
            cells = [
                ["a", "c", np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
            ]

    expected = pd.DataFrame(cells, index=row_labels, columns=column_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_with_equal_nulls():
    """Two NaNs in the same cell count as equal and are dropped where applicable."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"

    result = left.compare(right)

    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_with_non_equal_nulls():
    """A NaN that differs from the other frame's value is kept in the output.

    The relevant NaNs must not get dropped even if the entire row or column
    are NaNs.
    """
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = np.nan

    result = left.compare(right)

    row_labels = pd.Index([0, 2])
    column_labels = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
    cells = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]]
    expected = pd.DataFrame(cells, index=row_labels, columns=column_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """``compare`` on frames whose rows and columns are both MultiIndexes."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    left.columns = pd.MultiIndex.from_arrays(
        [["a", "a", "b"], ["col1", "col2", "col3"]]
    )
    left.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    right = left.copy()
    right.iloc[0, 0] = "c"
    right.iloc[2, 2] = 4.0

    result = left.compare(right, align_axis=align_axis)

    if align_axis != 0:
        # self/other appended as the innermost column level
        row_labels = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        column_labels = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                ["self", "other", "self", "other"],
            ]
        )
        cells = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
    else:
        # self/other appended as the innermost row level
        row_labels = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]]
        )
        column_labels = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        cells = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]

    expected = pd.DataFrame(data=cells, index=row_labels, columns=column_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_unaligned_objects():
    """``compare`` raises ValueError for frames that are not identically labeled.

    The frames are built outside the ``pytest.raises`` blocks so that only the
    ``compare`` call itself can satisfy the expectation; an error raised while
    constructing the fixtures would now fail the test instead of passing it.
    """
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # test DataFrames with different indices
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
|
||||
|
||||
|
||||
def test_compare_result_names():
    """GH 44354: ``result_names`` replaces the default self/other labels."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    result = left.compare(right, result_names=("left", "right"))

    # Rows 0 and 2 differ: col1 at row 0, col3 at row 2.
    wanted_cells = {
        ("col1", "left"): {0: "a", 2: np.nan},
        ("col1", "right"): {0: "c", 2: np.nan},
        ("col3", "left"): {0: np.nan, 2: 3.0},
        ("col3", "right"): {0: np.nan, 2: np.nan},
    }
    tm.assert_frame_equal(result, pd.DataFrame(wanted_cells))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """GH 44354: non-tuple ``result_names`` values are rejected with TypeError."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )
    expected_message = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_message):
        left.compare(right, result_names=result_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "val1,val2",
    [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
)
def test_compare_ea_and_np_dtype(val1, val2):
    """GH 48966: compare a list-backed column against an Int64 column."""
    plain_values = [4.0, val1]
    nullable_values = pd.Series([1, val2], dtype="Int64")

    left = pd.DataFrame({"a": plain_values, "b": [1.0, 2]})
    right = pd.DataFrame({"a": nullable_values, "b": [1.0, 2]})
    expected = pd.DataFrame(
        {
            ("a", "self"): plain_values,
            ("a", "other"): nullable_values,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    if val1 is pd.NA and val2 is pd.NA:
        # GH#18463 TODO: is this really the desired behavior?
        expected.loc[1, ("a", "self")] = np.nan

    if val1 is pd.NA and np_version_gte1p25:
        # can't compare with numpy array if it contains pd.NA
        with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
            left.compare(right, keep_shape=True)
        return

    tm.assert_frame_equal(left.compare(right, keep_shape=True), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """GH 48966: ``compare`` with keep_shape on an Int64 column."""
    left = pd.DataFrame(
        {"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}
    )
    right = left.copy()
    right.loc[0, "a"] = df2_val

    self_column = pd.Series([diff_self, pd.NA], dtype="Int64")
    other_column = pd.Series([diff_other, pd.NA], dtype="Int64")
    expected = pd.DataFrame(
        {
            ("a", "self"): self_column,
            ("a", "other"): other_column,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )

    tm.assert_frame_equal(left.compare(right, keep_shape=True), expected)
|
@ -0,0 +1,202 @@
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestConvertDtypes:
    """DataFrame-level tests for ``convert_dtypes``."""

    @pytest.mark.parametrize(
        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
    )
    def test_convert_dtypes(
        self, convert_integer, expected, string_storage, using_infer_string
    ):
        # Specific types are tested in tests/series/test_dtypes.py
        # Just check that it works for DataFrame here
        if using_infer_string:
            string_storage = "pyarrow_numpy"
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
            }
        )
        with pd.option_context("string_storage", string_storage):
            # The four options are deliberately passed positionally.
            result = frame.convert_dtypes(True, True, convert_integer, False)
        integer_column = pd.Series([1, 2, 3], dtype=expected)
        string_column = pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]")
        expected = pd.DataFrame({"a": integer_column, "b": string_column})
        tm.assert_frame_equal(result, expected)

    def test_convert_empty(self):
        # Empty DataFrame can pass convert_dtypes, see GH#40393
        empty = pd.DataFrame()
        tm.assert_frame_equal(empty, empty.convert_dtypes())

    def test_convert_dtypes_retain_column_names(self):
        # GH#41435
        frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        frame.columns.name = "cols"

        converted = frame.convert_dtypes()

        tm.assert_index_equal(converted.columns, frame.columns)
        assert converted.columns.name == "cols"

    def test_pyarrow_dtype_backend(self):
        pa = pytest.importorskip("pyarrow")
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
                "c": pd.Series([True, False, None], dtype=np.dtype("O")),
                "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
                "e": pd.Series(pd.date_range("2022", periods=3)),
                "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
                "g": pd.Series(pd.timedelta_range("1D", periods=3)),
            }
        )

        result = frame.convert_dtypes(dtype_backend="pyarrow")

        # Shared building blocks for the expected pyarrow-backed columns.
        arrow_array = pd.arrays.ArrowExtensionArray
        first_three_days = [
            datetime.datetime(2022, 1, 1),
            datetime.datetime(2022, 1, 2),
            datetime.datetime(2022, 1, 3),
        ]
        one_to_three_days = [
            datetime.timedelta(1),
            datetime.timedelta(2),
            datetime.timedelta(3),
        ]
        expected = pd.DataFrame(
            {
                "a": arrow_array(pa.array([1, 2, 3], type=pa.int32())),
                "b": arrow_array(pa.array(["x", "y", None])),
                "c": arrow_array(pa.array([True, False, None])),
                "d": arrow_array(pa.array([None, 100.5, 200.0])),
                "e": arrow_array(
                    pa.array(first_three_days, type=pa.timestamp(unit="ns"))
                ),
                "f": arrow_array(
                    pa.array(first_three_days, type=pa.timestamp(unit="s", tz="UTC"))
                ),
                "g": arrow_array(pa.array(one_to_three_days, type=pa.duration("ns"))),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_already_pyarrow(self):
        pytest.importorskip("pyarrow")
        already_arrow = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
        converted = already_arrow.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(converted, already_arrow)

    def test_pyarrow_dtype_backend_from_pandas_nullable(self):
        pa = pytest.importorskip("pyarrow")
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, None], dtype="Int32"),
                "b": pd.Series(["x", "y", None], dtype="string[python]"),
                "c": pd.Series([True, False, None], dtype="boolean"),
                "d": pd.Series([None, 100.5, 200], dtype="Float64"),
            }
        )

        result = frame.convert_dtypes(dtype_backend="pyarrow")

        arrow_array = pd.arrays.ArrowExtensionArray
        expected = pd.DataFrame(
            {
                "a": arrow_array(pa.array([1, 2, None], type=pa.int32())),
                "b": arrow_array(pa.array(["x", "y", None])),
                "c": arrow_array(pa.array([True, False, None])),
                "d": arrow_array(pa.array([None, 100.5, 200.0])),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_empty_object(self):
        # GH 50970
        pytest.importorskip("pyarrow")
        empty = pd.DataFrame(columns=[0])
        converted = empty.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(converted, empty)

    def test_pyarrow_engine_lines_false(self):
        # GH 48893
        frame = pd.DataFrame({"a": [1, 2, 3]})
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        with pytest.raises(ValueError, match=msg):
            frame.convert_dtypes(dtype_backend="numpy")

    def test_pyarrow_backend_no_conversion(self):
        # GH#52872
        pytest.importorskip("pyarrow")
        frame = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
        untouched = frame.copy()
        # With every conversion switched off the frame must come back unchanged.
        converted = frame.convert_dtypes(
            convert_floating=False,
            convert_integer=False,
            convert_boolean=False,
            convert_string=False,
            dtype_backend="pyarrow",
        )
        tm.assert_frame_equal(converted, untouched)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        # GH 53648
        pytest.importorskip("pyarrow")
        arrow_frame = pd.DataFrame(range(2), dtype="int32[pyarrow]")
        converted = arrow_frame.convert_dtypes(dtype_backend="numpy_nullable")
        tm.assert_frame_equal(converted, pd.DataFrame(range(2), dtype="Int32"))

    def test_convert_dtypes_pyarrow_timestamp(self):
        # GH 54191
        pytest.importorskip("pyarrow")
        minutes = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min"))
        arrow_minutes = minutes.astype("timestamp[ms][pyarrow]")
        converted = arrow_minutes.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_series_equal(converted, arrow_minutes)

    def test_convert_dtypes_avoid_block_splitting(self):
        # GH#55341
        frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
        converted = frame.convert_dtypes(convert_integer=False)
        wanted = pd.DataFrame(
            {
                "a": [1, 2, 3],
                "b": [4, 5, 6],
                "c": pd.Series(["a"] * 3, dtype="string[python]"),
            }
        )
        tm.assert_frame_equal(converted, wanted)
        assert converted._mgr.nblocks == 2

    def test_convert_dtypes_from_arrow(self):
        # GH#56581
        frame = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
        converted = frame.convert_dtypes()
        wanted = frame.astype({"a": "string[python]"})
        tm.assert_frame_equal(converted, wanted)
|
@ -0,0 +1,64 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCopy:
|
||||
@pytest.mark.parametrize("attr", ["index", "columns"])
|
||||
def test_copy_index_name_checking(self, float_frame, attr):
|
||||
# don't want to be able to modify the index stored elsewhere after
|
||||
# making a copy
|
||||
ind = getattr(float_frame, attr)
|
||||
ind.name = None
|
||||
cp = float_frame.copy()
|
||||
getattr(cp, attr).name = "foo"
|
||||
assert getattr(float_frame, attr).name is None
|
||||
|
||||
@td.skip_copy_on_write_invalid_test
|
||||
def test_copy_cache(self):
|
||||
# GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
|
||||
df = DataFrame({"a": [1]})
|
||||
|
||||
df["x"] = [0]
|
||||
df["a"]
|
||||
|
||||
df.copy()
|
||||
|
||||
df["a"].values[0] = -1
|
||||
|
||||
tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]}))
|
||||
|
||||
df["y"] = [0]
|
||||
|
||||
assert df["a"].values[0] == -1
|
||||
tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]}))
|
||||
|
||||
def test_copy(self, float_frame, float_string_frame):
|
||||
cop = float_frame.copy()
|
||||
cop["E"] = cop["A"]
|
||||
assert "E" not in float_frame
|
||||
|
||||
# copy objects
|
||||
copy = float_string_frame.copy()
|
||||
assert copy._mgr is not float_string_frame._mgr
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_copy_consolidates(self):
|
||||
# GH#42477
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.random.default_rng(2).integers(0, 100, size=55),
|
||||
"b": np.random.default_rng(2).integers(0, 100, size=55),
|
||||
}
|
||||
)
|
||||
|
||||
for i in range(10):
|
||||
df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55)
|
||||
|
||||
assert len(df._mgr.blocks) == 11
|
||||
result = df.copy()
|
||||
assert len(result._mgr.blocks) == 1
|
@ -0,0 +1,39 @@
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCount:
|
||||
def test_count(self):
|
||||
# corner case
|
||||
frame = DataFrame()
|
||||
ct1 = frame.count(1)
|
||||
assert isinstance(ct1, Series)
|
||||
|
||||
ct2 = frame.count(0)
|
||||
assert isinstance(ct2, Series)
|
||||
|
||||
# GH#423
|
||||
df = DataFrame(index=range(10))
|
||||
result = df.count(1)
|
||||
expected = Series(0, index=df.index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame(columns=range(10))
|
||||
result = df.count(0)
|
||||
expected = Series(0, index=df.columns)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame()
|
||||
result = df.count()
|
||||
expected = Series(dtype="int64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_count_objects(self, float_string_frame):
|
||||
dm = DataFrame(float_string_frame._series)
|
||||
df = DataFrame(float_string_frame._series)
|
||||
|
||||
tm.assert_series_equal(dm.count(), df.count())
|
||||
tm.assert_series_equal(dm.count(1), df.count(1))
|
@ -0,0 +1,471 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCov:
|
||||
def test_cov(self, float_frame, float_string_frame):
|
||||
# min_periods no NAs (corner case)
|
||||
expected = float_frame.cov()
|
||||
result = float_frame.cov(min_periods=len(float_frame))
|
||||
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
result = float_frame.cov(min_periods=len(float_frame) + 1)
|
||||
assert isna(result.values).all()
|
||||
|
||||
# with NAs
|
||||
frame = float_frame.copy()
|
||||
frame.iloc[:5, frame.columns.get_loc("A")] = np.nan
|
||||
frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan
|
||||
result = frame.cov(min_periods=len(frame) - 8)
|
||||
expected = frame.cov()
|
||||
expected.loc["A", "B"] = np.nan
|
||||
expected.loc["B", "A"] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# regular
|
||||
result = frame.cov()
|
||||
expected = frame["A"].cov(frame["C"])
|
||||
tm.assert_almost_equal(result["A"]["C"], expected)
|
||||
|
||||
# fails on non-numeric types
|
||||
with pytest.raises(ValueError, match="could not convert string to float"):
|
||||
float_string_frame.cov()
|
||||
result = float_string_frame.cov(numeric_only=True)
|
||||
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Single column frame
|
||||
df = DataFrame(np.linspace(0.0, 1.0, 10))
|
||||
result = df.cov()
|
||||
expected = DataFrame(
|
||||
np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
df.loc[0] = np.nan
|
||||
result = df.cov()
|
||||
expected = DataFrame(
|
||||
np.cov(df.values[1:].T).reshape((1, 1)),
|
||||
index=df.columns,
|
||||
columns=df.columns,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
|
||||
def test_cov_ddof(self, test_ddof):
|
||||
# GH#34611
|
||||
np_array1 = np.random.default_rng(2).random(10)
|
||||
np_array2 = np.random.default_rng(2).random(10)
|
||||
df = DataFrame({0: np_array1, 1: np_array2})
|
||||
result = df.cov(ddof=test_ddof)
|
||||
expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
|
||||
expected = DataFrame(expected_np)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
|
||||
)
|
||||
def test_cov_nullable_integer(self, other_column):
|
||||
# https://github.com/pandas-dev/pandas/issues/33803
|
||||
data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
|
||||
result = data.cov()
|
||||
arr = np.array([[0.5, 0.5], [0.5, 1.0]])
|
||||
expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("numeric_only", [True, False])
|
||||
def test_cov_numeric_only(self, numeric_only):
|
||||
# when dtypes of pandas series are different
|
||||
# then ndarray will have dtype=object,
|
||||
# so it need to be properly handled
|
||||
df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
|
||||
expected = DataFrame(0.5, index=["a"], columns=["a"])
|
||||
if numeric_only:
|
||||
result = df.cov(numeric_only=numeric_only)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="could not convert string to float"):
|
||||
df.cov(numeric_only=numeric_only)
|
||||
|
||||
|
||||
class TestDataFrameCorr:
|
||||
# DataFrame.corr(), as opposed to DataFrame.corrwith
|
||||
|
||||
@pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
|
||||
def test_corr_scipy_method(self, float_frame, method):
|
||||
pytest.importorskip("scipy")
|
||||
float_frame.loc[float_frame.index[:5], "A"] = np.nan
|
||||
float_frame.loc[float_frame.index[5:10], "B"] = np.nan
|
||||
float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy()
|
||||
|
||||
correls = float_frame.corr(method=method)
|
||||
expected = float_frame["A"].corr(float_frame["C"], method=method)
|
||||
tm.assert_almost_equal(correls["A"]["C"], expected)
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
def test_corr_non_numeric(self, float_string_frame):
|
||||
with pytest.raises(ValueError, match="could not convert string to float"):
|
||||
float_string_frame.corr()
|
||||
result = float_string_frame.corr(numeric_only=True)
|
||||
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
|
||||
def test_corr_nooverlap(self, meth):
|
||||
# nothing in common
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 1.5, 1, np.nan, np.nan, np.nan],
|
||||
"B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
|
||||
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
rs = df.corr(meth)
|
||||
assert isna(rs.loc["A", "B"])
|
||||
assert isna(rs.loc["B", "A"])
|
||||
assert rs.loc["A", "A"] == 1
|
||||
assert rs.loc["B", "B"] == 1
|
||||
assert isna(rs.loc["C", "C"])
|
||||
|
||||
@pytest.mark.parametrize("meth", ["pearson", "spearman"])
|
||||
def test_corr_constant(self, meth):
|
||||
# constant --> all NA
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 1, 1, np.nan, np.nan, np.nan],
|
||||
"B": [np.nan, np.nan, np.nan, 1, 1, 1],
|
||||
}
|
||||
)
|
||||
rs = df.corr(meth)
|
||||
assert isna(rs.values).all()
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
|
||||
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
|
||||
def test_corr_int_and_boolean(self, meth):
|
||||
# when dtypes of pandas series are different
|
||||
# then ndarray will have dtype=object,
|
||||
# so it need to be properly handled
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame({"a": [True, False], "b": [1, 0]})
|
||||
|
||||
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
|
||||
result = df.corr(meth)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("method", ["cov", "corr"])
|
||||
def test_corr_cov_independent_index_column(self, method):
|
||||
# GH#14617
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4),
|
||||
columns=list("abcd"),
|
||||
)
|
||||
result = getattr(df, method)()
|
||||
assert result.index is not result.columns
|
||||
assert result.index.equals(result.columns)
|
||||
|
||||
def test_corr_invalid_method(self):
|
||||
# GH#22298
|
||||
df = DataFrame(np.random.default_rng(2).normal(size=(10, 2)))
|
||||
msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.corr(method="____")
|
||||
|
||||
def test_corr_int(self):
|
||||
# dtypes other than float64 GH#1761
|
||||
df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
|
||||
|
||||
df.cov()
|
||||
df.corr()
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"other_column",
|
||||
[pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
|
||||
)
|
||||
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
|
||||
def test_corr_nullable_integer(self, nullable_column, other_column, method):
|
||||
# https://github.com/pandas-dev/pandas/issues/33803
|
||||
pytest.importorskip("scipy")
|
||||
data = DataFrame({"a": nullable_column, "b": other_column})
|
||||
result = data.corr(method=method)
|
||||
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write):
|
||||
# Check that corr does not lead to incorrect entries in item_cache
|
||||
|
||||
df = DataFrame({"A": range(10)})
|
||||
df["B"] = range(10)[::-1]
|
||||
|
||||
ser = df["A"] # populate item_cache
|
||||
assert len(df._mgr.arrays) == 2 # i.e. 2 blocks
|
||||
|
||||
_ = df.corr(numeric_only=True)
|
||||
|
||||
if using_copy_on_write:
|
||||
ser.iloc[0] = 99
|
||||
assert df.loc[0, "A"] == 0
|
||||
else:
|
||||
# Check that the corr didn't break link between ser and df
|
||||
ser.values[0] = 99
|
||||
assert df.loc[0, "A"] == 99
|
||||
if not warn_copy_on_write:
|
||||
assert df["A"] is ser
|
||||
assert df.values[0, 0] == 99
|
||||
|
||||
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
|
||||
def test_corr_for_constant_columns(self, length):
|
||||
# GH: 37448
|
||||
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
|
||||
result = df.corr()
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_calc_corr_small_numbers(self):
|
||||
# GH: 37452
|
||||
df = DataFrame(
|
||||
{"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
|
||||
)
|
||||
result = df.corr()
|
||||
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
|
||||
def test_corr_min_periods_greater_than_length(self, method):
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame({"A": [1, 2], "B": [1, 2]})
|
||||
result = df.corr(method=method, min_periods=3)
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
|
||||
@pytest.mark.parametrize("numeric_only", [True, False])
|
||||
def test_corr_numeric_only(self, meth, numeric_only):
|
||||
# when dtypes of pandas series are different
|
||||
# then ndarray will have dtype=object,
|
||||
# so it need to be properly handled
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
|
||||
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
|
||||
if numeric_only:
|
||||
result = df.corr(meth, numeric_only=numeric_only)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="could not convert string to float"):
|
||||
df.corr(meth, numeric_only=numeric_only)
|
||||
|
||||
|
||||
class TestDataFrameCorrWith:
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"float64",
|
||||
"Float64",
|
||||
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
],
|
||||
)
|
||||
def test_corrwith(self, datetime_frame, dtype):
|
||||
datetime_frame = datetime_frame.astype(dtype)
|
||||
|
||||
a = datetime_frame
|
||||
noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index)
|
||||
|
||||
b = datetime_frame.add(noise, axis=0)
|
||||
|
||||
# make sure order does not matter
|
||||
b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
|
||||
del b["B"]
|
||||
|
||||
colcorr = a.corrwith(b, axis=0)
|
||||
tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))
|
||||
|
||||
rowcorr = a.corrwith(b, axis=1)
|
||||
tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
|
||||
|
||||
dropped = a.corrwith(b, axis=0, drop=True)
|
||||
tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
|
||||
assert "B" not in dropped
|
||||
|
||||
dropped = a.corrwith(b, axis=1, drop=True)
|
||||
assert a.index[-1] not in dropped.index
|
||||
|
||||
# non time-series data
|
||||
index = ["a", "b", "c", "d", "e"]
|
||||
columns = ["one", "two", "three", "four"]
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((5, 4)),
|
||||
index=index,
|
||||
columns=columns,
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 4)),
|
||||
index=index[:4],
|
||||
columns=columns,
|
||||
)
|
||||
correls = df1.corrwith(df2, axis=1)
|
||||
for row in index[:4]:
|
||||
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
|
||||
|
||||
def test_corrwith_with_objects(self, using_infer_string):
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=10, freq="B"),
|
||||
)
|
||||
df2 = df1.copy()
|
||||
cols = ["A", "B", "C", "D"]
|
||||
|
||||
df1["obj"] = "foo"
|
||||
df2["obj"] = "bar"
|
||||
|
||||
if using_infer_string:
|
||||
import pyarrow as pa
|
||||
|
||||
with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
|
||||
df1.corrwith(df2)
|
||||
else:
|
||||
with pytest.raises(TypeError, match="Could not convert"):
|
||||
df1.corrwith(df2)
|
||||
result = df1.corrwith(df2, numeric_only=True)
|
||||
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
with pytest.raises(TypeError, match="unsupported operand type"):
|
||||
df1.corrwith(df2, axis=1)
|
||||
result = df1.corrwith(df2, axis=1, numeric_only=True)
|
||||
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_corrwith_series(self, datetime_frame):
|
||||
result = datetime_frame.corrwith(datetime_frame["A"])
|
||||
expected = datetime_frame.apply(datetime_frame["A"].corr)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_corrwith_matches_corrcoef(self):
|
||||
df1 = DataFrame(np.arange(10000), columns=["a"])
|
||||
df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
|
||||
c1 = df1.corrwith(df2)["a"]
|
||||
c2 = np.corrcoef(df1["a"], df2["a"])[0][1]
|
||||
|
||||
tm.assert_almost_equal(c1, c2)
|
||||
assert c1 < 1
|
||||
|
||||
@pytest.mark.parametrize("numeric_only", [True, False])
|
||||
def test_corrwith_mixed_dtypes(self, numeric_only):
|
||||
# GH#18570
|
||||
df = DataFrame(
|
||||
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
|
||||
)
|
||||
s = Series([0, 6, 7, 3])
|
||||
if numeric_only:
|
||||
result = df.corrwith(s, numeric_only=numeric_only)
|
||||
corrs = [df["a"].corr(s), df["b"].corr(s)]
|
||||
expected = Series(data=corrs, index=["a", "b"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="could not convert string to float",
|
||||
):
|
||||
df.corrwith(s, numeric_only=numeric_only)
|
||||
|
||||
def test_corrwith_index_intersection(self):
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
|
||||
)
|
||||
|
||||
result = df1.corrwith(df2, drop=True).index.sort_values()
|
||||
expected = df1.columns.intersection(df2.columns).sort_values()
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
def test_corrwith_index_union(self):
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
|
||||
)
|
||||
|
||||
result = df1.corrwith(df2, drop=False).index.sort_values()
|
||||
expected = df1.columns.union(df2.columns).sort_values()
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
def test_corrwith_dup_cols(self):
|
||||
# GH#21925
|
||||
df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
|
||||
df2 = df1.copy()
|
||||
df2 = pd.concat((df2, df2[0]), axis=1)
|
||||
|
||||
result = df1.corrwith(df2)
|
||||
expected = Series(np.ones(4), index=[0, 0, 1, 2])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_corr_numerical_instabilities(self):
|
||||
# GH#45640
|
||||
df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
|
||||
result = df.corr()
|
||||
expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
|
||||
tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)
|
||||
|
||||
def test_corrwith_spearman(self):
|
||||
# GH#21925
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
|
||||
result = df.corrwith(df**2, method="spearman")
|
||||
expected = Series(np.ones(len(result)))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_corrwith_kendall(self):
|
||||
# GH#21925
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
|
||||
result = df.corrwith(df**2, method="kendall")
|
||||
expected = Series(np.ones(len(result)))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_corrwith_spearman_with_tied_data(self):
|
||||
# GH#48826
|
||||
pytest.importorskip("scipy")
|
||||
df1 = DataFrame(
|
||||
{
|
||||
"A": [1, np.nan, 7, 8],
|
||||
"B": [False, True, True, False],
|
||||
"C": [10, 4, 9, 3],
|
||||
}
|
||||
)
|
||||
df2 = df1[["B", "C"]]
|
||||
result = (df1 + 1).corrwith(df2.B, method="spearman")
|
||||
expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df_bool = DataFrame(
|
||||
{"A": [True, True, False, False], "B": [True, False, False, True]}
|
||||
)
|
||||
ser_bool = Series([True, True, False, True])
|
||||
result = df_bool.corrwith(ser_bool)
|
||||
expected = Series([0.57735, 0.57735], index=["A", "B"])
|
||||
tm.assert_series_equal(result, expected)
|
@ -0,0 +1,417 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDescribe:
|
||||
def test_describe_bool_in_mixed_frame(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"string_data": ["a", "b", "c", "d", "e"],
|
||||
"bool_data": [True, True, False, False, False],
|
||||
"int_data": [10, 20, 30, 40, 50],
|
||||
}
|
||||
)
|
||||
|
||||
# Integer data are included in .describe() output,
|
||||
# Boolean and string data are not.
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Top value is a boolean value that is False
|
||||
result = df.describe(include=["bool"])
|
||||
|
||||
expected = DataFrame(
|
||||
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_empty_object(self):
|
||||
# GH#27183
|
||||
df = DataFrame({"A": [None, None]}, dtype=object)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"A": [0, 0, np.nan, np.nan]},
|
||||
dtype=object,
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.iloc[:0].describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_bool_frame(self):
|
||||
# GH#13891
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data_1": [False, False, True, True],
|
||||
"bool_data_2": [False, True, True, True],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data": [False, False, True, True, False],
|
||||
"int_data": [0, 1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_categorical(self):
|
||||
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
|
||||
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
|
||||
cat_labels = Categorical(labels, labels)
|
||||
|
||||
df = df.sort_values(by=["value"], ascending=True)
|
||||
df["value_group"] = pd.cut(
|
||||
df.value, range(0, 10500, 500), right=False, labels=cat_labels
|
||||
)
|
||||
cat = df
|
||||
|
||||
# Categoricals should not show up together with numerical columns
|
||||
result = cat.describe()
|
||||
assert len(result.columns) == 1
|
||||
|
||||
# In a frame, describe() for the cat should be the same as for string
|
||||
# arrays (count, unique, top, freq)
|
||||
|
||||
cat = Categorical(
|
||||
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
|
||||
)
|
||||
s = Series(cat)
|
||||
result = s.describe()
|
||||
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
cat = Series(Categorical(["a", "b", "c", "c"]))
|
||||
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
|
||||
result = df3.describe()
|
||||
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
|
||||
|
||||
def test_describe_empty_categorical_column(self):
|
||||
# GH#26397
|
||||
# Ensure the index of an empty categorical DataFrame column
|
||||
# also contains (count, unique, top, freq)
|
||||
df = DataFrame({"empty_col": Categorical([])})
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"empty_col": [0, 0, np.nan, np.nan]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
dtype="object",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# ensure NaN, not None
|
||||
assert np.isnan(result.iloc[2, 0])
|
||||
assert np.isnan(result.iloc[3, 0])
|
||||
|
||||
def test_describe_categorical_columns(self):
|
||||
# GH#11558
|
||||
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
|
||||
df = DataFrame(
|
||||
{
|
||||
"int1": [10, 20, 30, 40, 50],
|
||||
"int2": [10, 20, 30, 40, 50],
|
||||
"obj": ["A", 0, None, "X", 1],
|
||||
},
|
||||
columns=columns,
|
||||
)
|
||||
result = df.describe()
|
||||
|
||||
exp_columns = pd.CategoricalIndex(
|
||||
["int1", "int2"],
|
||||
categories=["int1", "int2", "obj"],
|
||||
ordered=True,
|
||||
name="XXX",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
|
||||
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
columns=exp_columns,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
|
||||
|
||||
def test_describe_datetime_columns(self):
|
||||
columns = pd.DatetimeIndex(
|
||||
["2011-01-01", "2011-02-01", "2011-03-01"],
|
||||
freq="MS",
|
||||
tz="US/Eastern",
|
||||
name="XXX",
|
||||
)
|
||||
df = DataFrame(
|
||||
{
|
||||
0: [10, 20, 30, 40, 50],
|
||||
1: [10, 20, 30, 40, 50],
|
||||
2: ["A", 0, None, "X", 1],
|
||||
}
|
||||
)
|
||||
df.columns = columns
|
||||
result = df.describe()
|
||||
|
||||
exp_columns = pd.DatetimeIndex(
|
||||
["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
|
||||
1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
expected.columns = exp_columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert result.columns.freq == "MS"
|
||||
assert result.columns.tz == expected.columns.tz
|
||||
|
||||
def test_describe_timedelta_values(self):
|
||||
# GH#6145
|
||||
t1 = pd.timedelta_range("1 days", freq="D", periods=5)
|
||||
t2 = pd.timedelta_range("1 hours", freq="h", periods=5)
|
||||
df = DataFrame({"t1": t1, "t2": t2})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"t1": [
|
||||
5,
|
||||
pd.Timedelta("3 days"),
|
||||
df.iloc[:, 0].std(),
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
pd.Timedelta("4 days"),
|
||||
pd.Timedelta("5 days"),
|
||||
],
|
||||
"t2": [
|
||||
5,
|
||||
pd.Timedelta("3 hours"),
|
||||
df.iloc[:, 1].std(),
|
||||
pd.Timedelta("1 hours"),
|
||||
pd.Timedelta("2 hours"),
|
||||
pd.Timedelta("3 hours"),
|
||||
pd.Timedelta("4 hours"),
|
||||
pd.Timedelta("5 hours"),
|
||||
],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
|
||||
result = df.describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
exp_repr = (
|
||||
" t1 t2\n"
|
||||
"count 5 5\n"
|
||||
"mean 3 days 00:00:00 0 days 03:00:00\n"
|
||||
"std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
|
||||
"min 1 days 00:00:00 0 days 01:00:00\n"
|
||||
"25% 2 days 00:00:00 0 days 02:00:00\n"
|
||||
"50% 3 days 00:00:00 0 days 03:00:00\n"
|
||||
"75% 4 days 00:00:00 0 days 04:00:00\n"
|
||||
"max 5 days 00:00:00 0 days 05:00:00"
|
||||
)
|
||||
assert repr(result) == exp_repr
|
||||
|
||||
def test_describe_tz_values(self, tz_naive_fixture):
|
||||
# GH#21332
|
||||
tz = tz_naive_fixture
|
||||
s1 = Series(range(5))
|
||||
start = Timestamp(2018, 1, 1)
|
||||
end = Timestamp(2018, 1, 5)
|
||||
s2 = Series(date_range(start, end, tz=tz))
|
||||
df = DataFrame({"s1": s1, "s2": s2})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
|
||||
"s2": [
|
||||
5,
|
||||
Timestamp(2018, 1, 3).tz_localize(tz),
|
||||
start.tz_localize(tz),
|
||||
s2[1],
|
||||
s2[2],
|
||||
s2[3],
|
||||
end.tz_localize(tz),
|
||||
np.nan,
|
||||
],
|
||||
},
|
||||
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
|
||||
)
|
||||
result = df.describe(include="all")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_datetime_is_numeric_includes_datetime(self):
|
||||
df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [
|
||||
3,
|
||||
Timestamp("2012-01-02"),
|
||||
Timestamp("2012-01-01"),
|
||||
Timestamp("2012-01-01T12:00:00"),
|
||||
Timestamp("2012-01-02"),
|
||||
Timestamp("2012-01-02T12:00:00"),
|
||||
Timestamp("2012-01-03"),
|
||||
np.nan,
|
||||
],
|
||||
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
|
||||
},
|
||||
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_tz_values2(self):
|
||||
tz = "CET"
|
||||
s1 = Series(range(5))
|
||||
start = Timestamp(2018, 1, 1)
|
||||
end = Timestamp(2018, 1, 5)
|
||||
s2 = Series(date_range(start, end, tz=tz))
|
||||
df = DataFrame({"s1": s1, "s2": s2})
|
||||
|
||||
s1_ = s1.describe()
|
||||
s2_ = s2.describe()
|
||||
idx = [
|
||||
"count",
|
||||
"mean",
|
||||
"min",
|
||||
"25%",
|
||||
"50%",
|
||||
"75%",
|
||||
"max",
|
||||
"std",
|
||||
]
|
||||
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
|
||||
idx, copy=False
|
||||
)
|
||||
|
||||
result = df.describe(include="all")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_percentiles_integer_idx(self):
|
||||
# GH#26660
|
||||
df = DataFrame({"x": [1]})
|
||||
pct = np.linspace(0, 1, 10 + 1)
|
||||
result = df.describe(percentiles=pct)
|
||||
|
||||
expected = DataFrame(
|
||||
{"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]},
|
||||
index=[
|
||||
"count",
|
||||
"mean",
|
||||
"std",
|
||||
"min",
|
||||
"0%",
|
||||
"10%",
|
||||
"20%",
|
||||
"30%",
|
||||
"40%",
|
||||
"50%",
|
||||
"60%",
|
||||
"70%",
|
||||
"80%",
|
||||
"90%",
|
||||
"100%",
|
||||
"max",
|
||||
],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_does_not_raise_error_for_dictlike_elements(self):
|
||||
# GH#32409
|
||||
df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
|
||||
expected = DataFrame(
|
||||
{"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
|
||||
)
|
||||
result = df.describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
|
||||
def test_describe_when_include_all_exclude_not_allowed(self, exclude):
|
||||
"""
|
||||
When include is 'all', then setting exclude != None is not allowed.
|
||||
"""
|
||||
df = DataFrame({"x": [1], "y": [2], "z": [3]})
|
||||
msg = "exclude must be None when include is 'all'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.describe(include="all", exclude=exclude)
|
||||
|
||||
def test_describe_with_duplicate_columns(self):
|
||||
df = DataFrame(
|
||||
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=["bar", "a", "a"],
|
||||
dtype="float64",
|
||||
)
|
||||
result = df.describe()
|
||||
ser = df.iloc[:, 0].describe()
|
||||
expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ea_with_na(self, any_numeric_ea_dtype):
|
||||
# GH#48778
|
||||
|
||||
df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
dtype="Float64",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_exclude_pa_dtype(self):
|
||||
# GH#52570
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())),
|
||||
"b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())),
|
||||
"c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())),
|
||||
}
|
||||
)
|
||||
result = df.describe(
|
||||
include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32())
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
dtype=pd.ArrowDtype(pa.float64()),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,308 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDiff:
|
||||
def test_diff_requires_integer(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
|
||||
with pytest.raises(ValueError, match="periods must be an integer"):
|
||||
df.diff(1.5)
|
||||
|
||||
# GH#44572 np.int64 is accepted
|
||||
@pytest.mark.parametrize("num", [1, np.int64(1)])
|
||||
def test_diff(self, datetime_frame, num):
|
||||
df = datetime_frame
|
||||
the_diff = df.diff(num)
|
||||
|
||||
expected = df["A"] - df["A"].shift(num)
|
||||
tm.assert_series_equal(the_diff["A"], expected)
|
||||
|
||||
def test_diff_int_dtype(self):
|
||||
# int dtype
|
||||
a = 10_000_000_000_000_000
|
||||
b = a + 1
|
||||
ser = Series([a, b])
|
||||
|
||||
rs = DataFrame({"s": ser}).diff()
|
||||
assert rs.s[1] == 1
|
||||
|
||||
def test_diff_mixed_numeric(self, datetime_frame):
|
||||
# mixed numeric
|
||||
tf = datetime_frame.astype("float32")
|
||||
the_diff = tf.diff(1)
|
||||
tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))
|
||||
|
||||
def test_diff_axis1_nonconsolidated(self):
|
||||
# GH#10907
|
||||
df = DataFrame({"y": Series([2]), "z": Series([3])})
|
||||
df.insert(0, "x", 1)
|
||||
result = df.diff(axis=1)
|
||||
expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_timedelta64_with_nat(self):
|
||||
# GH#32441
|
||||
arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
|
||||
arr[:, 0] = np.timedelta64("NaT", "ns")
|
||||
|
||||
df = DataFrame(arr)
|
||||
result = df.diff(1, axis=0)
|
||||
|
||||
expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]})
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
result = df.diff(0)
|
||||
expected = df - df
|
||||
assert expected[0].isna().all()
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
result = df.diff(-1, axis=1)
|
||||
expected = df * np.nan
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_diff_datetime_axis0_with_nat(self, tz, unit):
|
||||
# GH#32441
|
||||
dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit)
|
||||
ser = Series(dti)
|
||||
|
||||
df = ser.to_frame()
|
||||
|
||||
result = df.diff()
|
||||
ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit(
|
||||
unit
|
||||
)
|
||||
expected = Series(ex_index).to_frame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_diff_datetime_with_nat_zero_periods(self, tz):
|
||||
# diff on NaT values should give NaT, not timedelta64(0)
|
||||
dti = date_range("2016-01-01", periods=4, tz=tz)
|
||||
ser = Series(dti)
|
||||
df = ser.to_frame().copy()
|
||||
|
||||
df[1] = ser.copy()
|
||||
|
||||
df.iloc[:, 0] = pd.NaT
|
||||
|
||||
expected = df - df
|
||||
assert expected[0].isna().all()
|
||||
|
||||
result = df.diff(0, axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.diff(0, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_diff_datetime_axis0(self, tz):
|
||||
# GH#18578
|
||||
df = DataFrame(
|
||||
{
|
||||
0: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
1: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
}
|
||||
)
|
||||
|
||||
result = df.diff(axis=0)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: pd.TimedeltaIndex(["NaT", "1 days"]),
|
||||
1: pd.TimedeltaIndex(["NaT", "1 days"]),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_diff_datetime_axis1(self, tz):
|
||||
# GH#18578
|
||||
df = DataFrame(
|
||||
{
|
||||
0: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
1: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
}
|
||||
)
|
||||
|
||||
result = df.diff(axis=1)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: pd.TimedeltaIndex(["NaT", "NaT"]),
|
||||
1: pd.TimedeltaIndex(["0 days", "0 days"]),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_timedelta(self, unit):
|
||||
# GH#4533
|
||||
df = DataFrame(
|
||||
{
|
||||
"time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
|
||||
"value": [1.0, 2.0],
|
||||
}
|
||||
)
|
||||
df["time"] = df["time"].dt.as_unit(unit)
|
||||
|
||||
res = df.diff()
|
||||
exp = DataFrame(
|
||||
[[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
|
||||
)
|
||||
exp["time"] = exp["time"].dt.as_unit(unit)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_diff_mixed_dtype(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
|
||||
df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
|
||||
|
||||
result = df.diff()
|
||||
assert result[0].dtype == np.float64
|
||||
|
||||
def test_diff_neg_n(self, datetime_frame):
|
||||
rs = datetime_frame.diff(-1)
|
||||
xp = datetime_frame - datetime_frame.shift(-1)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_float_n(self, datetime_frame):
|
||||
rs = datetime_frame.diff(1.0)
|
||||
xp = datetime_frame.diff(1)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_axis(self):
|
||||
# GH#9727
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
|
||||
tm.assert_frame_equal(
|
||||
df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
|
||||
)
|
||||
|
||||
def test_diff_period(self):
|
||||
# GH#32995 Don't pass an incorrect axis
|
||||
pi = date_range("2016-01-01", periods=3).to_period("D")
|
||||
df = DataFrame({"A": pi})
|
||||
|
||||
result = df.diff(1, axis=1)
|
||||
|
||||
expected = (df - pd.NaT).astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_axis1_mixed_dtypes(self):
|
||||
# GH#32995 operate column-wise when we have mixed dtypes and axis=1
|
||||
df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
|
||||
|
||||
expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2})
|
||||
|
||||
result = df.diff(axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH#21437 mixed-float-dtypes
|
||||
df = DataFrame(
|
||||
{"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
|
||||
)
|
||||
result = df.diff(axis=1)
|
||||
expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_axis1_mixed_dtypes_large_periods(self):
|
||||
# GH#32995 operate column-wise when we have mixed dtypes and axis=1
|
||||
df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
|
||||
|
||||
expected = df * np.nan
|
||||
|
||||
result = df.diff(axis=1, periods=3)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_axis1_mixed_dtypes_negative_periods(self):
|
||||
# GH#32995 operate column-wise when we have mixed dtypes and axis=1
|
||||
df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
|
||||
|
||||
expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan})
|
||||
|
||||
result = df.diff(axis=1, periods=-1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_sparse(self):
|
||||
# GH#28813 .diff() should work for sparse dataframes as well
|
||||
sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")
|
||||
|
||||
result = sparse_df.diff()
|
||||
expected = DataFrame(
|
||||
[[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0)
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"axis,expected",
|
||||
[
|
||||
(
|
||||
0,
|
||||
DataFrame(
|
||||
{
|
||||
"a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
|
||||
"b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
|
||||
"c": np.repeat(np.nan, 8),
|
||||
"d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
|
||||
},
|
||||
dtype="Int64",
|
||||
),
|
||||
),
|
||||
(
|
||||
1,
|
||||
DataFrame(
|
||||
{
|
||||
"a": np.repeat(np.nan, 8),
|
||||
"b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
|
||||
"c": np.repeat(np.nan, 8),
|
||||
"d": np.repeat(np.nan, 8),
|
||||
},
|
||||
dtype="Int64",
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_diff_integer_na(self, axis, expected):
|
||||
# GH#24171 IntegerNA Support for DataFrame.diff()
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.repeat([0, 1, np.nan, 2], 2),
|
||||
"b": np.tile([0, 1, np.nan, 2], 2),
|
||||
"c": np.repeat(np.nan, 8),
|
||||
"d": np.arange(1, 9) ** 2,
|
||||
},
|
||||
dtype="Int64",
|
||||
)
|
||||
|
||||
# Test case for default behaviour of diff
|
||||
result = df.diff(axis=axis)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_readonly(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/35559
|
||||
arr = np.random.default_rng(2).standard_normal((5, 2))
|
||||
arr.flags.writeable = False
|
||||
df = DataFrame(arr)
|
||||
result = df.diff()
|
||||
expected = DataFrame(np.array(df)).diff()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_diff_all_int_dtype(self, any_int_numpy_dtype):
|
||||
# GH 14773
|
||||
df = DataFrame(range(5))
|
||||
df = df.astype(any_int_numpy_dtype)
|
||||
result = df.diff()
|
||||
expected_dtype = (
|
||||
"float32" if any_int_numpy_dtype in ("int8", "int16") else "float64"
|
||||
)
|
||||
expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,155 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class DotSharedTests:
    """Dot-product tests shared by the Series and DataFrame suites.

    The fixtures and ``reduced_dim_assert`` defined here only raise
    NotImplementedError; concrete subclasses are expected to override them.
    """

    @pytest.fixture
    def obj(self):
        raise NotImplementedError

    @pytest.fixture
    def other(self) -> DataFrame:
        """
        A DataFrame indexed such that obj.dot(other) is a valid operation
        """
        raise NotImplementedError

    @pytest.fixture
    def expected(self, obj, other) -> DataFrame:
        """
        What obj.dot(other) is expected to return
        """
        raise NotImplementedError

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion for results that have one dimension fewer than self.obj
        """
        raise NotImplementedError

    def test_dot_equiv_values_dot(self, obj, other, expected):
        # `expected` is constructed from obj.values.dot(other.values)
        tm.assert_equal(obj.dot(other), expected)

    def test_dot_2d_ndarray(self, obj, other, expected):
        # With an ndarray argument the values match, but the
        # index/columns may not.
        product = obj.dot(other.values)
        assert np.all(product == expected.values)

    def test_dot_1d_ndarray(self, obj, expected):
        # A 1-D array of the correct length is accepted.
        if obj.ndim == 2:
            row = obj.iloc[0]
        else:
            row = obj

        from_array = obj.dot(row.values)
        from_labelled = obj.dot(row)
        self.reduced_dim_assert(from_array, from_labelled)

    def test_dot_series(self, obj, other, expected):
        # Series argument
        product = obj.dot(other["1"])
        self.reduced_dim_assert(product, expected["1"])

    def test_dot_series_alignment(self, obj, other, expected):
        # Series argument with its rows in reverse order
        reversed_column = other.iloc[::-1]["1"]
        self.reduced_dim_assert(obj.dot(reversed_column), expected["1"])

    def test_dot_aligns(self, obj, other, expected):
        # Index alignment: `other` is passed with its rows reversed
        reversed_other = other.iloc[::-1]
        tm.assert_equal(obj.dot(reversed_other), expected)

    def test_dot_shape_mismatch(self, obj):
        # exception raised is of type Exception
        with pytest.raises(Exception, match="Dot product shape mismatch"):
            obj.dot(obj.values[:3])

    def test_dot_misaligned(self, obj, other):
        with pytest.raises(ValueError, match="matrices are not aligned"):
            obj.dot(other.T)
|
||||
|
||||
|
||||
class TestSeriesDot(DotSharedTests):
    """Runs the shared dot tests with a Series as ``obj``."""

    @pytest.fixture
    def obj(self):
        values = np.random.default_rng(2).standard_normal(4)
        return Series(values, index=["p", "q", "r", "s"])

    @pytest.fixture
    def other(self):
        # Built as 3x4 and transposed, so its index lines up with obj's index.
        values = np.random.default_rng(2).standard_normal((3, 4))
        frame = DataFrame(
            values, index=["1", "2", "3"], columns=["p", "q", "r", "s"]
        )
        return frame.T

    @pytest.fixture
    def expected(self, obj, other):
        product = np.dot(obj.values, other.values)
        return Series(product, index=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion for results that have one dimension fewer than self.obj
        """
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameDot(DotSharedTests):
    """Runs the shared dot tests with a DataFrame as ``obj``."""

    @pytest.fixture
    def obj(self):
        values = np.random.default_rng(2).standard_normal((3, 4))
        return DataFrame(
            values, index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )

    @pytest.fixture
    def other(self):
        values = np.random.default_rng(2).standard_normal((4, 2))
        return DataFrame(values, index=["p", "q", "r", "s"], columns=["1", "2"])

    @pytest.fixture
    def expected(self, obj, other):
        product = np.dot(obj.values, other.values)
        return DataFrame(product, index=obj.index, columns=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Assertion for results that have one dimension fewer than self.obj
        """
        # Values and index must match; the result itself must be unnamed.
        tm.assert_series_equal(result, expected, check_names=False)
        assert result.name is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype,exp_dtype",
    [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")],
)
def test_arrow_dtype(dtype, exp_dtype):
    """int32 frame dotted with an identity matrix of ``dtype`` yields ``exp_dtype``.

    Skipped when pyarrow is not installed.
    """
    pytest.importorskip("pyarrow")

    labels = ["a", "b"]
    rows = [[1, 2], [3, 4], [5, 6]]
    left = DataFrame(rows, columns=labels, dtype="int32")
    identity = DataFrame([[1, 0], [0, 1]], index=labels, dtype=dtype)

    expected = DataFrame(rows, dtype=exp_dtype)
    tm.assert_frame_equal(left.dot(identity), expected)
|
@ -0,0 +1,546 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import PerformanceWarning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "msg,labels,level",
    [
        (r"labels \[4\] not found in level", 4, "a"),
        (r"labels \[7\] not found in level", 7, "b"),
    ],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    """Dropping a label that is absent from the named level raises KeyError (GH 8594)."""
    levels = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    series = Series([10, 20, 30], index=levels)
    frame = DataFrame([10, 20, 30], index=levels)

    # Same expectation for the Series and the DataFrame, in that order.
    for container in (series, frame):
        with pytest.raises(KeyError, match=msg):
            container.drop(labels, level=level)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
def test_drop_errors_ignore(labels, level):
    """With errors="ignore", dropping an absent level label leaves the object unchanged (GH 8594)."""
    levels = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])

    series = Series([10, 20, 30], index=levels)
    series_after = series.drop(labels, level=level, errors="ignore")
    tm.assert_series_equal(series, series_after)

    frame = DataFrame([10, 20, 30], index=levels)
    frame_after = frame.drop(labels, level=level, errors="ignore")
    tm.assert_frame_equal(frame, frame_after)
|
||||
|
||||
|
||||
def test_drop_with_non_unique_datetime_index_and_invalid_keys():
    """Dropping absent labels from a non-unique datetime index raises KeyError (GH 30399)."""
    hourly = pd.date_range("2012", freq="h", periods=5)
    unique_frame = DataFrame(
        np.random.default_rng(2).standard_normal((5, 3)),
        columns=["a", "b", "c"],
        index=hourly,
    )
    # Repeating position 2 makes the datetime index non-unique.
    frame = unique_frame.iloc[[0, 2, 2, 3]].copy()

    with pytest.raises(KeyError, match="not found in axis"):
        # "a" and "b" are column names, not index labels.
        frame.drop(["a", "b"])
|
||||
|
||||
|
||||
class TestDataFrameDrop:
|
||||
def test_drop_names(self):
|
||||
df = DataFrame(
|
||||
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
|
||||
index=["a", "b", "c"],
|
||||
columns=["d", "e", "f"],
|
||||
)
|
||||
df.index.name, df.columns.name = "first", "second"
|
||||
df_dropped_b = df.drop("b")
|
||||
df_dropped_e = df.drop("e", axis=1)
|
||||
df_inplace_b, df_inplace_e = df.copy(), df.copy()
|
||||
return_value = df_inplace_b.drop("b", inplace=True)
|
||||
assert return_value is None
|
||||
return_value = df_inplace_e.drop("e", axis=1, inplace=True)
|
||||
assert return_value is None
|
||||
for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
|
||||
assert obj.index.name == "first"
|
||||
assert obj.columns.name == "second"
|
||||
assert list(df.columns) == ["d", "e", "f"]
|
||||
|
||||
msg = r"\['g'\] not found in axis"
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
df.drop(["g"])
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
df.drop(["g"], axis=1)
|
||||
|
||||
# errors = 'ignore'
|
||||
dropped = df.drop(["g"], errors="ignore")
|
||||
expected = Index(["a", "b", "c"], name="first")
|
||||
tm.assert_index_equal(dropped.index, expected)
|
||||
|
||||
dropped = df.drop(["b", "g"], errors="ignore")
|
||||
expected = Index(["a", "c"], name="first")
|
||||
tm.assert_index_equal(dropped.index, expected)
|
||||
|
||||
dropped = df.drop(["g"], axis=1, errors="ignore")
|
||||
expected = Index(["d", "e", "f"], name="second")
|
||||
tm.assert_index_equal(dropped.columns, expected)
|
||||
|
||||
dropped = df.drop(["d", "g"], axis=1, errors="ignore")
|
||||
expected = Index(["e", "f"], name="second")
|
||||
tm.assert_index_equal(dropped.columns, expected)
|
||||
|
||||
# GH 16398
|
||||
dropped = df.drop([], errors="ignore")
|
||||
expected = Index(["a", "b", "c"], name="first")
|
||||
tm.assert_index_equal(dropped.index, expected)
|
||||
|
||||
def test_drop(self):
|
||||
simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
|
||||
tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
|
||||
tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
|
||||
tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
|
||||
tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])
|
||||
|
||||
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
|
||||
simple.drop(5)
|
||||
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
|
||||
simple.drop("C", axis=1)
|
||||
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
|
||||
simple.drop([1, 5])
|
||||
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
|
||||
simple.drop(["A", "C"], axis=1)
|
||||
|
||||
# GH 42881
|
||||
with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"):
|
||||
simple.drop(["C", "D", "F"], axis=1)
|
||||
|
||||
# errors = 'ignore'
|
||||
tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
|
||||
tm.assert_frame_equal(
|
||||
simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
|
||||
)
|
||||
tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
|
||||
tm.assert_frame_equal(
|
||||
simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
|
||||
)
|
||||
|
||||
# non-unique - wheee!
|
||||
nu_df = DataFrame(
|
||||
list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
|
||||
)
|
||||
tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
|
||||
tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
|
||||
tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
|
||||
|
||||
nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
|
||||
nu_df.columns = list("abc")
|
||||
tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
|
||||
tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
|
||||
|
||||
# inplace cache issue
|
||||
# GH#5628
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
|
||||
)
|
||||
expected = df[~(df.b > 0)]
|
||||
return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_drop_multiindex_not_lexsorted(self):
|
||||
# GH#11640
|
||||
|
||||
# define the lexsorted version
|
||||
lexsorted_mi = MultiIndex.from_tuples(
|
||||
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
|
||||
)
|
||||
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
|
||||
assert lexsorted_df.columns._is_lexsorted()
|
||||
|
||||
# define the non-lexsorted version
|
||||
not_lexsorted_df = DataFrame(
|
||||
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
|
||||
)
|
||||
not_lexsorted_df = not_lexsorted_df.pivot_table(
|
||||
index="a", columns=["b", "c"], values="d"
|
||||
)
|
||||
not_lexsorted_df = not_lexsorted_df.reset_index()
|
||||
assert not not_lexsorted_df.columns._is_lexsorted()
|
||||
|
||||
expected = lexsorted_df.drop("a", axis=1).astype(float)
|
||||
with tm.assert_produces_warning(PerformanceWarning):
|
||||
result = not_lexsorted_df.drop("a", axis=1)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_api_equivalence(self):
|
||||
# equivalence of the labels/axis and index/columns API's (GH#12392)
|
||||
df = DataFrame(
|
||||
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
|
||||
index=["a", "b", "c"],
|
||||
columns=["d", "e", "f"],
|
||||
)
|
||||
|
||||
res1 = df.drop("a")
|
||||
res2 = df.drop(index="a")
|
||||
tm.assert_frame_equal(res1, res2)
|
||||
|
||||
res1 = df.drop("d", axis=1)
|
||||
res2 = df.drop(columns="d")
|
||||
tm.assert_frame_equal(res1, res2)
|
||||
|
||||
res1 = df.drop(labels="e", axis=1)
|
||||
res2 = df.drop(columns="e")
|
||||
tm.assert_frame_equal(res1, res2)
|
||||
|
||||
res1 = df.drop(["a"], axis=0)
|
||||
res2 = df.drop(index=["a"])
|
||||
tm.assert_frame_equal(res1, res2)
|
||||
|
||||
res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
|
||||
res2 = df.drop(index=["a"], columns=["d"])
|
||||
tm.assert_frame_equal(res1, res2)
|
||||
|
||||
msg = "Cannot specify both 'labels' and 'index'/'columns'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.drop(labels="a", index="b")
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.drop(labels="a", columns="b")
|
||||
|
||||
msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.drop(axis=1)
|
||||
|
||||
data = [[1, 2, 3], [1, 2, 3]]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"actual",
|
||||
[
|
||||
DataFrame(data=data, index=["a", "a"]),
|
||||
DataFrame(data=data, index=["a", "b"]),
|
||||
DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
|
||||
DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
|
||||
],
|
||||
)
|
||||
def test_raise_on_drop_duplicate_index(self, actual):
|
||||
# GH#19186
|
||||
level = 0 if isinstance(actual.index, MultiIndex) else None
|
||||
msg = re.escape("\"['c'] not found in axis\"")
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
actual.drop("c", level=level, axis=0)
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
actual.T.drop("c", level=level, axis=1)
|
||||
expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
|
||||
tm.assert_frame_equal(expected_no_err, actual)
|
||||
expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
|
||||
tm.assert_frame_equal(expected_no_err.T, actual)
|
||||
|
||||
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
|
||||
@pytest.mark.parametrize("drop_labels", [[], [1], [2]])
|
||||
def test_drop_empty_list(self, index, drop_labels):
|
||||
# GH#21494
|
||||
expected_index = [i for i in index if i not in drop_labels]
|
||||
frame = DataFrame(index=index).drop(drop_labels)
|
||||
tm.assert_frame_equal(frame, DataFrame(index=expected_index))
|
||||
|
||||
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
|
||||
@pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
|
||||
def test_drop_non_empty_list(self, index, drop_labels):
|
||||
# GH# 21494
|
||||
with pytest.raises(KeyError, match="not found in axis"):
|
||||
DataFrame(index=index).drop(drop_labels)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"empty_listlike",
|
||||
[
|
||||
[],
|
||||
{},
|
||||
np.array([]),
|
||||
Series([], dtype="datetime64[ns]"),
|
||||
Index([]),
|
||||
DatetimeIndex([]),
|
||||
],
|
||||
)
|
||||
def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike):
|
||||
# GH#27994
|
||||
data = {"column_a": [5, 10], "column_b": ["one", "two"]}
|
||||
index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")]
|
||||
df = DataFrame(data, index=index)
|
||||
|
||||
# Passing empty list-like should return the same DataFrame.
|
||||
expected = df.copy()
|
||||
result = df.drop(empty_listlike)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_depth_drop(self):
|
||||
arrays = [
|
||||
["a", "top", "top", "routine1", "routine1", "routine2"],
|
||||
["", "OD", "OD", "result1", "result2", "result1"],
|
||||
["", "wx", "wy", "", "", ""],
|
||||
]
|
||||
|
||||
tuples = sorted(zip(*arrays))
|
||||
index = MultiIndex.from_tuples(tuples)
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
|
||||
|
||||
result = df.drop("a", axis=1)
|
||||
expected = df.drop([("a", "", "")], axis=1)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
result = df.drop(["top"], axis=1)
|
||||
expected = df.drop([("top", "OD", "wx")], axis=1)
|
||||
expected = expected.drop([("top", "OD", "wy")], axis=1)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
result = df.drop(("top", "OD", "wx"), axis=1)
|
||||
expected = df.drop([("top", "OD", "wx")], axis=1)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
expected = df.drop([("top", "OD", "wy")], axis=1)
|
||||
expected = df.drop("top", axis=1)
|
||||
|
||||
result = df.drop("result1", level=1, axis=1)
|
||||
expected = df.drop(
|
||||
[("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
|
||||
)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
def test_drop_multiindex_other_level_nan(self):
|
||||
# GH#12754
|
||||
df = (
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["one", "one", "two", "two"],
|
||||
"B": [np.nan, 0.0, 1.0, 2.0],
|
||||
"C": ["a", "b", "c", "c"],
|
||||
"D": [1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
.set_index(["A", "B", "C"])
|
||||
.sort_index()
|
||||
)
|
||||
result = df.drop("c", level="C")
|
||||
expected = DataFrame(
|
||||
[2, 1],
|
||||
columns=["D"],
|
||||
index=MultiIndex.from_tuples(
|
||||
[("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_nonunique(self):
|
||||
df = DataFrame(
|
||||
[
|
||||
["x-a", "x", "a", 1.5],
|
||||
["x-a", "x", "a", 1.2],
|
||||
["z-c", "z", "c", 3.1],
|
||||
["x-a", "x", "a", 4.1],
|
||||
["x-b", "x", "b", 5.1],
|
||||
["x-b", "x", "b", 4.1],
|
||||
["x-b", "x", "b", 2.2],
|
||||
["y-a", "y", "a", 1.2],
|
||||
["z-b", "z", "b", 2.1],
|
||||
],
|
||||
columns=["var1", "var2", "var3", "var4"],
|
||||
)
|
||||
|
||||
grp_size = df.groupby("var1").size()
|
||||
drop_idx = grp_size.loc[grp_size == 1]
|
||||
|
||||
idf = df.set_index(["var1", "var2", "var3"])
|
||||
|
||||
# it works! GH#2101
|
||||
result = idf.drop(drop_idx.index, level=0).reset_index()
|
||||
expected = df[-df.var1.isin(drop_idx.index)]
|
||||
|
||||
result.index = expected.index
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_level(self, multiindex_dataframe_random_data):
|
||||
frame = multiindex_dataframe_random_data
|
||||
|
||||
result = frame.drop(["bar", "qux"], level="first")
|
||||
expected = frame.iloc[[0, 1, 2, 5, 6]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = frame.drop(["two"], level="second")
|
||||
expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = frame.T.drop(["bar", "qux"], axis=1, level="first")
|
||||
expected = frame.iloc[[0, 1, 2, 5, 6]].T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = frame.T.drop(["two"], axis=1, level="second")
|
||||
expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_level_nonunique_datetime(self):
|
||||
# GH#12701
|
||||
idx = Index([2, 3, 4, 4, 5], name="id")
|
||||
idxdt = pd.to_datetime(
|
||||
[
|
||||
"2016-03-23 14:00",
|
||||
"2016-03-23 15:00",
|
||||
"2016-03-23 16:00",
|
||||
"2016-03-23 16:00",
|
||||
"2016-03-23 17:00",
|
||||
]
|
||||
)
|
||||
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
|
||||
df["tstamp"] = idxdt
|
||||
df = df.set_index("tstamp", append=True)
|
||||
ts = Timestamp("201603231600")
|
||||
assert df.index.is_unique is False
|
||||
|
||||
result = df.drop(ts, level="tstamp")
|
||||
expected = df.loc[idx != 4]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series):
|
||||
# GH#21761
|
||||
start = Timestamp("2017-10-29", tz="Europe/Berlin")
|
||||
end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
|
||||
index = pd.date_range(start, end, freq="15min")
|
||||
data = frame_or_series(data=[1] * len(index), index=index)
|
||||
result = data.drop(start)
|
||||
expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
|
||||
expected_idx = pd.date_range(expected_start, end, freq="15min")
|
||||
expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_drop_preserve_names(self):
|
||||
index = MultiIndex.from_arrays(
|
||||
[[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
|
||||
)
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index)
|
||||
|
||||
result = df.drop([(0, 2)])
|
||||
assert result.index.names == ("one", "two")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
|
||||
)
|
||||
@pytest.mark.parametrize("inplace", [False, True])
|
||||
def test_inplace_drop_and_operation(self, operation, inplace):
|
||||
# GH#30484
|
||||
df = DataFrame({"x": range(5)})
|
||||
expected = df.copy()
|
||||
df["y"] = range(5)
|
||||
y = df["y"]
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
if inplace:
|
||||
df.drop("y", axis=1, inplace=inplace)
|
||||
else:
|
||||
df = df.drop("y", axis=1, inplace=inplace)
|
||||
|
||||
# Perform operation and check result
|
||||
getattr(y, operation)(1)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_drop_with_non_unique_multiindex(self):
|
||||
# GH#36293
|
||||
mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
|
||||
df = DataFrame([1, 2, 3], index=mi)
|
||||
result = df.drop(index="x")
|
||||
expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]])
|
||||
def test_drop_tuple_with_non_unique_multiindex(self, indexer):
|
||||
# GH#42771
|
||||
idx = MultiIndex.from_product([["a", "b"], ["a", "a"]])
|
||||
df = DataFrame({"x": range(len(idx))}, index=idx)
|
||||
result = df.drop(index=[("a", "a")])
|
||||
expected = DataFrame(
|
||||
{"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")])
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_with_duplicate_columns(self):
|
||||
df = DataFrame(
|
||||
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
|
||||
)
|
||||
result = df.drop(["a"], axis=1)
|
||||
expected = DataFrame([[1], [1], [1]], columns=["bar"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.drop("a", axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_with_duplicate_columns2(self):
|
||||
# drop buggy GH#6240
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": np.random.default_rng(2).standard_normal(5),
|
||||
"B": np.random.default_rng(2).standard_normal(5),
|
||||
"C": np.random.default_rng(2).standard_normal(5),
|
||||
"D": ["a", "b", "c", "d", "e"],
|
||||
}
|
||||
)
|
||||
|
||||
expected = df.take([0, 1, 1], axis=1)
|
||||
df2 = df.take([2, 0, 1, 2, 1], axis=1)
|
||||
result = df2.drop("C", axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_inplace_no_leftover_column_reference(self):
|
||||
# GH 13934
|
||||
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
|
||||
a = df.a
|
||||
df.drop(["a"], axis=1, inplace=True)
|
||||
tm.assert_index_equal(df.columns, Index([], dtype="object"))
|
||||
a -= a.mean()
|
||||
tm.assert_index_equal(df.columns, Index([], dtype="object"))
|
||||
|
||||
def test_drop_level_missing_label_multiindex(self):
|
||||
# GH 18561
|
||||
df = DataFrame(index=MultiIndex.from_product([range(3), range(3)]))
|
||||
with pytest.raises(KeyError, match="labels \\[5\\] not found in level"):
|
||||
df.drop(5, level=0)
|
||||
|
||||
@pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)])
|
||||
def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level):
|
||||
# GH#45860
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype
|
||||
).set_index(idx)
|
||||
result = df.drop(Index([2, pd.NA]), level=level)
|
||||
expected = DataFrame(
|
||||
{"a": [1], "b": 100}, dtype=any_numeric_ea_dtype
|
||||
).set_index(idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_drop_parse_strings_datetime_index(self):
|
||||
# GH #5355
|
||||
df = DataFrame(
|
||||
{"a": [1, 2], "b": [1, 2]},
|
||||
index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")],
|
||||
)
|
||||
result = df.drop("2000-01-03", axis=0)
|
||||
expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")])
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,473 @@
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_drop_duplicates_with_misspelled_column_name(subset):
    """A subset naming a column that does not exist raises KeyError."""
    # GH 19730
    df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})

    # Only the lowercase "a" is missing in every parametrized subset.
    expected_message = re.escape("Index(['a'], dtype=")
    with pytest.raises(KeyError, match=expected_message):
        df.drop_duplicates(subset)
|
||||
|
||||
|
||||
def test_drop_duplicates():
    """Exercise ``drop_duplicates`` over single/multiple subsets and every ``keep`` mode."""
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(df.drop_duplicates("AAA"), df[:2])
    tm.assert_frame_equal(df.drop_duplicates("AAA", keep="last"), df.loc[[6, 7]])

    result = df.drop_duplicates("AAA", keep=False)
    tm.assert_frame_equal(result, df.loc[[]])
    assert len(result) == 0

    # multi column: array and list subsets give the same answer
    expected = df.loc[[0, 1, 2, 3]]
    for subset in (np.array(["AAA", "B"]), ["AAA", "B"]):
        tm.assert_frame_equal(df.drop_duplicates(subset), expected)

    for keep, kept in (("last", [0, 5, 6, 7]), (False, [0])):
        result = df.drop_duplicates(("AAA", "B"), keep=keep)
        tm.assert_frame_equal(result, df.loc[kept])

    # consider everything
    df2 = df.loc[:, ["AAA", "B", "C"]]
    for keep in ("first", "last", False):
        result = df2.drop_duplicates(keep=keep)
        # in this case only
        expected = df2.drop_duplicates(["AAA", "B"], keep=keep)
        tm.assert_frame_equal(result, expected)

    # integers
    tm.assert_frame_equal(df.drop_duplicates("C"), df.iloc[[0, 2]])
    tm.assert_frame_equal(df.drop_duplicates("C", keep="last"), df.iloc[[-2, -1]])

    df["E"] = df["C"].astype("int8")
    tm.assert_frame_equal(df.drop_duplicates("E"), df.iloc[[0, 2]])
    tm.assert_frame_equal(df.drop_duplicates("E", keep="last"), df.iloc[[-2, -1]])

    # GH 11376
    df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    tm.assert_frame_equal(df.drop_duplicates(), df.loc[df.index != 3])

    # Frames without duplicated rows must come back unchanged.
    for rows in ([[1, 0], [0, 2]], [[-2, 0], [0, -4]]):
        df = DataFrame(rows)
        tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    for rows in ([[-x, x], [0, x + 4]], [[-x, x], [x, x + 4]]):
        df = DataFrame(rows)
        tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)

    for keep in ["first", "last", False]:
        assert df.duplicated(keep=keep).sum() == 0
|
||||
|
||||
|
||||
def test_drop_duplicates_with_duplicate_column_names():
    """``drop_duplicates`` works when two columns share the label "a"."""
    # GH17836
    df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])

    # No row is a full duplicate, so nothing is removed.
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # Restricting to "a" makes the last row a duplicate of the second.
    tm.assert_frame_equal(df.drop_duplicates("a"), df[:2])
|
||||
|
||||
|
||||
def test_drop_duplicates_for_take_all():
    """Check the surviving row positions for each subset / ``keep`` combination."""
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # (subset, keep, positions expected to survive)
    cases = [
        # single column
        ("AAA", "first", [0, 1, 2, 6]),
        ("AAA", "last", [2, 5, 6, 7]),
        ("AAA", False, [2, 6]),
        # multiple columns
        (["AAA", "B"], "first", [0, 1, 2, 3, 4, 6]),
        (["AAA", "B"], "last", [0, 1, 2, 5, 6, 7]),
        (["AAA", "B"], False, [0, 1, 2, 6]),
    ]
    for subset, keep, kept in cases:
        result = df.drop_duplicates(subset, keep=keep)
        tm.assert_frame_equal(result, df.iloc[kept])
|
||||
|
||||
|
||||
def test_drop_duplicates_tuple():
    """``drop_duplicates`` accepts a column whose label is itself a tuple."""
    key = ("AA", "AB")
    df = DataFrame(
        {
            key: ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # single column
    tm.assert_frame_equal(df.drop_duplicates(key), df[:2])
    tm.assert_frame_equal(df.drop_duplicates(key, keep="last"), df.loc[[6, 7]])

    result = df.drop_duplicates(key, keep=False)
    assert len(result) == 0
    tm.assert_frame_equal(result, df.loc[[]])  # empty df

    # multi column
    result = df.drop_duplicates((key, "B"))
    tm.assert_frame_equal(result, df.loc[[0, 1, 2, 3]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df",
    [
        DataFrame(),
        DataFrame(columns=[]),
        DataFrame(columns=["A", "B", "C"]),
        DataFrame(index=[]),
        DataFrame(index=["A", "B", "C"]),
    ],
)
def test_drop_duplicates_empty(df):
    """Frames without data are returned unchanged, inplace or not."""
    # GH 20516
    tm.assert_frame_equal(df.drop_duplicates(), df)

    mutated = df.copy()
    mutated.drop_duplicates(inplace=True)
    tm.assert_frame_equal(mutated, df)
|
||||
|
||||
|
||||
def test_drop_duplicates_NA():
    """Missing values (None and NaN) are treated as equal when de-duplicating."""

    def check(frame, subset, keep, expected):
        result = frame.drop_duplicates(subset, keep=keep)
        tm.assert_frame_equal(result, expected)
        if len(expected) == 0:
            assert len(result) == 0

    # none
    df = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
            "D": range(8),
        }
    )
    # single column
    check(df, "A", "first", df.loc[[0, 2, 3]])
    check(df, "A", "last", df.loc[[1, 6, 7]])
    check(df, "A", False, df.loc[[]])  # empty df

    # multi column
    check(df, ["A", "B"], "first", df.loc[[0, 2, 3, 6]])
    check(df, ["A", "B"], "last", df.loc[[1, 5, 6, 7]])
    check(df, ["A", "B"], False, df.loc[[6]])

    # nan
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
            "D": range(8),
        }
    )
    # single column
    check(df, "C", "first", df[:2])
    check(df, "C", "last", df.loc[[3, 7]])
    check(df, "C", False, df.loc[[]])  # empty df

    # multi column
    check(df, ["C", "B"], "first", df.loc[[0, 1, 2, 4]])
    check(df, ["C", "B"], "last", df.loc[[1, 3, 6, 7]])
    check(df, ["C", "B"], False, df.loc[[1]])
|
||||
|
||||
|
||||
def test_drop_duplicates_NA_for_take_all():
    """Check surviving positions when the de-duplicated column holds None or NaN."""
    df = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
        }
    )

    # (column, keep, positions expected to survive)
    cases = [
        # none
        ("A", "first", [0, 2, 3, 5, 7]),
        ("A", "last", [1, 4, 5, 6, 7]),
        ("A", False, [5, 7]),
        # nan
        ("C", "first", [0, 1, 5, 6]),
        ("C", "last", [3, 5, 6, 7]),
        ("C", False, [5, 6]),
    ]
    for column, keep, kept in cases:
        result = df.drop_duplicates(column, keep=keep)
        tm.assert_frame_equal(result, df.iloc[kept])
|
||||
|
||||
|
||||
def test_drop_duplicates_inplace():
    """``inplace=True`` mutates the frame and returns None for every subset/keep mode."""
    orig = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    def run_inplace(source, expected, **kwargs):
        # De-duplicate a copy in place, check it, and hand it back.
        df = source.copy()
        return_value = df.drop_duplicates(inplace=True, **kwargs)
        tm.assert_frame_equal(df, expected)
        assert return_value is None
        return df

    # single column
    run_inplace(orig, orig[:2], subset="A")
    run_inplace(orig, orig.loc[[6, 7]], subset="A", keep="last")
    emptied = run_inplace(orig, orig.loc[[]], subset="A", keep=False)
    assert len(emptied) == 0

    # multi column
    run_inplace(orig, orig.loc[[0, 1, 2, 3]], subset=["A", "B"])
    run_inplace(orig, orig.loc[[0, 5, 6, 7]], subset=["A", "B"], keep="last")
    run_inplace(orig, orig.loc[[0]], subset=["A", "B"], keep=False)

    # consider everything
    orig2 = orig.loc[:, ["A", "B", "C"]].copy()
    for keep in ("first", "last", False):
        # in this case only
        expected = orig2.drop_duplicates(["A", "B"], keep=keep)
        run_inplace(orig2, expected, keep=keep)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
    "origin_dict, output_dict, ignore_index, output_index",
    [
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
    ],
)
def test_drop_duplicates_ignore_index(
    inplace, origin_dict, output_dict, ignore_index, output_index
):
    """``ignore_index`` decides whether the result keeps or renumbers row labels."""
    # GH 30114
    df = DataFrame(origin_dict)
    expected = DataFrame(output_dict, index=output_index)

    # Operate on a copy when mutating, so ``df`` can be checked afterwards.
    target = df.copy() if inplace else df
    returned = target.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
    result_df = target if inplace else returned

    tm.assert_frame_equal(result_df, expected)
    tm.assert_frame_equal(df, DataFrame(origin_dict))
|
||||
|
||||
|
||||
def test_drop_duplicates_null_in_object_column(nulls_fixture):
|
||||
# https://github.com/pandas-dev/pandas/issues/32992
|
||||
df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object)
|
||||
result = df.drop_duplicates()
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_drop_duplicates_series_vs_dataframe(keep):
|
||||
# GH#14192
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 1, 1, "one", "one"],
|
||||
"b": [2, 2, np.nan, np.nan, np.nan],
|
||||
"c": [3, 3, np.nan, np.nan, "three"],
|
||||
"d": [1, 2, 3, 4, 4],
|
||||
"e": [
|
||||
datetime(2015, 1, 1),
|
||||
datetime(2015, 1, 1),
|
||||
datetime(2015, 2, 1),
|
||||
NaT,
|
||||
NaT,
|
||||
],
|
||||
}
|
||||
)
|
||||
for column in df.columns:
|
||||
dropped_frame = df[[column]].drop_duplicates(keep=keep)
|
||||
dropped_series = df[column].drop_duplicates(keep=keep)
|
||||
tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
def test_drop_duplicates_non_boolean_ignore_index(arg):
    """A non-bool ``ignore_index`` argument raises ValueError."""
    # GH#38274
    df = DataFrame({"a": [1, 2, 1, 3]})
    with pytest.raises(
        ValueError,
        match='^For argument "ignore_index" expected type bool, received type .*.$',
    ):
        df.drop_duplicates(ignore_index=arg)
|
@ -0,0 +1,36 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDropLevel:
|
||||
def test_droplevel(self, frame_or_series):
|
||||
# GH#20342
|
||||
cols = MultiIndex.from_tuples(
|
||||
[("c", "e"), ("d", "f")], names=["level_1", "level_2"]
|
||||
)
|
||||
mi = MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"])
|
||||
df = DataFrame([[3, 4], [7, 8], [11, 12]], index=mi, columns=cols)
|
||||
if frame_or_series is not DataFrame:
|
||||
df = df.iloc[:, 0]
|
||||
|
||||
# test that dropping of a level in index works
|
||||
expected = df.reset_index("a", drop=True)
|
||||
result = df.droplevel("a", axis="index")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
if frame_or_series is DataFrame:
|
||||
# test that dropping of a level in columns works
|
||||
expected = df.copy()
|
||||
expected.columns = Index(["c", "d"], name="level_1")
|
||||
result = df.droplevel("level_2", axis="columns")
|
||||
tm.assert_equal(result, expected)
|
||||
else:
|
||||
# test that droplevel raises ValueError on axis != 0
|
||||
with pytest.raises(ValueError, match="No axis named columns"):
|
||||
df.droplevel(1, axis="columns")
|
@ -0,0 +1,285 @@
|
||||
import datetime
|
||||
|
||||
import dateutil
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameMissingData:
|
||||
def test_dropEmptyRows(self, float_frame):
|
||||
N = len(float_frame.index)
|
||||
mat = np.random.default_rng(2).standard_normal(N)
|
||||
mat[:5] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=float_frame.index)
|
||||
original = Series(mat, index=float_frame.index, name="foo")
|
||||
expected = original.dropna()
|
||||
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna(how="all")
|
||||
# check that original was preserved
|
||||
tm.assert_series_equal(frame["foo"], original)
|
||||
return_value = inplace_frame1.dropna(how="all", inplace=True)
|
||||
tm.assert_series_equal(smaller_frame["foo"], expected)
|
||||
tm.assert_series_equal(inplace_frame1["foo"], expected)
|
||||
assert return_value is None
|
||||
|
||||
smaller_frame = frame.dropna(how="all", subset=["foo"])
|
||||
return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
|
||||
tm.assert_series_equal(smaller_frame["foo"], expected)
|
||||
tm.assert_series_equal(inplace_frame2["foo"], expected)
|
||||
assert return_value is None
|
||||
|
||||
def test_dropIncompleteRows(self, float_frame):
|
||||
N = len(float_frame.index)
|
||||
mat = np.random.default_rng(2).standard_normal(N)
|
||||
mat[:5] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=float_frame.index)
|
||||
frame["bar"] = 5
|
||||
original = Series(mat, index=float_frame.index, name="foo")
|
||||
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna()
|
||||
tm.assert_series_equal(frame["foo"], original)
|
||||
return_value = inp_frame1.dropna(inplace=True)
|
||||
|
||||
exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
|
||||
tm.assert_series_equal(smaller_frame["foo"], exp)
|
||||
tm.assert_series_equal(inp_frame1["foo"], exp)
|
||||
assert return_value is None
|
||||
|
||||
samesize_frame = frame.dropna(subset=["bar"])
|
||||
tm.assert_series_equal(frame["foo"], original)
|
||||
assert (frame["bar"] == 5).all()
|
||||
return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
|
||||
tm.assert_index_equal(samesize_frame.index, float_frame.index)
|
||||
tm.assert_index_equal(inp_frame2.index, float_frame.index)
|
||||
assert return_value is None
|
||||
|
||||
def test_dropna(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((6, 4)))
|
||||
df.iloc[:2, 2] = np.nan
|
||||
|
||||
dropped = df.dropna(axis=1)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
return_value = inp.dropna(axis=1, inplace=True)
|
||||
tm.assert_frame_equal(dropped, expected)
|
||||
tm.assert_frame_equal(inp, expected)
|
||||
assert return_value is None
|
||||
|
||||
dropped = df.dropna(axis=0)
|
||||
expected = df.loc[list(range(2, 6))]
|
||||
inp = df.copy()
|
||||
return_value = inp.dropna(axis=0, inplace=True)
|
||||
tm.assert_frame_equal(dropped, expected)
|
||||
tm.assert_frame_equal(inp, expected)
|
||||
assert return_value is None
|
||||
|
||||
# threshold
|
||||
dropped = df.dropna(axis=1, thresh=5)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
return_value = inp.dropna(axis=1, thresh=5, inplace=True)
|
||||
tm.assert_frame_equal(dropped, expected)
|
||||
tm.assert_frame_equal(inp, expected)
|
||||
assert return_value is None
|
||||
|
||||
dropped = df.dropna(axis=0, thresh=4)
|
||||
expected = df.loc[range(2, 6)]
|
||||
inp = df.copy()
|
||||
return_value = inp.dropna(axis=0, thresh=4, inplace=True)
|
||||
tm.assert_frame_equal(dropped, expected)
|
||||
tm.assert_frame_equal(inp, expected)
|
||||
assert return_value is None
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=4)
|
||||
tm.assert_frame_equal(dropped, df)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=3)
|
||||
tm.assert_frame_equal(dropped, df)
|
||||
|
||||
# subset
|
||||
dropped = df.dropna(axis=0, subset=[0, 1, 3])
|
||||
inp = df.copy()
|
||||
return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
|
||||
tm.assert_frame_equal(dropped, df)
|
||||
tm.assert_frame_equal(inp, df)
|
||||
assert return_value is None
|
||||
|
||||
# all
|
||||
dropped = df.dropna(axis=1, how="all")
|
||||
tm.assert_frame_equal(dropped, df)
|
||||
|
||||
df[2] = np.nan
|
||||
dropped = df.dropna(axis=1, how="all")
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
tm.assert_frame_equal(dropped, expected)
|
||||
|
||||
# bad input
|
||||
msg = "No axis named 3 for object type DataFrame"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.dropna(axis=3)
|
||||
|
||||
def test_drop_and_dropna_caching(self):
|
||||
# tst that cacher updates
|
||||
original = Series([1, 2, np.nan], name="A")
|
||||
expected = Series([1, 2], dtype=original.dtype, name="A")
|
||||
df = DataFrame({"A": original.values.copy()})
|
||||
df2 = df.copy()
|
||||
df["A"].dropna()
|
||||
tm.assert_series_equal(df["A"], original)
|
||||
|
||||
ser = df["A"]
|
||||
return_value = ser.dropna(inplace=True)
|
||||
tm.assert_series_equal(ser, expected)
|
||||
tm.assert_series_equal(df["A"], original)
|
||||
assert return_value is None
|
||||
|
||||
df2["A"].drop([1])
|
||||
tm.assert_series_equal(df2["A"], original)
|
||||
|
||||
ser = df2["A"]
|
||||
return_value = ser.drop([1], inplace=True)
|
||||
tm.assert_series_equal(ser, original.drop([1]))
|
||||
tm.assert_series_equal(df2["A"], original)
|
||||
assert return_value is None
|
||||
|
||||
def test_dropna_corner(self, float_frame):
|
||||
# bad input
|
||||
msg = "invalid how option: foo"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame.dropna(how="foo")
|
||||
# non-existent column - 8303
|
||||
with pytest.raises(KeyError, match=r"^\['X'\]$"):
|
||||
float_frame.dropna(subset=["A", "X"])
|
||||
|
||||
def test_dropna_multiple_axes(self):
|
||||
df = DataFrame(
|
||||
[
|
||||
[1, np.nan, 2, 3],
|
||||
[4, np.nan, 5, 6],
|
||||
[np.nan, np.nan, np.nan, np.nan],
|
||||
[7, np.nan, 8, 9],
|
||||
]
|
||||
)
|
||||
|
||||
# GH20987
|
||||
with pytest.raises(TypeError, match="supplying multiple axes"):
|
||||
df.dropna(how="all", axis=[0, 1])
|
||||
with pytest.raises(TypeError, match="supplying multiple axes"):
|
||||
df.dropna(how="all", axis=(0, 1))
|
||||
|
||||
inp = df.copy()
|
||||
with pytest.raises(TypeError, match="supplying multiple axes"):
|
||||
inp.dropna(how="all", axis=(0, 1), inplace=True)
|
||||
|
||||
def test_dropna_tz_aware_datetime(self):
|
||||
# GH13407
|
||||
df = DataFrame()
|
||||
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
|
||||
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
|
||||
df["Time"] = [dt1]
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame({"Time": [dt1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Ex2
|
||||
df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dropna_categorical_interval_index(self):
|
||||
# GH 25087
|
||||
ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
|
||||
ci = pd.CategoricalIndex(ii)
|
||||
df = DataFrame({"A": list("abc")}, index=ci)
|
||||
|
||||
expected = df
|
||||
result = df.dropna()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dropna_with_duplicate_columns(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": np.random.default_rng(2).standard_normal(5),
|
||||
"B": np.random.default_rng(2).standard_normal(5),
|
||||
"C": np.random.default_rng(2).standard_normal(5),
|
||||
"D": ["a", "b", "c", "d", "e"],
|
||||
}
|
||||
)
|
||||
df.iloc[2, [0, 1, 2]] = np.nan
|
||||
df.iloc[0, 0] = np.nan
|
||||
df.iloc[1, 1] = np.nan
|
||||
df.iloc[:, 3] = np.nan
|
||||
expected = df.dropna(subset=["A", "B", "C"], how="all")
|
||||
expected.columns = ["A", "A", "B", "C"]
|
||||
|
||||
df.columns = ["A", "A", "B", "C"]
|
||||
|
||||
result = df.dropna(subset=["A", "C"], how="all")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_set_single_column_subset(self):
|
||||
# GH 41021
|
||||
df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]})
|
||||
expected = DataFrame(
|
||||
{"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2]
|
||||
)
|
||||
result = df.dropna(subset="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_single_column_not_present_in_axis(self):
|
||||
# GH 41021
|
||||
df = DataFrame({"A": [1, 2, 3]})
|
||||
|
||||
# Column not present
|
||||
with pytest.raises(KeyError, match="['D']"):
|
||||
df.dropna(subset="D", axis=0)
|
||||
|
||||
def test_subset_is_nparray(self):
|
||||
# GH 41021
|
||||
df = DataFrame({"A": [1, 2, np.nan], "B": list("abc"), "C": [4, np.nan, 5]})
|
||||
expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]})
|
||||
result = df.dropna(subset=np.array(["A", "C"]))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_no_nans_in_frame(self, axis):
|
||||
# GH#41965
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=pd.RangeIndex(0, 2))
|
||||
expected = df.copy()
|
||||
result = df.dropna(axis=axis)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
def test_how_thresh_param_incompatible(self):
|
||||
# GH46575
|
||||
df = DataFrame([1, 2, pd.NA])
|
||||
msg = "You cannot set both the how and thresh arguments at the same time"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.dropna(how="all", thresh=2)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.dropna(how="any", thresh=2)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.dropna(how=None, thresh=None)
|
||||
|
||||
@pytest.mark.parametrize("val", [1, 1.5])
|
||||
def test_dropna_ignore_index(self, val):
|
||||
# GH#31725
|
||||
df = DataFrame({"a": [1, 2, val]}, index=[3, 2, 1])
|
||||
result = df.dropna(ignore_index=True)
|
||||
expected = DataFrame({"a": [1, 2, val]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df.dropna(ignore_index=True, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
@ -0,0 +1,153 @@
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDataTypes:
|
||||
def test_empty_frame_dtypes(self):
|
||||
empty_df = DataFrame()
|
||||
tm.assert_series_equal(empty_df.dtypes, Series(dtype=object))
|
||||
|
||||
nocols_df = DataFrame(index=[1, 2, 3])
|
||||
tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object))
|
||||
|
||||
norows_df = DataFrame(columns=list("abc"))
|
||||
tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc")))
|
||||
|
||||
norows_int_df = DataFrame(columns=list("abc")).astype(np.int32)
|
||||
tm.assert_series_equal(
|
||||
norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc"))
|
||||
)
|
||||
|
||||
df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3])
|
||||
ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64})
|
||||
tm.assert_series_equal(df.dtypes, ex_dtypes)
|
||||
|
||||
# same but for empty slice of df
|
||||
tm.assert_series_equal(df[:0].dtypes, ex_dtypes)
|
||||
|
||||
def test_datetime_with_tz_dtypes(self):
|
||||
tzframe = DataFrame(
|
||||
{
|
||||
"A": date_range("20130101", periods=3),
|
||||
"B": date_range("20130101", periods=3, tz="US/Eastern"),
|
||||
"C": date_range("20130101", periods=3, tz="CET"),
|
||||
}
|
||||
)
|
||||
tzframe.iloc[1, 1] = pd.NaT
|
||||
tzframe.iloc[1, 2] = pd.NaT
|
||||
result = tzframe.dtypes.sort_index()
|
||||
expected = Series(
|
||||
[
|
||||
np.dtype("datetime64[ns]"),
|
||||
DatetimeTZDtype("ns", "US/Eastern"),
|
||||
DatetimeTZDtype("ns", "CET"),
|
||||
],
|
||||
["A", "B", "C"],
|
||||
)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_dtypes_are_correct_after_column_slice(self):
|
||||
# GH6525
|
||||
df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64)
|
||||
tm.assert_series_equal(
|
||||
df.dtypes,
|
||||
Series({"a": np.float64, "b": np.float64, "c": np.float64}),
|
||||
)
|
||||
tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64}))
|
||||
tm.assert_series_equal(
|
||||
df.dtypes,
|
||||
Series({"a": np.float64, "b": np.float64, "c": np.float64}),
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[pd.NA, True],
|
||||
)
|
||||
def test_dtypes_are_correct_after_groupby_last(self, data):
|
||||
# GH46409
|
||||
df = DataFrame(
|
||||
{"id": [1, 2, 3, 4], "test": [True, pd.NA, data, False]}
|
||||
).convert_dtypes()
|
||||
result = df.groupby("id").last().test
|
||||
expected = df.set_index("id").test
|
||||
assert result.dtype == pd.BooleanDtype()
|
||||
tm.assert_series_equal(expected, result)
|
||||
|
||||
def test_dtypes_gh8722(self, float_string_frame):
|
||||
float_string_frame["bool"] = float_string_frame["A"] > 0
|
||||
result = float_string_frame.dtypes
|
||||
expected = Series(
|
||||
{k: v.dtype for k, v in float_string_frame.items()}, index=result.index
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# compat, GH 8722
|
||||
msg = "use_inf_as_na option is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
with option_context("use_inf_as_na", True):
|
||||
df = DataFrame([[1]])
|
||||
result = df.dtypes
|
||||
tm.assert_series_equal(result, Series({0: np.dtype("int64")}))
|
||||
|
||||
def test_dtypes_timedeltas(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": Series(date_range("2012-1-1", periods=3, freq="D")),
|
||||
"B": Series([timedelta(days=i) for i in range(3)]),
|
||||
}
|
||||
)
|
||||
result = df.dtypes
|
||||
expected = Series(
|
||||
[np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df["C"] = df["A"] + df["B"]
|
||||
result = df.dtypes
|
||||
expected = Series(
|
||||
[
|
||||
np.dtype("datetime64[ns]"),
|
||||
np.dtype("timedelta64[ns]"),
|
||||
np.dtype("datetime64[ns]"),
|
||||
],
|
||||
index=list("ABC"),
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# mixed int types
|
||||
df["D"] = 1
|
||||
result = df.dtypes
|
||||
expected = Series(
|
||||
[
|
||||
np.dtype("datetime64[ns]"),
|
||||
np.dtype("timedelta64[ns]"),
|
||||
np.dtype("datetime64[ns]"),
|
||||
np.dtype("int64"),
|
||||
],
|
||||
index=list("ABCD"),
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_frame_apply_np_array_return_type(self, using_infer_string):
|
||||
# GH 35517
|
||||
df = DataFrame([["foo"]])
|
||||
result = df.apply(lambda col: np.array("bar"))
|
||||
if using_infer_string:
|
||||
expected = Series([np.array(["bar"])])
|
||||
else:
|
||||
expected = Series(["bar"])
|
||||
tm.assert_series_equal(result, expected)
|
@ -0,0 +1,117 @@
|
||||
import re
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
    """GH 19730: an unknown subset label makes ``duplicated`` raise ``KeyError``."""
    frame = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
    # only the misspelled label "a" is expected in the message
    expected_msg = re.escape("Index(['a'], dtype=")

    with pytest.raises(KeyError, match=expected_msg):
        frame.duplicated(subset)
|
||||
|
||||
|
||||
def test_duplicated_implemented_no_recursion():
    """gh-21524: ``duplicated`` must not rely on recursion that fails on wide frames."""
    wide = DataFrame(np.random.default_rng(2).integers(0, 1000, (10, 1000)))

    # run with a very low recursion limit and always restore the old one
    saved_limit = sys.getrecursionlimit()
    try:
        sys.setrecursionlimit(100)
        flags = wide.duplicated()
    finally:
        sys.setrecursionlimit(saved_limit)

    # Only the shape of the answer is checked: a boolean Series must come
    # back without failing during the calculation. The actual values do not
    # matter here, though usually they are all False in this case.
    assert isinstance(flags, Series)
    assert flags.dtype == np.bool_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """Each ``keep`` option flags the expected rows as duplicates."""
    # rows 0/4 and rows 1/2 are identical; row 3 is unique
    frame = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})

    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
|
||||
@pytest.mark.parametrize(
|
||||
"keep, expected",
|
||||
[
|
||||
("first", Series([False, False, True, False, True])),
|
||||
("last", Series([True, True, False, False, False])),
|
||||
(False, Series([True, True, True, False, True])),
|
||||
],
|
||||
)
|
||||
def test_duplicated_nan_none(keep, expected):
|
||||
df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)
|
||||
|
||||
result = df.duplicated(keep=keep)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
|
||||
def test_duplicated_subset(subset, keep):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [0, 1, 1, 2, 0],
|
||||
"B": ["a", "b", "b", "c", "a"],
|
||||
"C": [np.nan, 3, 3, None, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
if subset is None:
|
||||
subset = list(df.columns)
|
||||
elif isinstance(subset, str):
|
||||
# need to have a DataFrame, not a Series
|
||||
# -> select columns with singleton list, not string
|
||||
subset = [subset]
|
||||
|
||||
expected = df[subset].duplicated(keep=keep)
|
||||
result = df.duplicated(keep=keep, subset=subset)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_duplicated_on_empty_frame():
    """GH 25184: the mask from ``duplicated`` on an empty frame can index it."""
    empty = DataFrame(columns=["a", "b"])
    mask = empty.duplicated("a")

    # selecting with the mask must give back an equally empty frame
    tm.assert_frame_equal(empty[mask], empty.copy())
|
||||
|
||||
|
||||
def test_frame_datetime64_duplicated():
    """Distinct datetime64 values are never reported as duplicates.

    Checked for a frame keyed on a date column plus a constant column, and
    for the date column on its own.
    """
    dates = date_range("2010-07-01", end="2010-08-05")

    tst = DataFrame({"symbol": "AAA", "date": dates})
    result = tst.duplicated(["date", "symbol"])
    # ``~`` is the logical-not operator for boolean data; unary minus is an
    # arithmetic negation that numpy rejects on boolean arrays
    assert (~result).all()

    tst = DataFrame({"date": dates})
    result = tst.date.duplicated()
    assert (~result).all()
|
@ -0,0 +1,85 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestEquals:
|
||||
def test_dataframe_not_equal(self):
|
||||
# see GH#28839
|
||||
df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]})
|
||||
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
|
||||
assert df1.equals(df2) is False
|
||||
|
||||
def test_equals_different_blocks(self, using_array_manager, using_infer_string):
|
||||
# GH#9330
|
||||
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
|
||||
df1 = df0.reset_index()[["A", "B", "C"]]
|
||||
if not using_array_manager and not using_infer_string:
|
||||
# this assert verifies that the above operations have
|
||||
# induced a block rearrangement
|
||||
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
|
||||
|
||||
# do the real tests
|
||||
tm.assert_frame_equal(df0, df1)
|
||||
assert df0.equals(df1)
|
||||
assert df1.equals(df0)
|
||||
|
||||
def test_equals(self):
|
||||
# Add object dtype column with nans
|
||||
index = np.random.default_rng(2).random(10)
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).random(10), index=index, columns=["floats"]
|
||||
)
|
||||
df1["text"] = "the sky is so blue. we could use more chocolate.".split()
|
||||
df1["start"] = date_range("2000-1-1", periods=10, freq="min")
|
||||
df1["end"] = date_range("2000-1-1", periods=10, freq="D")
|
||||
df1["diff"] = df1["end"] - df1["start"]
|
||||
# Explicitly cast to object, to avoid implicit cast when setting np.nan
|
||||
df1["bool"] = (np.arange(10) % 3 == 0).astype(object)
|
||||
df1.loc[::2] = np.nan
|
||||
df2 = df1.copy()
|
||||
assert df1["text"].equals(df2["text"])
|
||||
assert df1["start"].equals(df2["start"])
|
||||
assert df1["end"].equals(df2["end"])
|
||||
assert df1["diff"].equals(df2["diff"])
|
||||
assert df1["bool"].equals(df2["bool"])
|
||||
assert df1.equals(df2)
|
||||
assert not df1.equals(object)
|
||||
|
||||
# different dtype
|
||||
different = df1.copy()
|
||||
different["floats"] = different["floats"].astype("float32")
|
||||
assert not df1.equals(different)
|
||||
|
||||
# different index
|
||||
different_index = -index
|
||||
different = df2.set_index(different_index)
|
||||
assert not df1.equals(different)
|
||||
|
||||
# different columns
|
||||
different = df2.copy()
|
||||
different.columns = df2.columns[::-1]
|
||||
assert not df1.equals(different)
|
||||
|
||||
# DatetimeIndex
|
||||
index = date_range("2000-1-1", periods=10, freq="min")
|
||||
df1 = df1.set_index(index)
|
||||
df2 = df1.copy()
|
||||
assert df1.equals(df2)
|
||||
|
||||
# MultiIndex
|
||||
df3 = df1.set_index(["text"], append=True)
|
||||
df2 = df1.set_index(["text"], append=True)
|
||||
assert df3.equals(df2)
|
||||
|
||||
df2 = df1.set_index(["floats"], append=True)
|
||||
assert not df3.equals(df2)
|
||||
|
||||
# NaN in index
|
||||
df3 = df1.set_index(["floats"], append=True)
|
||||
df2 = df1.set_index(["floats"], append=True)
|
||||
assert df3.equals(df2)
|
@ -0,0 +1,303 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_error():
    """``explode`` rejects nested, repeated and ambiguous column specifications."""
    ragged = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": ragged, "B": 1})

    # a list nested inside the column list
    nested_msg = "column must be a scalar, tuple, or list thereof"
    with pytest.raises(ValueError, match=nested_msg):
        frame.explode([list("AA")])

    # the same column requested twice
    with pytest.raises(ValueError, match="column must be unique"):
        frame.explode(list("AA"))

    # the frame itself has duplicated column labels
    frame.columns = list("AA")
    duplicate_msg = re.escape(
        "DataFrame columns must be unique. Duplicate columns: ['A']"
    )
    with pytest.raises(ValueError, match=duplicate_msg):
        frame.explode("A")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_subset, error_message",
    [
        # "A" and "C" hold different element counts in the last row
        (
            list("AC"),
            "columns must have matching element counts",
        ),
        # an empty column list is not accepted
        (
            [],
            "column must be nonempty",
        ),
        # NOTE: a second, verbatim copy of the list("AC") case used to be
        # listed here; it ran the identical check twice and was removed.
    ],
)
def test_error_multi_columns(input_subset, error_message):
    """GH 39240: invalid multi-column ``explode`` requests raise ``ValueError``.

    Parameters
    ----------
    input_subset : list
        Column labels passed to ``DataFrame.explode``.
    error_message : str
        Regular expression the raised ``ValueError`` message must match.
    """
    df = pd.DataFrame(
        {
            "A": [[0, 1, 2], np.nan, [], (3, 4)],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
        },
        index=list("abcd"),
    )
    with pytest.raises(ValueError, match=error_message):
        df.explode(input_subset)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "scalar",
    ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")],
)
def test_basic(scalar):
    """``explode`` works whatever kind of scalar is used as the column label."""
    ragged = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({scalar: ragged, "B": 1})

    result = frame.explode(scalar)

    # list-likes are flattened with their label repeated; NaN and the empty
    # list each become a single NaN row
    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({scalar: flattened, "B": 1})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_index_rows():
    """``explode`` repeats ``MultiIndex`` row labels for each exploded element."""
    row_labels = [("a", 1), ("a", 2), ("b", 1), ("b", 2)]
    frame = pd.DataFrame(
        {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
        index=pd.MultiIndex.from_tuples(row_labels),
    )

    result = frame.explode("A")

    # three elements for ("a", 1), one NaN each for ("a", 2) and ("b", 1),
    # two elements for ("b", 2)
    exploded_labels = pd.MultiIndex.from_tuples(
        [
            ("a", 1),
            ("a", 1),
            ("a", 1),
            ("a", 2),
            ("b", 1),
            ("b", 2),
            ("b", 2),
        ]
    )
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=exploded_labels,
        dtype=object,
    )
    expected = pd.DataFrame({"A": exploded, "B": 1})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_index_columns():
    """``explode`` accepts a tuple naming one column of a ``MultiIndex``."""
    target = ("A", 1)
    other = ("A", 2)
    frame = pd.DataFrame(
        {target: np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), other: 1}
    )

    result = frame.explode(target)

    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
        dtype=object,
    )
    expected = pd.DataFrame({target: exploded, other: 1})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecase():
    """Two motivating use cases for ``explode`` (gh-10511 and gh-8517)."""
    # explode a single column
    # gh-10511
    rows = [[11, range(5), 10], [22, range(3), 20]]
    frame = pd.DataFrame(rows, columns=list("ABC")).set_index("C")
    result = frame.explode("B")

    flat = pd.DataFrame(
        {
            "A": [11, 11, 11, 11, 11, 22, 22, 22],
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10, 10, 10, 10, 10, 20, 20, 20],
        },
        columns=list("ABC"),
    )
    tm.assert_frame_equal(result, flat.set_index("C"))

    # gh-8517
    field_names = ["dt", "name", "text"]
    frame = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=field_names,
    )
    # split the text on spaces, then give every token its own row
    tokens = frame.text.str.split(" ")
    result = frame.assign(text=tokens).explode("text")

    expected = pd.DataFrame(
        [
            ["2014-01-01", "Alice", "A"],
            ["2014-01-01", "Alice", "B"],
            ["2014-01-02", "Bob", "C"],
            ["2014-01-02", "Bob", "D"],
        ],
        columns=field_names,
        index=[0, 0, 1, 1],
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_dict, input_index, expected_dict, expected_index",
    [
        # every case uses the same data; only the (duplicated) index varies
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            index_before,
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            index_after,
        )
        for index_before, index_after in [
            # plain list
            ([0, 0], [0, 0, 0, 0]),
            # named Index
            (
                pd.Index([0, 0], name="my_index"),
                pd.Index([0, 0, 0, 0], name="my_index"),
            ),
            # MultiIndex with every level named
            (
                pd.MultiIndex.from_arrays(
                    [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
                ),
                pd.MultiIndex.from_arrays(
                    [[0, 0, 0, 0], [1, 1, 1, 1]],
                    names=["my_first_index", "my_second_index"],
                ),
            ),
            # MultiIndex with an unnamed level
            (
                pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
                pd.MultiIndex.from_arrays(
                    [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
                ),
            ),
        ]
    ],
)
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
    """GH 28005: ``explode`` handles frames whose index has duplicate labels."""
    frame = pd.DataFrame(input_dict, index=input_index, dtype=object)
    expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)

    tm.assert_frame_equal(frame.explode("col1"), expected)
|
||||
|
||||
|
||||
def test_ignore_index():
    """GH 34932: ``ignore_index=True`` gives the exploded frame a fresh index."""
    frame = pd.DataFrame(
        {"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}
    )

    result = frame.explode("values", ignore_index=True)

    # four rows labelled 0..3 rather than the repeated source labels
    exploded = {"id": [0, 0, 10, 10], "values": list("abcd")}
    expected = pd.DataFrame(exploded, index=[0, 1, 2, 3])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_explode_sets():
    """``explode`` flattens set-valued cells.

    See https://github.com/pandas-dev/pandas/issues/35614
    """
    frame = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])

    # sets are unordered, so sort before comparing
    exploded = frame.explode(column="a")
    result = exploded.sort_values(by="a")

    expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_subset, expected_dict, expected_index",
    [
        # explode "A" and "C" together
        (
            list("AC"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
            },
            list("aaabcdde"),
        ),
        # explode "A" only: the cells of "C" are repeated as they are
        (
            list("A"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": [
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    "foo",
                    [],
                    ["d", "e"],
                    ["d", "e"],
                    np.nan,
                ],
            },
            list("aaabcdde"),
        ),
    ],
)
def test_multi_columns(input_subset, expected_dict, expected_index):
    """GH 39240: ``explode`` on a list of columns."""
    source = {
        "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
        "B": 1,
        "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
    }
    frame = pd.DataFrame(source, index=list("abcde"))

    result = frame.explode(input_subset)

    expected = pd.DataFrame(expected_dict, expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_columns_nan_empty():
    """GH 46084: multi-column ``explode`` where NaN and empty lists line up."""
    frame = pd.DataFrame(
        {
            "A": [[0, 1], [5], [], [2, 3]],
            "B": [9, 8, 7, 6],
            "C": [[1, 2], np.nan, [], [3, 4]],
        }
    )

    result = frame.explode(["A", "C"])

    exploded_a = np.array([0, 1, 5, np.nan, 2, 3], dtype=object)
    exploded_c = np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object)
    expected = pd.DataFrame(
        {"A": exploded_a, "B": [9, 9, 8, 7, 6, 6], "C": exploded_c},
        index=[0, 0, 1, 2, 3, 3],
    )
    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,932 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
NaT,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
TimedeltaIndex,
|
||||
Timestamp,
|
||||
date_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.frame.common import _check_mixed_float
|
||||
|
||||
|
||||
class TestFillNA:
|
||||
def test_fillna_dict_inplace_nonunique_columns(
    self, using_copy_on_write, warn_copy_on_write
):
    """In-place dict ``fillna`` on a frame whose columns all share the label "A"."""
    frame = DataFrame(
        {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]}
    )
    frame.columns = ["A", "A", "A"]
    before = frame[:]

    # TODO(CoW-warn) better warning message
    with tm.assert_cow_warning(warn_copy_on_write):
        frame.fillna({"A": 2}, inplace=True)
    # The first and third columns can be set inplace, while the second cannot.

    filled = DataFrame(
        {"A": [2.0] * 3, "B": [2, Timestamp(1), 2], "C": [2, "foo", 2]}
    )
    filled.columns = ["A", "A", "A"]
    tm.assert_frame_equal(frame, filled)

    # TODO: what's the expected/desired behavior with CoW?
    # memory sharing with the pre-fill view is only checked without CoW
    check_sharing = not using_copy_on_write
    if check_sharing:
        assert tm.shares_memory(frame.iloc[:, 0], before.iloc[:, 0])
    assert not tm.shares_memory(frame.iloc[:, 1], before.iloc[:, 1])
    if check_sharing:
        assert tm.shares_memory(frame.iloc[:, 2], before.iloc[:, 2])
|
||||
|
||||
@td.skip_array_manager_not_yet_implemented
def test_fillna_on_column_view(self, using_copy_on_write):
    """GH#46149: in-place ``fillna`` on a column avoids unnecessary copies."""
    backing = np.full((40, 50), np.nan)
    frame = DataFrame(backing, copy=False)
    first_column = backing[:, 0]

    if not using_copy_on_write:
        # the chained in-place call warns and writes through to the array
        with tm.assert_produces_warning(FutureWarning, match="inplace method"):
            frame[0].fillna(-1, inplace=True)
        assert (first_column == -1).all()
    else:
        # with CoW the chained call is flagged and the array stays untouched
        with tm.raises_chained_assignment_error():
            frame[0].fillna(-1, inplace=True)
        assert np.isnan(first_column).all()

    # i.e. we didn't create a new 49-column block
    assert len(frame._mgr.arrays) == 1
    assert np.shares_memory(frame.values, backing)
|
||||
|
||||
def test_fillna_datetime(self, datetime_frame):
|
||||
tf = datetime_frame
|
||||
tf.loc[tf.index[:5], "A"] = np.nan
|
||||
tf.loc[tf.index[-5:], "A"] = np.nan
|
||||
|
||||
zero_filled = datetime_frame.fillna(0)
|
||||
assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
padded = datetime_frame.fillna(method="pad")
|
||||
assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
|
||||
assert (
|
||||
padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
|
||||
).all()
|
||||
|
||||
msg = "Must specify a fill 'value' or 'method'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
datetime_frame.fillna()
|
||||
msg = "Cannot specify both 'value' and 'method'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
datetime_frame.fillna(5, method="ffill")
|
||||
|
||||
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
|
||||
def test_fillna_mixed_type(self, float_string_frame):
|
||||
mf = float_string_frame
|
||||
mf.loc[mf.index[5:20], "foo"] = np.nan
|
||||
mf.loc[mf.index[-10:], "A"] = np.nan
|
||||
# TODO: make stronger assertion here, GH 25640
|
||||
mf.fillna(value=0)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
mf.fillna(method="pad")
|
||||
|
||||
def test_fillna_mixed_float(self, mixed_float_frame):
|
||||
# mixed numeric (but no float16)
|
||||
mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
|
||||
mf.loc[mf.index[-10:], "A"] = np.nan
|
||||
result = mf.fillna(value=0)
|
||||
_check_mixed_float(result, dtype={"C": None})
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = mf.fillna(method="pad")
|
||||
_check_mixed_float(result, dtype={"C": None})
|
||||
|
||||
def test_fillna_empty(self, using_copy_on_write):
    """GH#2778: method-based ``fillna`` on a column of an empty frame."""
    if using_copy_on_write:
        pytest.skip("condition is unnecessary complex and is deprecated anyway")
    # empty frame (GH#2778)
    frame = DataFrame(columns=["x"])
    deprecation = "Series.fillna with 'method' is deprecated"
    for method in ("pad", "backfill"):
        # both the in-place and the copying call run under one warning check
        with tm.assert_produces_warning(FutureWarning, match=deprecation):
            frame.x.fillna(method=method, inplace=True)
            frame.x.fillna(method=method)
|
||||
|
||||
def test_fillna_different_dtype(self, using_infer_string):
    """GH#3386: fill an all-NaN column with a string through a dict value."""
    # with different dtype (GH#3386)
    frame = DataFrame(
        [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
    )
    expected = DataFrame(
        [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
    )

    # copying fill; a "Downcasting" warning is expected with string inference
    if not using_infer_string:
        result = frame.fillna({2: "foo"})
    else:
        with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
            result = frame.fillna({2: "foo"})
    tm.assert_frame_equal(result, expected)

    # in-place fill: same expectations, and the call returns None
    if not using_infer_string:
        return_value = frame.fillna({2: "foo"}, inplace=True)
    else:
        with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
            return_value = frame.fillna({2: "foo"}, inplace=True)
    tm.assert_frame_equal(frame, expected)
    assert return_value is None
|
||||
|
||||
def test_fillna_limit_and_value(self):
|
||||
# limit and value
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 3)))
|
||||
df.iloc[2:7, 0] = np.nan
|
||||
df.iloc[3:5, 2] = np.nan
|
||||
|
||||
expected = df.copy()
|
||||
expected.iloc[2, 0] = 999
|
||||
expected.iloc[3, 2] = 999
|
||||
result = df.fillna(999, limit=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datelike(self):
|
||||
# with datelike
|
||||
# GH#6344
|
||||
df = DataFrame(
|
||||
{
|
||||
"Date": [NaT, Timestamp("2014-1-1")],
|
||||
"Date2": [Timestamp("2013-1-1"), NaT],
|
||||
}
|
||||
)
|
||||
|
||||
expected = df.copy()
|
||||
expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
|
||||
result = df.fillna(value={"Date": df["Date2"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_tzaware(self):
|
||||
# with timezone
|
||||
# GH#15855
|
||||
df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]})
|
||||
exp = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
]
|
||||
}
|
||||
)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res = df.fillna(method="pad")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]})
|
||||
exp = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
]
|
||||
}
|
||||
)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res = df.fillna(method="bfill")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_fillna_tzaware_different_column(self):
|
||||
# with timezone in another column
|
||||
# GH#15522
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": date_range("20130101", periods=4, tz="US/Eastern"),
|
||||
"B": [1, 2, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna(method="pad")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": date_range("20130101", periods=4, tz="US/Eastern"),
|
||||
"B": [1.0, 2.0, 2.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_na_actions_categorical(self):
    """Exercise ``fillna`` and ``dropna`` on a frame with a categorical column holding a NaN."""
    categories = [1, 2, 3]
    frame = DataFrame(
        {
            "cats": Categorical([1, 2, 3, np.nan], categories=categories),
            "vals": ["a", "b", np.nan, "d"],
        }
    )
    filled_expected = DataFrame(
        {
            "cats": Categorical([1, 2, 3, 3], categories=categories),
            "vals": ["a", "b", "b", "d"],
        }
    )

    # fillna
    by_value = frame.fillna(value={"cats": 3, "vals": "b"})
    tm.assert_frame_equal(by_value, filled_expected)

    # 4 is not among the declared categories, so this fill must be rejected
    new_category_msg = "Cannot setitem on a Categorical with a new category"
    with pytest.raises(TypeError, match=new_category_msg):
        frame.fillna(value={"cats": 4, "vals": "c"})

    deprecation_msg = "DataFrame.fillna with 'method' is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation_msg):
        by_pad = frame.fillna(method="pad")
    tm.assert_frame_equal(by_pad, filled_expected)

    # dropna
    cats_na_dropped = DataFrame(
        {
            "cats": Categorical([1, 2, 3], categories=categories),
            "vals": ["a", "b", np.nan],
        }
    )
    tm.assert_frame_equal(frame.dropna(subset=["cats"]), cats_na_dropped)

    any_na_dropped = DataFrame(
        {
            "cats": Categorical([1, 2], categories=categories),
            "vals": ["a", "b"],
        }
    )
    tm.assert_frame_equal(frame.dropna(), any_na_dropped)

    # make sure that fillna takes missing values into account
    letters = ["a", "b"]
    with_gaps = DataFrame(
        {
            "cats": Categorical([np.nan, "b", np.nan], categories=letters),
            "vals": [1, 2, 3],
        }
    )
    without_gaps = DataFrame(
        {
            "cats": Categorical(["a", "b", "a"], categories=letters),
            "vals": [1, 2, 3],
        }
    )
    tm.assert_frame_equal(with_gaps.fillna("a"), without_gaps)
|
||||
|
||||
def test_fillna_categorical_nan(self):
|
||||
# GH#14021
|
||||
# np.nan should always be a valid filler
|
||||
cat = Categorical([np.nan, 2, np.nan])
|
||||
val = Categorical([np.nan, np.nan, np.nan])
|
||||
df = DataFrame({"cats": cat, "vals": val})
|
||||
|
||||
# GH#32950 df.median() is poorly behaved because there is no
|
||||
# Categorical.median
|
||||
median = Series({"cats": 2.0, "vals": np.nan})
|
||||
|
||||
res = df.fillna(median)
|
||||
v_exp = [np.nan, np.nan, np.nan]
|
||||
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
result = df.cats.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.cats)
|
||||
|
||||
result = df.vals.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.vals)
|
||||
|
||||
idx = DatetimeIndex(
|
||||
["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT]
|
||||
)
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=NaT), df)
|
||||
|
||||
idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M")
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=NaT), df)
|
||||
|
||||
idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT])
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=NaT), df)
|
||||
|
||||
def test_fillna_downcast(self):
|
||||
# GH#15277
|
||||
# infer int64 from float64
|
||||
df = DataFrame({"a": [1.0, np.nan]})
|
||||
msg = "The 'downcast' keyword in fillna is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna(0, downcast="infer")
|
||||
expected = DataFrame({"a": [1, 0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# infer int64 from float64 when fillna value is a dict
|
||||
df = DataFrame({"a": [1.0, np.nan]})
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna({"a": 0}, downcast="infer")
|
||||
expected = DataFrame({"a": [1, 0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_downcast_false(self, frame_or_series):
|
||||
# GH#45603 preserve object dtype with downcast=False
|
||||
obj = frame_or_series([1, 2, 3], dtype="object")
|
||||
msg = "The 'downcast' keyword in fillna"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = obj.fillna("", downcast=False)
|
||||
tm.assert_equal(result, obj)
|
||||
|
||||
def test_fillna_downcast_noop(self, frame_or_series):
|
||||
# GH#45423
|
||||
# Two relevant paths:
|
||||
# 1) not _can_hold_na (e.g. integer)
|
||||
# 2) _can_hold_na + noop + not can_hold_element
|
||||
|
||||
obj = frame_or_series([1, 2, 3], dtype=np.int64)
|
||||
|
||||
msg = "The 'downcast' keyword in fillna"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
# GH#40988
|
||||
res = obj.fillna("foo", downcast=np.dtype(np.int32))
|
||||
expected = obj.astype(np.int32)
|
||||
tm.assert_equal(res, expected)
|
||||
|
||||
obj2 = obj.astype(np.float64)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res2 = obj2.fillna("foo", downcast="infer")
|
||||
expected2 = obj # get back int64
|
||||
tm.assert_equal(res2, expected2)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
# GH#40988
|
||||
res3 = obj2.fillna("foo", downcast=np.dtype(np.int32))
|
||||
tm.assert_equal(res3, expected)
|
||||
|
||||
@pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]])
def test_fillna_dictlike_value_duplicate_colnames(self, columns):
    """Fill by column label when that label appears more than once.

    GH#43476
    """
    frame = DataFrame(np.nan, index=[0, 1], columns=columns)

    # assigning to "A" sets every column carrying that label
    expected = frame.copy()
    expected["A"] = 0.0

    # the dict-based fill must not emit any warning
    with tm.assert_produces_warning(None):
        result = frame.fillna({"A": 0})

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_dtype_conversion(self, using_infer_string):
|
||||
# make sure that fillna on an empty frame works
|
||||
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = df.dtypes
|
||||
expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
msg = "Downcasting object dtype arrays"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna(1)
|
||||
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# empty block
|
||||
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
|
||||
if using_infer_string:
|
||||
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
|
||||
result = df.fillna("nan")
|
||||
else:
|
||||
result = df.fillna("nan")
|
||||
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
def test_fillna_dtype_conversion_equiv_replace(self, val):
    """``fillna(val)`` must give the same frame as ``replace(np.nan, val)``."""
    frame = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})

    via_fillna = frame.fillna(val)
    via_replace = frame.replace(np.nan, val)

    tm.assert_frame_equal(via_fillna, via_replace)
|
||||
|
||||
def test_fillna_datetime_columns(self):
|
||||
# GH#7095
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [-1, -2, np.nan],
|
||||
"B": date_range("20130101", periods=3),
|
||||
"C": ["foo", "bar", None],
|
||||
"D": ["foo2", "bar2", None],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
result = df.fillna("?")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [-1, -2, "?"],
|
||||
"B": date_range("20130101", periods=3),
|
||||
"C": ["foo", "bar", "?"],
|
||||
"D": ["foo2", "bar2", "?"],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [-1, -2, np.nan],
|
||||
"B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT],
|
||||
"C": ["foo", "bar", None],
|
||||
"D": ["foo2", "bar2", None],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
result = df.fillna("?")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [-1, -2, "?"],
|
||||
"B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"],
|
||||
"C": ["foo", "bar", "?"],
|
||||
"D": ["foo2", "bar2", "?"],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self, datetime_frame):
|
||||
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
|
||||
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
alt = datetime_frame.fillna(method="ffill")
|
||||
tm.assert_frame_equal(datetime_frame.ffill(), alt)
|
||||
|
||||
def test_bfill(self, datetime_frame):
|
||||
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
|
||||
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
alt = datetime_frame.fillna(method="bfill")
|
||||
|
||||
tm.assert_frame_equal(datetime_frame.bfill(), alt)
|
||||
|
||||
def test_frame_pad_backfill_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)
|
||||
|
||||
result = df[:2].reindex(index, method="pad", limit=5)
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df[:2].reindex(index).fillna(method="pad")
|
||||
expected.iloc[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index, method="backfill", limit=5)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df[-2:].reindex(index).fillna(method="backfill")
|
||||
expected.iloc[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_fillna_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index)
|
||||
|
||||
result = df[:2].reindex(index)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = result.fillna(method="pad", limit=5)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df[:2].reindex(index).fillna(method="pad")
|
||||
expected.iloc[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = result.fillna(method="backfill", limit=5)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df[-2:].reindex(index).fillna(method="backfill")
|
||||
expected.iloc[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_skip_certain_blocks(self):
|
||||
# don't try to fill boolean, int blocks
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)).astype(int))
|
||||
|
||||
# it works!
|
||||
df.fillna(np.nan)
|
||||
|
||||
@pytest.mark.parametrize("type", [int, float])
def test_fillna_positive_limit(self, type):
    """A negative ``limit`` is rejected with a ``ValueError``."""
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(values).astype(type)

    with pytest.raises(ValueError, match="Limit must be greater than 0"):
        frame.fillna(0, limit=-5)
|
||||
|
||||
@pytest.mark.parametrize("type", [int, float])
def test_fillna_integer_limit(self, type):
    """A non-integer ``limit`` is rejected with a ``ValueError``."""
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(values).astype(type)

    with pytest.raises(ValueError, match="Limit must be an integer"):
        frame.fillna(0, limit=0.5)
|
||||
|
||||
def test_fillna_inplace(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
|
||||
df.loc[:4, 1] = np.nan
|
||||
df.loc[-4:, 3] = np.nan
|
||||
|
||||
expected = df.fillna(value=0)
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(value=0, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
expected = df.fillna(value={0: 0}, inplace=True)
|
||||
assert expected is None
|
||||
|
||||
df.loc[:4, 1] = np.nan
|
||||
df.loc[-4:, 3] = np.nan
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df.fillna(method="ffill")
|
||||
assert expected is not df
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.fillna(method="ffill", inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_dict_series(self):
    """Fill with a dict or a Series keyed by column label."""
    frame = DataFrame(
        {
            "a": [np.nan, 1, 2, np.nan, np.nan],
            "b": [1, 2, 3, np.nan, np.nan],
            "c": [np.nan, 1, 2, 3, 4],
        }
    )

    # a dict fills each named column with its own value
    per_column = {"a": 0, "b": 5}
    result = frame.fillna(per_column)
    expected = frame.copy()
    for column, fill_value in per_column.items():
        expected[column] = expected[column].fillna(fill_value)
    tm.assert_frame_equal(result, expected)

    # it works (the dict names a column "d" that the frame does not have)
    frame.fillna({"a": 0, "b": 5, "d": 7})

    # Series treated same as dict
    column_max = frame.max()
    tm.assert_frame_equal(
        frame.fillna(column_max), frame.fillna(column_max.to_dict())
    )

    # disable this for now
    with pytest.raises(NotImplementedError, match="column by column"):
        frame.fillna(frame.max(1), axis=1)
|
||||
|
||||
def test_fillna_dataframe(self):
|
||||
# GH#8377
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 1, 2, np.nan, np.nan],
|
||||
"b": [1, 2, 3, np.nan, np.nan],
|
||||
"c": [np.nan, 1, 2, 3, 4],
|
||||
},
|
||||
index=list("VWXYZ"),
|
||||
)
|
||||
|
||||
# df2 may have different index and columns
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 10, 20, 30, 40],
|
||||
"b": [50, 60, 70, 80, 90],
|
||||
"foo": ["bar"] * 5,
|
||||
},
|
||||
index=list("VWXuZ"),
|
||||
)
|
||||
|
||||
result = df.fillna(df2)
|
||||
|
||||
# only those columns and indices which are shared get filled
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 1, 2, np.nan, 40],
|
||||
"b": [1, 2, 3, np.nan, 90],
|
||||
"c": [np.nan, 1, 2, 3, 4],
|
||||
},
|
||||
index=list("VWXYZ"),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_columns(self):
|
||||
arr = np.random.default_rng(2).standard_normal((10, 10))
|
||||
arr[:, ::2] = np.nan
|
||||
df = DataFrame(arr)
|
||||
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna(method="ffill", axis=1)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df.T.fillna(method="pad").T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df.insert(6, "foo", 5)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna(method="ffill", axis=1)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
expected = df.astype(float).fillna(method="ffill", axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_invalid_method(self, float_frame):
    """A misspelled ``method`` raises ``ValueError`` whose message contains the bad name."""
    bad_method = "ffil"
    with pytest.raises(ValueError, match=bad_method):
        float_frame.fillna(method=bad_method)
|
||||
|
||||
def test_fillna_invalid_value(self, float_frame):
    """Lists, tuples and (for a Series) DataFrames are rejected as fill values."""
    frame_msg = '"value" parameter must be a scalar or dict, but you passed a "{}"'

    # list, then tuple
    for bad_value, type_name in (([1, 2], "list"), ((1, 2), "tuple")):
        with pytest.raises(TypeError, match=frame_msg.format(type_name)):
            float_frame.fillna(bad_value)

    # frame with series
    series_msg = (
        '"value" parameter must be a scalar, dict or Series, but you '
        'passed a "DataFrame"'
    )
    first_column = float_frame.iloc[:, 0]
    with pytest.raises(TypeError, match=series_msg):
        first_column.fillna(float_frame)
|
||||
|
||||
def test_fillna_col_reordering(self):
|
||||
cols = ["COL." + str(i) for i in range(5, 0, -1)]
|
||||
data = np.random.default_rng(2).random((20, 5))
|
||||
df = DataFrame(index=range(20), columns=cols, data=data)
|
||||
msg = "DataFrame.fillna with 'method' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
filled = df.fillna(method="ffill")
|
||||
assert df.columns.tolist() == filled.columns.tolist()
|
||||
|
||||
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
    """Corner cases: filling a mixed-dtype frame with 0 and filling a frame with no columns."""
    mixed = float_string_frame
    foo_gap = mixed.index[5:20]
    mixed.loc[foo_gap, "foo"] = np.nan
    mixed.loc[mixed.index[-10:], "A"] = np.nan

    filled = mixed.fillna(value=0)
    # the blanked rows of "foo" now hold the fill value
    assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
    del mixed["foo"]

    # a frame without columns: only checks that the call goes through
    float_frame.reindex(columns=[]).fillna(value=0)
|
||||
|
||||
def test_fillna_downcast_dict(self):
|
||||
# GH#40809
|
||||
df = DataFrame({"col1": [1, np.nan]})
|
||||
|
||||
msg = "The 'downcast' keyword in fillna"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.fillna({"col1": 2}, downcast={"col1": "int64"})
|
||||
expected = DataFrame({"col1": [1, 2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_with_columns_and_limit(self):
|
||||
# GH40989
|
||||
df = DataFrame(
|
||||
[
|
||||
[np.nan, 2, np.nan, 0],
|
||||
[3, 4, np.nan, 1],
|
||||
[np.nan, np.nan, np.nan, 5],
|
||||
[np.nan, 3, np.nan, 4],
|
||||
],
|
||||
columns=list("ABCD"),
|
||||
)
|
||||
result = df.fillna(axis=1, value=100, limit=1)
|
||||
result2 = df.fillna(axis=1, value=100, limit=2)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": Series([100, 3, 100, 100], dtype="float64"),
|
||||
"B": [2, 4, np.nan, 3],
|
||||
"C": [np.nan, 100, np.nan, np.nan],
|
||||
"D": Series([0, 1, 5, 4], dtype="float64"),
|
||||
},
|
||||
index=[0, 1, 2, 3],
|
||||
)
|
||||
expected2 = DataFrame(
|
||||
{
|
||||
"A": Series([100, 3, 100, 100], dtype="float64"),
|
||||
"B": Series([2, 4, 100, 3], dtype="float64"),
|
||||
"C": [100, 100, np.nan, 100],
|
||||
"D": Series([0, 1, 5, 4], dtype="float64"),
|
||||
},
|
||||
index=[0, 1, 2, 3],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result2, expected2)
|
||||
|
||||
def test_fillna_datetime_inplace(self):
|
||||
# GH#48863
|
||||
df = DataFrame(
|
||||
{
|
||||
"date1": to_datetime(["2018-05-30", None]),
|
||||
"date2": to_datetime(["2018-09-30", None]),
|
||||
}
|
||||
)
|
||||
expected = df.copy()
|
||||
df.fillna(np.nan, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_inplace_with_columns_limit_and_value(self):
|
||||
# GH40989
|
||||
df = DataFrame(
|
||||
[
|
||||
[np.nan, 2, np.nan, 0],
|
||||
[3, 4, np.nan, 1],
|
||||
[np.nan, np.nan, np.nan, 5],
|
||||
[np.nan, 3, np.nan, 4],
|
||||
],
|
||||
columns=list("ABCD"),
|
||||
)
|
||||
|
||||
expected = df.fillna(axis=1, value=100, limit=1)
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(axis=1, value=100, limit=1, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}])
def test_inplace_dict_update_view(
    self, val, using_copy_on_write, warn_copy_on_write
):
    """In-place fill and its effect on a slice taken before the fill.

    GH#47188
    """
    frame = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]})
    before_fill = frame.copy()
    view = frame[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        frame.fillna(val, inplace=True)

    after_fill = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]})
    tm.assert_frame_equal(frame, after_fill)

    # with copy-on-write the earlier slice keeps the old values,
    # otherwise it is expected to show the filled ones
    view_expected = before_fill if using_copy_on_write else after_fill
    tm.assert_frame_equal(view, view_expected)
|
||||
|
||||
def test_single_block_df_with_horizontal_axis(self):
|
||||
# GH 47713
|
||||
df = DataFrame(
|
||||
{
|
||||
"col1": [5, 0, np.nan, 10, np.nan],
|
||||
"col2": [7, np.nan, np.nan, 5, 3],
|
||||
"col3": [12, np.nan, 1, 2, 0],
|
||||
"col4": [np.nan, 1, 1, np.nan, 18],
|
||||
}
|
||||
)
|
||||
result = df.fillna(50, limit=1, axis=1)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[5.0, 7.0, 12.0, 50.0],
|
||||
[0.0, 50.0, np.nan, 1.0],
|
||||
[50.0, np.nan, 1.0, 1.0],
|
||||
[10.0, 5.0, 2.0, 50.0],
|
||||
[50.0, 3.0, 0.0, 18.0],
|
||||
],
|
||||
columns=["col1", "col2", "col3", "col4"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_with_multi_index_frame(self):
|
||||
# GH 47649
|
||||
pdf = DataFrame(
|
||||
{
|
||||
("x", "a"): [np.nan, 2.0, 3.0],
|
||||
("x", "b"): [1.0, 2.0, np.nan],
|
||||
("y", "c"): [1.0, 2.0, np.nan],
|
||||
}
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
("x", "a"): [-1.0, 2.0, 3.0],
|
||||
("x", "b"): [1.0, 2.0, -1.0],
|
||||
("y", "c"): [1.0, 2.0, np.nan],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(pdf.fillna({"x": -1}), expected)
|
||||
tm.assert_frame_equal(pdf.fillna({"x": -1, ("x", "b"): -2}), expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
("x", "a"): [-1.0, 2.0, 3.0],
|
||||
("x", "b"): [1.0, 2.0, -2.0],
|
||||
("y", "c"): [1.0, 2.0, np.nan],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(pdf.fillna({("x", "b"): -2, "x": -1}), expected)
|
||||
|
||||
|
||||
def test_fillna_nonconsolidated_frame():
    """After ``fillna(0)`` a pivoted frame must contain no missing values."""
    # https://github.com/pandas-dev/pandas/issues/36495
    rows = [[i, i, i, float(i)] for i in (1, 2, 3)]
    frame = DataFrame(rows, columns=["i1", "i2", "i3", "f1"])

    pivoted = frame.pivot(index="i1", columns="i2")
    filled = pivoted.fillna(0)

    assert filled.isna().sum().sum() == 0
|
||||
|
||||
|
||||
def test_fillna_nones_inplace():
|
||||
# GH 48480
|
||||
df = DataFrame(
|
||||
[[None, None], [None, None]],
|
||||
columns=["A", "B"],
|
||||
)
|
||||
msg = "Downcasting object dtype arrays"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.fillna(value={"A": 1, "B": 2}, inplace=True)
|
||||
|
||||
expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["pad", "backfill"])
def test_pad_backfill_deprecated(func):
    """Calling ``DataFrame.pad`` / ``DataFrame.backfill`` emits a ``FutureWarning``.

    GH#33396
    """
    frame = DataFrame({"a": [1, 2, 3]})
    deprecated_method = getattr(frame, func)
    with tm.assert_produces_warning(FutureWarning):
        deprecated_method()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected_data, method, kwargs",
    [
        # --- ffill ---
        # inside: only gaps enclosed by valid values are filled
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
            "ffill",
            {"limit_area": "inside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
            "ffill",
            {"limit_area": "inside", "limit": 1},
        ),
        # outside: only gaps beyond the outermost valid values are filled
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
            "ffill",
            {"limit_area": "outside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        # nothing but missing values: unchanged
        (
            [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        # no missing values at all: unchanged
        (
            range(5),
            range(5),
            "ffill",
            {"limit_area": "outside", "limit": 1},
        ),
        # --- bfill ---
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "inside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "inside", "limit": 1},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "outside"},
        ),
        (
            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
            [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
            "bfill",
            {"limit_area": "outside", "limit": 1},
        ),
    ],
)
def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
    """``ffill`` / ``bfill`` with ``limit_area`` (and optionally ``limit``).

    GH#56492
    """
    frame = DataFrame(data)
    fill = getattr(frame, method)
    tm.assert_frame_equal(fill(**kwargs), DataFrame(expected_data))
|
@ -0,0 +1,153 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameFilter:
    """Tests for ``DataFrame.filter`` with ``items``, ``like``, ``regex`` and ``axis``."""

    def test_filter(self, float_frame, float_string_frame):
        """Walk through the main ``filter`` modes and its argument validation."""
        # Items: labels that are absent ("E") are dropped silently
        filtered = float_frame.filter(["A", "B", "E"])
        assert len(filtered.columns) == 2
        assert "E" not in filtered

        filtered = float_frame.filter(["A", "B", "E"], axis="columns")
        assert len(filtered.columns) == 2
        assert "E" not in filtered

        # Other axis
        idx = float_frame.index[0:4]
        filtered = float_frame.filter(idx, axis="index")
        expected = float_frame.reindex(index=idx)
        tm.assert_frame_equal(filtered, expected)

        # like
        fcopy = float_frame.copy()
        fcopy["AA"] = 1

        filtered = fcopy.filter(like="A")
        assert len(filtered.columns) == 2
        assert "AA" in filtered

        # like with ints in column names
        df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"])
        filtered = df.filter(like="_")
        assert len(filtered.columns) == 2

        # regex with ints in column names
        # from PR #10384
        df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"])
        expected = DataFrame(
            0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object)
        )
        filtered = df.filter(regex="^[0-9]+$")
        tm.assert_frame_equal(filtered, expected)

        expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"])
        # shouldn't remove anything
        filtered = expected.filter(regex="^[0-9]+$")
        tm.assert_frame_equal(filtered, expected)

        # pass in None
        with pytest.raises(TypeError, match="Must pass"):
            float_frame.filter()
        with pytest.raises(TypeError, match="Must pass"):
            float_frame.filter(items=None)
        with pytest.raises(TypeError, match="Must pass"):
            float_frame.filter(axis=1)

        # test mutually exclusive arguments
        with pytest.raises(TypeError, match="mutually exclusive"):
            float_frame.filter(items=["one", "three"], regex="e$", like="bbi")
        with pytest.raises(TypeError, match="mutually exclusive"):
            float_frame.filter(items=["one", "three"], regex="e$", axis=1)
        with pytest.raises(TypeError, match="mutually exclusive"):
            float_frame.filter(items=["one", "three"], regex="e$")
        with pytest.raises(TypeError, match="mutually exclusive"):
            float_frame.filter(items=["one", "three"], like="bbi", axis=0)
        with pytest.raises(TypeError, match="mutually exclusive"):
            float_frame.filter(items=["one", "three"], like="bbi")

        # objects
        filtered = float_string_frame.filter(like="foo")
        assert "foo" in filtered

        # unicode columns, won't ascii-encode
        df = float_frame.rename(columns={"B": "\u2202"})
        filtered = df.filter(like="C")
        assert "C" in filtered

    def test_filter_regex_search(self, float_frame):
        """``regex`` matches anywhere in the label, not only at its start."""
        fcopy = float_frame.copy()
        fcopy["AA"] = 1

        # regex
        filtered = fcopy.filter(regex="[A]+")
        assert len(filtered.columns) == 2
        assert "AA" in filtered

        # doesn't have to be at beginning
        df = DataFrame(
            {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]}
        )

        result = df.filter(regex="BB")
        exp = df[[x for x in df.columns if "BB" in x]]
        tm.assert_frame_equal(result, exp)

    @pytest.mark.parametrize(
        "name,expected",
        [
            # the ("a", ...) case used to be listed twice; one copy is enough
            ("a", DataFrame({"a": [1, 2]})),
            ("あ", DataFrame({"あ": [3, 4]})),
        ],
    )
    def test_filter_unicode(self, name, expected):
        """``like`` and ``regex`` both work with non-ASCII column labels."""
        # GH13101
        df = DataFrame({"a": [1, 2], "あ": [3, 4]})

        tm.assert_frame_equal(df.filter(like=name), expected)
        tm.assert_frame_equal(df.filter(regex=name), expected)

    # the value "a" used to be listed twice; one copy is enough
    @pytest.mark.parametrize("name", ["a"])
    def test_filter_bytestring(self, name):
        """A ``str`` pattern selects a ``bytes`` column label."""
        # GH13101
        df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
        expected = DataFrame({b"a": [1, 2]})

        tm.assert_frame_equal(df.filter(like=name), expected)
        tm.assert_frame_equal(df.filter(regex=name), expected)

    def test_filter_corner(self):
        """Filtering an empty frame returns an empty frame."""
        empty = DataFrame()

        result = empty.filter([])
        tm.assert_frame_equal(result, empty)

        result = empty.filter(like="foo")
        tm.assert_frame_equal(result, empty)

    def test_filter_regex_non_string(self):
        """Non-string column labels are dropped by a regex filter."""
        # GH#5798 trying to filter on non-string columns should drop,
        #  not raise
        df = DataFrame(
            np.random.default_rng(2).random((3, 2)), columns=["STRING", 123]
        )
        result = df.filter(regex="STRING")
        expected = df[["STRING"]]
        tm.assert_frame_equal(result, expected)

    def test_filter_keep_order(self):
        """The result follows the order of ``items``, not the frame's column order."""
        # GH#54980
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        result = df.filter(items=["B", "A"])
        expected = df[["B", "A"]]
        tm.assert_frame_equal(result, expected)

    def test_filter_different_dtype(self):
        """String ``items`` against integer column labels select nothing."""
        # GH#54980
        df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]})
        result = df.filter(items=["B", "A"])
        expected = df[[]]
        tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,143 @@
|
||||
"""
|
||||
Note: includes tests for `last`
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
bdate_range,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
# Patterns matched against the FutureWarning text when calling the
# deprecated ``first`` / ``last`` methods in the tests below.
deprecated_msg = "first is deprecated"
last_deprecated_msg = "last is deprecated"
|
||||
|
||||
|
||||
class TestFirst:
|
||||
def test_first_subset(self, frame_or_series):
    """``first(offset)`` returns the leading rows covered by the offset."""

    def make_obj(freq):
        # 100 rows starting 2000-01-01 at the given frequency,
        # converted to the container type under test
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((100, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=date_range("2000-01-01", periods=100, freq=freq),
        )
        return tm.get_obj(frame, frame_or_series)

    # two rows per day -> ten days cover twenty rows
    half_daily = make_obj("12h")
    with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
        head = half_daily.first("10d")
    assert len(head) == 20

    # one row per day -> ten days cover ten rows
    daily = make_obj("D")
    with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
        head = daily.first("10d")
    assert len(head) == 10

    with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
        head = daily.first("3ME")
    tm.assert_equal(head, daily[:"3/31/2000"])

    with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
        head = daily.first("21D")
    tm.assert_equal(head, daily[:21])

    # an empty object stays empty
    empty = daily[:0]
    with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
        head = empty.first("3ME")
    tm.assert_equal(head, daily[:0])
|
||||
|
||||
def test_first_last_raises(self, frame_or_series):
|
||||
# GH#20725
|
||||
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = "'first' only supports a DatetimeIndex index"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=deprecated_msg
|
||||
), pytest.raises(
|
||||
TypeError, match=msg
|
||||
): # index is not a DatetimeIndex
|
||||
obj.first("1D")
|
||||
|
||||
msg = "'last' only supports a DatetimeIndex index"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=last_deprecated_msg
|
||||
), pytest.raises(
|
||||
TypeError, match=msg
|
||||
): # index is not a DatetimeIndex
|
||||
obj.last("1D")
|
||||
|
||||
def test_last_subset(self, frame_or_series):
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((100, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=100, freq="12h"),
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = ts.last("10d")
|
||||
assert len(result) == 20
|
||||
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=30, freq="D"),
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = ts.last("10d")
|
||||
assert len(result) == 10
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = ts.last("21D")
|
||||
expected = ts["2000-01-10":]
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = ts.last("21D")
|
||||
expected = ts[-21:]
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = ts[:0].last("3ME")
|
||||
tm.assert_equal(result, ts[:0])
|
||||
|
||||
@pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)])
|
||||
def test_first_with_first_day_last_of_month(self, frame_or_series, start, periods):
|
||||
# GH#29623
|
||||
x = frame_or_series([1] * 100, index=bdate_range(start, periods=100))
|
||||
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
|
||||
result = x.first("1ME")
|
||||
expected = frame_or_series(
|
||||
[1] * periods, index=bdate_range(start, periods=periods)
|
||||
)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series):
|
||||
# GH#29623
|
||||
x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100))
|
||||
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
|
||||
result = x.first("2ME")
|
||||
expected = frame_or_series(
|
||||
[1] * 23, index=bdate_range("2010-03-31", "2010-04-30")
|
||||
)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_empty_not_input(self):
|
||||
# GH#51032
|
||||
df = DataFrame(index=pd.DatetimeIndex([]))
|
||||
with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg):
|
||||
result = df.last(offset=1)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=deprecated_msg):
|
||||
result = df.first(offset=1)
|
||||
|
||||
tm.assert_frame_equal(df, result)
|
||||
assert df is not result
|
@ -0,0 +1,78 @@
|
||||
"""
|
||||
Includes test for last_valid_index.
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
|
||||
|
||||
class TestFirstValidIndex:
|
||||
def test_first_valid_index_single_nan(self, frame_or_series):
|
||||
# GH#9752 Series/DataFrame should both return None, not raise
|
||||
obj = frame_or_series([np.nan])
|
||||
|
||||
assert obj.first_valid_index() is None
|
||||
assert obj.iloc[:0].first_valid_index() is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"empty", [DataFrame(), Series(dtype=object), Series([], index=[], dtype=object)]
|
||||
)
|
||||
def test_first_valid_index_empty(self, empty):
|
||||
# GH#12800
|
||||
assert empty.last_valid_index() is None
|
||||
assert empty.first_valid_index() is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,idx,expected_first,expected_last",
|
||||
[
|
||||
({"A": [1, 2, 3]}, [1, 1, 2], 1, 2),
|
||||
({"A": [1, 2, 3]}, [1, 2, 2], 1, 2),
|
||||
({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"),
|
||||
({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2),
|
||||
({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
|
||||
({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2),
|
||||
],
|
||||
)
|
||||
def test_first_last_valid_frame(self, data, idx, expected_first, expected_last):
|
||||
# GH#21441
|
||||
df = DataFrame(data, index=idx)
|
||||
assert expected_first == df.first_valid_index()
|
||||
assert expected_last == df.last_valid_index()
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index",
|
||||
[Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)],
|
||||
)
|
||||
def test_first_last_valid(self, index):
|
||||
mat = np.random.default_rng(2).standard_normal(len(index))
|
||||
mat[:5] = np.nan
|
||||
mat[-5:] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=index)
|
||||
assert frame.first_valid_index() == frame.index[5]
|
||||
assert frame.last_valid_index() == frame.index[-6]
|
||||
|
||||
ser = frame["foo"]
|
||||
assert ser.first_valid_index() == frame.index[5]
|
||||
assert ser.last_valid_index() == frame.index[-6]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index",
|
||||
[Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)],
|
||||
)
|
||||
def test_first_last_valid_all_nan(self, index):
|
||||
# GH#17400: no valid entries
|
||||
frame = DataFrame(np.nan, columns=["foo"], index=index)
|
||||
|
||||
assert frame.last_valid_index() is None
|
||||
assert frame.first_valid_index() is None
|
||||
|
||||
ser = frame["foo"]
|
||||
assert ser.first_valid_index() is None
|
||||
assert ser.last_valid_index() is None
|
@ -0,0 +1,102 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import IntervalArray
|
||||
|
||||
|
||||
class TestGetNumericData:
|
||||
def test_get_numeric_data_preserve_dtype(self):
|
||||
# get the numeric data
|
||||
obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object"))
|
||||
result = obj._get_numeric_data()
|
||||
expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_numeric_data(self, using_infer_string):
|
||||
datetime64name = np.dtype("M8[s]").name
|
||||
objectname = np.dtype(np.object_).name
|
||||
|
||||
df = DataFrame(
|
||||
{"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")},
|
||||
index=np.arange(10),
|
||||
)
|
||||
result = df.dtypes
|
||||
expected = Series(
|
||||
[
|
||||
np.dtype("float64"),
|
||||
np.dtype("int64"),
|
||||
np.dtype(objectname) if not using_infer_string else "string",
|
||||
np.dtype(datetime64name),
|
||||
],
|
||||
index=["a", "b", "c", "f"],
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": 1.0,
|
||||
"b": 2,
|
||||
"c": "foo",
|
||||
"d": np.array([1.0] * 10, dtype="float32"),
|
||||
"e": np.array([1] * 10, dtype="int32"),
|
||||
"f": np.array([1] * 10, dtype="int16"),
|
||||
"g": Timestamp("20010102"),
|
||||
},
|
||||
index=np.arange(10),
|
||||
)
|
||||
|
||||
result = df._get_numeric_data()
|
||||
expected = df.loc[:, ["a", "b", "d", "e", "f"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
only_obj = df.loc[:, ["c", "g"]]
|
||||
result = only_obj._get_numeric_data()
|
||||
expected = df.loc[:, []]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]})
|
||||
result = df._get_numeric_data()
|
||||
expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = result.copy()
|
||||
result = df._get_numeric_data()
|
||||
expected = df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_numeric_data_mixed_dtype(self):
|
||||
# numeric and object columns
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"b": [True, False, True],
|
||||
"c": ["foo", "bar", "baz"],
|
||||
"d": [None, None, None],
|
||||
"e": [3.14, 0.577, 2.773],
|
||||
}
|
||||
)
|
||||
result = df._get_numeric_data()
|
||||
tm.assert_index_equal(result.columns, Index(["a", "b", "e"]))
|
||||
|
||||
def test_get_numeric_data_extension_dtype(self):
|
||||
# GH#22290
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": pd.array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
|
||||
"B": Categorical(list("abcabc")),
|
||||
"C": pd.array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
|
||||
"D": IntervalArray.from_breaks(range(7)),
|
||||
}
|
||||
)
|
||||
result = df._get_numeric_data()
|
||||
expected = df.loc[:, ["A", "C"]]
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,57 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_head_tail_generic(index, frame_or_series):
|
||||
# GH#5370
|
||||
|
||||
ndim = 2 if frame_or_series is DataFrame else 1
|
||||
shape = (len(index),) * ndim
|
||||
vals = np.random.default_rng(2).standard_normal(shape)
|
||||
obj = frame_or_series(vals, index=index)
|
||||
|
||||
tm.assert_equal(obj.head(), obj.iloc[:5])
|
||||
tm.assert_equal(obj.tail(), obj.iloc[-5:])
|
||||
|
||||
# 0-len
|
||||
tm.assert_equal(obj.head(0), obj.iloc[0:0])
|
||||
tm.assert_equal(obj.tail(0), obj.iloc[0:0])
|
||||
|
||||
# bounded
|
||||
tm.assert_equal(obj.head(len(obj) + 1), obj)
|
||||
tm.assert_equal(obj.tail(len(obj) + 1), obj)
|
||||
|
||||
# neg index
|
||||
tm.assert_equal(obj.head(-3), obj.head(len(index) - 3))
|
||||
tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3))
|
||||
|
||||
|
||||
def test_head_tail(float_frame):
|
||||
tm.assert_frame_equal(float_frame.head(), float_frame[:5])
|
||||
tm.assert_frame_equal(float_frame.tail(), float_frame[-5:])
|
||||
|
||||
tm.assert_frame_equal(float_frame.head(0), float_frame[0:0])
|
||||
tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0])
|
||||
|
||||
tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1])
|
||||
tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:])
|
||||
tm.assert_frame_equal(float_frame.head(1), float_frame[:1])
|
||||
tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:])
|
||||
# with a float index
|
||||
df = float_frame.copy()
|
||||
df.index = np.arange(len(float_frame)) + 0.1
|
||||
tm.assert_frame_equal(df.head(), df.iloc[:5])
|
||||
tm.assert_frame_equal(df.tail(), df.iloc[-5:])
|
||||
tm.assert_frame_equal(df.head(0), df[0:0])
|
||||
tm.assert_frame_equal(df.tail(0), df[0:0])
|
||||
tm.assert_frame_equal(df.head(-1), df.iloc[:-1])
|
||||
tm.assert_frame_equal(df.tail(-1), df.iloc[1:])
|
||||
|
||||
|
||||
def test_head_tail_empty():
    """``head``/``tail`` of an empty frame compare equal to the frame itself."""
    # test empty dataframe
    blank = DataFrame()
    for trimmed in (blank.tail(), blank.head()):
        tm.assert_frame_equal(trimmed, blank)
|
@ -0,0 +1,42 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestInferObjects:
    """Checks for ``DataFrame.infer_objects``."""

    def test_infer_objects(self):
        """Dropping the string row lets each column be inferred to a tighter dtype."""
        # GH#11221
        labels = ["a", "b", "c", "d"]
        stamps = [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)]
        raw = DataFrame(
            {
                "a": ["a", 1, 2, 3],
                "b": ["b", 2.0, 3.0, 4.1],
                "c": ["c", *stamps],
                "d": [1, 2, 3, "d"],
            },
            columns=labels,
        )
        inferred = raw.iloc[1:].infer_objects()

        # Column "d" still mixes ints and a string, so it stays object.
        wanted = {"a": "int64", "b": "float64", "c": "M8[ns]", "d": "object"}
        for label, dtype in wanted.items():
            assert inferred[label].dtype == dtype

        expected = DataFrame(
            {
                "a": [1, 2, 3],
                "b": [2.0, 3.0, 4.1],
                "c": stamps,
                "d": [2, 3, "d"],
            },
            columns=labels,
        )
        # reconstruct frame to verify inference is same
        tm.assert_frame_equal(inferred.reset_index(drop=True), expected)
|
@ -0,0 +1,565 @@
|
||||
from io import StringIO
|
||||
import re
|
||||
from string import ascii_uppercase
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import (
|
||||
IS64,
|
||||
PYPY,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def duplicate_columns_frame():
    """Return a 1500x4 random frame whose column labels repeat."""
    values = np.random.default_rng(2).standard_normal((1500, 4))
    labels = ["a", "a", "b", "b"]
    return DataFrame(values, columns=labels)
|
||||
|
||||
|
||||
def test_info_empty():
    """``info`` on an empty frame prints the three-line empty summary."""
    # GH #45494
    buf = StringIO()
    DataFrame().info(buf=buf)
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 0 entries
        Empty DataFrame\n"""
    )
    assert buf.getvalue() == expected
|
||||
|
||||
|
||||
def test_info_categorical_column_smoke_test():
    """Smoke test: ``info`` runs on a frame with a categorical column and on a filter of it."""
    n = 2500
    letters = np.array(list("abcdefghij"))
    df = DataFrame({"int64": np.random.default_rng(2).integers(100, size=n, dtype=int)})
    picks = np.random.default_rng(2).integers(0, 10, size=n, dtype=int)
    df["category"] = Series(letters.take(picks)).astype("category")
    df.isna()
    df.info(buf=StringIO())

    only_d = df[df["category"] == "d"]
    only_d.info(buf=StringIO())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
        "float_string_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    """Smoke test: the default report is longer than ten lines; the terse one just runs."""
    frame = request.getfixturevalue(fixture_func_name)

    full = StringIO()
    frame.info(buf=full)
    report_lines = full.getvalue().splitlines()
    assert len(report_lines) > 10

    terse = StringIO()
    frame.info(buf=terse, verbose=False)
|
||||
|
||||
|
||||
def test_info_smoke_test2(float_frame):
|
||||
# pretty useless test, used to be mixed into the repr tests
|
||||
buf = StringIO()
|
||||
float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
|
||||
float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)
|
||||
|
||||
# no columns or index
|
||||
DataFrame().info(buf=buf)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    """The default verbosity equals ``verbose`` for the given column limit."""
    frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns)))

    def report(**kwargs):
        sink = StringIO()
        frame.info(buf=sink, **kwargs)
        return sink.getvalue()

    with option_context("display.max_info_columns", max_info_columns):
        result = report()
        expected = report(verbose=verbose)
        assert result == expected
|
||||
|
||||
|
||||
def test_info_verbose_check_header_separator_body():
    """Verbose ``info`` on a wide frame prints the header and numbered rows.

    With 1001 columns the line-number column is five characters wide, so the
    header and separator are padded to that width; the padding in the expected
    literal must be exact for the substring check to match.
    """
    buf = StringIO()
    size = 1001
    start = 5  # class line, index line, column-count line, header, separator
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    # The second report is appended; only the first report's rows are checked.
    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if start <= i < start + size:
            line_nr = f" {i - start} "
            assert line.startswith(line_nr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode.

    The expected strings carry the exact padding: each column is as wide as
    its longest cell and columns are joined by two spaces, so the first
    column grows with the number of digits in ``size``.
    """
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    with StringIO() as buf:
        frame.info(verbose=True, show_counts=True, buf=buf)
        all_lines = buf.getvalue().splitlines()
    # Here table would contain only header, separator and table lines
    # dframe repr, index summary, memory usage and dtypes are excluded
    table = all_lines[3:-2]
    header, separator, first_line, *_, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp
|
||||
|
||||
|
||||
def test_info_memory():
    """``info`` reports the summed ``memory_usage`` in bytes on its last line."""
    # https://github.com/pandas-dev/pandas/issues/21056
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    # Named ``n_bytes`` so the builtin ``bytes`` is not shadowed.
    n_bytes = float(df.memory_usage().sum())
    # The table lines keep one leading space after dedent; padding is exact.
    expected = textwrap.dedent(
        f"""\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   a       2 non-null      int64
        dtypes: int64(1)
        memory usage: {n_bytes} bytes
        """
    )
    assert result == expected
|
||||
|
||||
|
||||
def test_info_wide():
    """Raising the column limit by argument or by option gives the same report."""
    df = DataFrame(np.random.default_rng(2).standard_normal((5, 101)))

    def report(**kwargs):
        sink = StringIO()
        df.info(buf=sink, **kwargs)
        return sink.getvalue()

    # Default settings: only checked for running cleanly.
    report()

    expected = report(max_cols=101)
    assert len(expected.splitlines()) > 100

    with option_context("display.max_info_columns", 101):
        result = report()
        assert result == expected
|
||||
|
||||
|
||||
def test_info_duplicate_columns_shows_correct_dtypes():
    """Each of two same-named columns is reported with its own dtype.

    The Dtype column is as wide as ``float64``, so the ``int64`` row ends in
    two padding spaces; the expected lines must carry that exact padding.
    """
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    lines = io.getvalue().splitlines(True)
    # Lines 0-4 are the class, index, column-count, header and separator lines.
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]
|
||||
|
||||
|
||||
def test_info_shows_column_dtypes():
    """Every column's dtype appears in its row of the verbose table.

    The Dtype column is padded to the longest dtype name
    (``timedelta64[ns]``), which fixes the trailing spaces in the header and
    separator; row prefixes use the same two-space column gap.
    """
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    data = {
        i: np.random.default_rng(2).integers(2, size=n).astype(dtype)
        for i, dtype in enumerate(dtypes)
    }
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res
|
||||
|
||||
|
||||
def test_info_max_cols():
    """Report length follows ``verbose``, ``max_cols`` and the column-limit option."""
    df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))

    def line_count(limit, **kwargs):
        # Number of lines ``info`` prints while the option is set to ``limit``.
        with option_context("max_info_columns", limit):
            sink = StringIO()
            df.info(buf=sink, **kwargs)
            text = sink.getvalue()
        return len(text.strip().split("\n"))

    # Limit exceeded: the default falls back to the summary; verbose=True wins.
    for len_, verbose in [(5, None), (5, False), (12, True)]:
        assert line_count(4, verbose=verbose) == len_

    # max_cols not exceeded
    for len_, verbose in [(12, None), (5, False), (12, True)]:
        assert line_count(5, verbose=verbose) == len_

    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        assert line_count(4, max_cols=max_cols) == len_

        # setting wouldn't truncate
        assert line_count(5, max_cols=max_cols) == len_
|
||||
|
||||
|
||||
def test_info_memory_usage():
    """Memory-usage line: presence, the ``+`` qualifier, and sizes from ``memory_usage``."""
    # Ensure memory usage is displayed, when asserted, on the last line
    lower_bound = r"memory usage: [^+]+\+"

    def random_columns(names, rows):
        # One integer-keyed column per dtype name, each drawn from a fresh seeded rng.
        return {
            pos: np.random.default_rng(2).integers(2, size=rows).astype(name)
            for pos, name in enumerate(names)
        }

    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    df = DataFrame(random_columns(dtypes, n))
    buf = StringIO()

    def last_line():
        # ``buf`` is never reset, so the newest report always ends the buffer.
        return buf.getvalue().splitlines()[-1]

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    assert "memory usage: " in last_line()

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    assert "memory usage: " not in last_line()

    df.info(buf=buf, memory_usage=True)

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(lower_bound, last_line())

    df.iloc[:, :5].info(buf=buf, memory_usage=True)

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(lower_bound, last_line())

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    n = 100
    df = DataFrame(random_columns(dtypes, n))
    df.columns = dtypes

    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    df_with_object_index.info(buf=buf, memory_usage=True)
    assert re.match(lower_bound, last_line())

    df_with_object_index.info(buf=buf, memory_usage="deep")
    assert re.match(r"memory usage: [^+]+$", last_line())

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df.memory_usage().sum() == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
|
||||
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """Off PyPy, ``deep=True`` reports more memory for string-labelled data."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total > shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total > shallow_total
|
||||
|
||||
|
||||
@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """On PyPy, ``deep=True`` reports the same memory as the shallow estimate."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total == shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total == shallow_total
|
||||
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    """``sys.getsizeof`` stays within 100 bytes of the deep memory usage."""
    index = MultiIndex.from_product([["a"], range(1000)])
    df = DataFrame(data=1, index=index, columns=["A"])
    deep_total = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    overhead = deep_total - sys.getsizeof(df)
    assert abs(overhead) < 100
|
||||
|
||||
|
||||
def test_info_memory_usage_qualified():
    """The ``+`` qualifier appears only when the index holds string labels."""
    cases = [
        ([1, 2, 3], False),
        (list("ABC"), True),
        (MultiIndex.from_product([range(3), range(3)]), False),
        (MultiIndex.from_product([range(3), ["foo", "bar"]]), True),
    ]
    for index, qualified in cases:
        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=index)
        df.info(buf=buf)
        if qualified:
            assert "+" in buf.getvalue()
        else:
            assert "+" not in buf.getvalue()
|
||||
|
||||
|
||||
def test_info_memory_usage_bug_on_multiindex():
    """Deep memory usage of a MultiIndexed frame stays close to its unstacked form."""
    # GH 14308
    # memory usage introspection should not materialize .values

    def deep_total(obj):
        return obj.memory_usage(deep=True).sum()

    N = 100
    M = len(ascii_uppercase)
    days = date_range("20160101", periods=N)
    index = MultiIndex.from_product(
        [list(ascii_uppercase), days], names=["id", "date"]
    )
    values = np.random.default_rng(2).standard_normal(N * M)
    df = DataFrame({"value": values}, index=index)

    unstacked = df.unstack("id")
    assert df.values.nbytes == unstacked.values.nbytes
    assert deep_total(df) > deep_total(unstacked)

    # high upper bound
    assert deep_total(unstacked) - deep_total(df) < 2000
|
||||
|
||||
|
||||
def test_info_categorical():
    """Smoke test: ``info`` runs when rows and columns share a CategoricalIndex."""
    # GH14298
    labels = CategoricalIndex(["a", "b"])
    frame = DataFrame(np.zeros((2, 2)), index=labels, columns=labels)

    frame.info(buf=StringIO())
|
||||
|
||||
|
||||
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    """Integer column labels are rendered in the verbose table.

    The expected text reproduces the report exactly: the header and data rows
    start with one space and columns are padded to width with a two-space gap.
    """
    # GH#37245
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 2 entries, A to B
        Data columns (total 2 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   1       2 non-null      int64
         1   2       2 non-null      int64
        dtypes: int64(2)
        memory usage: 48.0+ bytes
        """
    )
    assert result == expected
|
||||
|
||||
|
||||
def test_memory_usage_empty_no_warning():
    """``memory_usage`` on a column-less frame reports only the index, with no warning."""
    # GH#50066
    frame = DataFrame(index=["a", "b"])
    with tm.assert_produces_warning(None):
        result = frame.memory_usage()
    index_bytes = 16 if IS64 else 8
    expected = Series(index_bytes, index=["Index"])
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
def test_info_compute_numba():
    """``info`` prints the same report whether or not ``compute.use_numba`` is set."""
    # GH#51922
    pytest.importorskip("numba")
    df = DataFrame([[1, 2], [3, 4]])

    def report():
        sink = StringIO()
        df.info(buf=sink)
        return sink.getvalue()

    with option_context("compute.use_numba", True):
        result = report()

    expected = report()
    assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "row, columns, show_counts, result",
    [
        [20, 20, None, True],
        [20, 20, True, True],
        [20, 20, False, False],
        [5, 5, None, False],
        [5, 5, True, False],
        [5, 5, False, False],
    ],
)
def test_info_show_counts(row, columns, show_counts, result):
    """Non-null counts appear only when the limits and ``show_counts`` allow them."""
    # Explicit cast to float to avoid implicit cast when setting nan
    df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"})
    df.iloc[1, 1] = np.nan

    limits = ("display.max_info_rows", row, "display.max_info_columns", columns)
    with option_context(*limits):
        with StringIO() as buf:
            df.info(buf=buf, show_counts=show_counts)
            has_counts = "non-null" in buf.getvalue()
            assert has_counts is result
|
@ -0,0 +1,548 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
from pandas.errors import ChainedAssignmentError
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameInterpolate:
|
||||
def test_interpolate_complex(self):
|
||||
# GH#53635
|
||||
ser = Series([complex("1+1j"), float("nan"), complex("2+2j")])
|
||||
assert ser.dtype.kind == "c"
|
||||
|
||||
res = ser.interpolate()
|
||||
expected = Series([ser[0], ser[0] * 1.5, ser[2]])
|
||||
tm.assert_series_equal(res, expected)
|
||||
|
||||
df = ser.to_frame()
|
||||
res = df.interpolate()
|
||||
expected = expected.to_frame()
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
def test_interpolate_datetimelike_values(self, frame_or_series):
|
||||
# GH#11312, GH#51005
|
||||
orig = Series(date_range("2012-01-01", periods=5))
|
||||
ser = orig.copy()
|
||||
ser[2] = NaT
|
||||
|
||||
res = frame_or_series(ser).interpolate()
|
||||
expected = frame_or_series(orig)
|
||||
tm.assert_equal(res, expected)
|
||||
|
||||
# datetime64tz cast
|
||||
ser_tz = ser.dt.tz_localize("US/Pacific")
|
||||
res_tz = frame_or_series(ser_tz).interpolate()
|
||||
expected_tz = frame_or_series(orig.dt.tz_localize("US/Pacific"))
|
||||
tm.assert_equal(res_tz, expected_tz)
|
||||
|
||||
# timedelta64 cast
|
||||
ser_td = ser - ser[0]
|
||||
res_td = frame_or_series(ser_td).interpolate()
|
||||
expected_td = frame_or_series(orig - orig[0])
|
||||
tm.assert_equal(res_td, expected_td)
|
||||
|
||||
def test_interpolate_inplace(self, frame_or_series, using_array_manager, request):
|
||||
# GH#44749
|
||||
if using_array_manager and frame_or_series is DataFrame:
|
||||
mark = pytest.mark.xfail(reason=".values-based in-place check is invalid")
|
||||
request.applymarker(mark)
|
||||
|
||||
obj = frame_or_series([1, np.nan, 2])
|
||||
orig = obj.values
|
||||
|
||||
obj.interpolate(inplace=True)
|
||||
expected = frame_or_series([1, 1.5, 2])
|
||||
tm.assert_equal(obj, expected)
|
||||
|
||||
# check we operated *actually* inplace
|
||||
assert np.shares_memory(orig, obj.values)
|
||||
assert orig.squeeze()[1] == 1.5
|
||||
|
||||
@pytest.mark.xfail(
|
||||
using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
|
||||
)
|
||||
def test_interp_basic(self, using_copy_on_write):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, np.nan, 4],
|
||||
"B": [1, 4, 9, np.nan],
|
||||
"C": [1, 2, 3, 5],
|
||||
"D": list("abcd"),
|
||||
}
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [1.0, 2.0, 3.0, 4.0],
|
||||
"B": [1.0, 4.0, 9.0, 9.0],
|
||||
"C": [1, 2, 3, 5],
|
||||
"D": list("abcd"),
|
||||
}
|
||||
)
|
||||
msg = "DataFrame.interpolate with object dtype"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.interpolate()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# check we didn't operate inplace GH#45791
|
||||
cvalues = df["C"]._values
|
||||
dvalues = df["D"].values
|
||||
if using_copy_on_write:
|
||||
assert np.shares_memory(cvalues, result["C"]._values)
|
||||
assert np.shares_memory(dvalues, result["D"]._values)
|
||||
else:
|
||||
assert not np.shares_memory(cvalues, result["C"]._values)
|
||||
assert not np.shares_memory(dvalues, result["D"]._values)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res = df.interpolate(inplace=True)
|
||||
assert res is None
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
# check we DID operate inplace
|
||||
assert np.shares_memory(df["C"]._values, cvalues)
|
||||
assert np.shares_memory(df["D"]._values, dvalues)
|
||||
|
||||
@pytest.mark.xfail(
|
||||
using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
|
||||
)
|
||||
def test_interp_basic_with_non_range_index(self, using_infer_string):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, np.nan, 4],
|
||||
"B": [1, 4, 9, np.nan],
|
||||
"C": [1, 2, 3, 5],
|
||||
"D": list("abcd"),
|
||||
}
|
||||
)
|
||||
|
||||
msg = "DataFrame.interpolate with object dtype"
|
||||
warning = FutureWarning if not using_infer_string else None
|
||||
with tm.assert_produces_warning(warning, match=msg):
|
||||
result = df.set_index("C").interpolate()
|
||||
expected = df.set_index("C")
|
||||
expected.loc[3, "A"] = 3
|
||||
expected.loc[5, "B"] = 9
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_empty(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/35598
|
||||
df = DataFrame()
|
||||
result = df.interpolate()
|
||||
assert result is not df
|
||||
expected = df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_bad_method(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, np.nan, 4],
|
||||
"B": [1, 4, 9, np.nan],
|
||||
"C": [1, 2, 3, 5],
|
||||
}
|
||||
)
|
||||
msg = (
|
||||
r"method must be one of \['linear', 'time', 'index', 'values', "
|
||||
r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', "
|
||||
r"'barycentric', 'krogh', 'spline', 'polynomial', "
|
||||
r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', "
|
||||
r"'cubicspline'\]. Got 'not_a_method' instead."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.interpolate(method="not_a_method")
|
||||
|
||||
def test_interp_combo(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1.0, 2.0, np.nan, 4.0],
|
||||
"B": [1, 4, 9, np.nan],
|
||||
"C": [1, 2, 3, 5],
|
||||
"D": list("abcd"),
|
||||
}
|
||||
)
|
||||
|
||||
result = df["A"].interpolate()
|
||||
expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
msg = "The 'downcast' keyword in Series.interpolate is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df["A"].interpolate(downcast="infer")
|
||||
expected = Series([1, 2, 3, 4], name="A")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_inerpolate_invalid_downcast(self):
|
||||
# GH#53103
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1.0, 2.0, np.nan, 4.0],
|
||||
"B": [1, 4, 9, np.nan],
|
||||
"C": [1, 2, 3, 5],
|
||||
"D": list("abcd"),
|
||||
}
|
||||
)
|
||||
|
||||
msg = "downcast must be either None or 'infer'"
|
||||
msg2 = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
|
||||
msg3 = "The 'downcast' keyword in Series.interpolate is deprecated"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg2):
|
||||
df.interpolate(downcast="int64")
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg3):
|
||||
df["A"].interpolate(downcast="int64")
|
||||
|
||||
def test_interp_nan_idx(self):
|
||||
df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]})
|
||||
df = df.set_index("A")
|
||||
msg = (
|
||||
"Interpolation with NaNs in the index has not been implemented. "
|
||||
"Try filling those NaNs before interpolating."
|
||||
)
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
df.interpolate(method="values")
|
||||
|
||||
def test_interp_various(self):
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
|
||||
)
|
||||
df = df.set_index("C")
|
||||
expected = df.copy()
|
||||
result = df.interpolate(method="polynomial", order=1)
|
||||
|
||||
expected.loc[3, "A"] = 2.66666667
|
||||
expected.loc[13, "A"] = 5.76923076
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method="cubic")
|
||||
# GH #15662.
|
||||
expected.loc[3, "A"] = 2.81547781
|
||||
expected.loc[13, "A"] = 5.52964175
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method="nearest")
|
||||
expected.loc[3, "A"] = 2
|
||||
expected.loc[13, "A"] = 5
|
||||
tm.assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
result = df.interpolate(method="quadratic")
|
||||
expected.loc[3, "A"] = 2.82150771
|
||||
expected.loc[13, "A"] = 6.12648668
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method="slinear")
|
||||
expected.loc[3, "A"] = 2.66666667
|
||||
expected.loc[13, "A"] = 5.76923077
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method="zero")
|
||||
expected.loc[3, "A"] = 2.0
|
||||
expected.loc[13, "A"] = 5
|
||||
tm.assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
def test_interp_alt_scipy(self):
|
||||
pytest.importorskip("scipy")
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
|
||||
)
|
||||
result = df.interpolate(method="barycentric")
|
||||
expected = df.copy()
|
||||
expected.loc[2, "A"] = 3
|
||||
expected.loc[5, "A"] = 6
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.interpolate(method="barycentric", downcast="infer")
|
||||
tm.assert_frame_equal(result, expected.astype(np.int64))
|
||||
|
||||
result = df.interpolate(method="krogh")
|
||||
expectedk = df.copy()
|
||||
expectedk["A"] = expected["A"]
|
||||
tm.assert_frame_equal(result, expectedk)
|
||||
|
||||
result = df.interpolate(method="pchip")
|
||||
expected.loc[2, "A"] = 3
|
||||
expected.loc[5, "A"] = 6.0
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_rowwise(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
0: [1, 2, np.nan, 4],
|
||||
1: [2, 3, 4, np.nan],
|
||||
2: [np.nan, 4, 5, 6],
|
||||
3: [4, np.nan, 6, 7],
|
||||
4: [1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
result = df.interpolate(axis=1)
|
||||
expected = df.copy()
|
||||
expected.loc[3, 1] = 5
|
||||
expected.loc[0, 2] = 3
|
||||
expected.loc[1, 3] = 3
|
||||
expected[4] = expected[4].astype(np.float64)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=1, method="values")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=0)
|
||||
expected = df.interpolate()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"axis_name, axis_number",
|
||||
[
|
||||
pytest.param("rows", 0, id="rows_0"),
|
||||
pytest.param("index", 0, id="index_0"),
|
||||
pytest.param("columns", 1, id="columns_1"),
|
||||
],
|
||||
)
|
||||
def test_interp_axis_names(self, axis_name, axis_number):
|
||||
# GH 29132: test axis names
|
||||
data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]}
|
||||
|
||||
df = DataFrame(data, dtype=np.float64)
|
||||
result = df.interpolate(axis=axis_name, method="linear")
|
||||
expected = df.interpolate(axis=axis_number, method="linear")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rowwise_alt(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
|
||||
1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
|
||||
}
|
||||
)
|
||||
df.interpolate(axis=0)
|
||||
# TODO: assert something?
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))]
|
||||
)
|
||||
def test_interp_leading_nans(self, check_scipy):
|
||||
df = DataFrame(
|
||||
{"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
|
||||
)
|
||||
result = df.interpolate()
|
||||
expected = df.copy()
|
||||
expected.loc[3, "B"] = -3.75
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
if check_scipy:
|
||||
result = df.interpolate(method="polynomial", order=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_raise_on_only_mixed(self, axis):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, np.nan, 4],
|
||||
"B": ["a", "b", "c", "d"],
|
||||
"C": [np.nan, 2, 5, 7],
|
||||
"D": [np.nan, np.nan, 9, 9],
|
||||
"E": [1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
msg = (
|
||||
"Cannot interpolate with all object-dtype columns "
|
||||
"in the DataFrame. Try setting at least one "
|
||||
"column to a numeric dtype."
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.astype("object").interpolate(axis=axis)
|
||||
|
||||
def test_interp_raise_on_all_object_dtype(self):
|
||||
# GH 22985
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
|
||||
msg = (
|
||||
"Cannot interpolate with all object-dtype columns "
|
||||
"in the DataFrame. Try setting at least one "
|
||||
"column to a numeric dtype."
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.interpolate()
|
||||
|
||||
def test_interp_inplace(self, using_copy_on_write):
|
||||
df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
|
||||
expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
|
||||
expected_cow = df.copy()
|
||||
result = df.copy()
|
||||
|
||||
if using_copy_on_write:
|
||||
with tm.raises_chained_assignment_error():
|
||||
return_value = result["a"].interpolate(inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected_cow)
|
||||
else:
|
||||
with tm.assert_produces_warning(FutureWarning, match="inplace method"):
|
||||
return_value = result["a"].interpolate(inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.copy()
|
||||
msg = "The 'downcast' keyword in Series.interpolate is deprecated"
|
||||
|
||||
if using_copy_on_write:
|
||||
with tm.assert_produces_warning(
|
||||
(FutureWarning, ChainedAssignmentError), match=msg
|
||||
):
|
||||
return_value = result["a"].interpolate(inplace=True, downcast="infer")
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected_cow)
|
||||
else:
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
return_value = result["a"].interpolate(inplace=True, downcast="infer")
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected.astype("int64"))
|
||||
|
||||
def test_interp_inplace_row(self):
|
||||
# GH 10395
|
||||
result = DataFrame(
|
||||
{"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
|
||||
)
|
||||
expected = result.interpolate(method="linear", axis=1, inplace=False)
|
||||
return_value = result.interpolate(method="linear", axis=1, inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_ignore_all_good(self):
|
||||
# GH
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, np.nan, 4],
|
||||
"B": [1, 2, 3, 4],
|
||||
"C": [1.0, 2.0, np.nan, 4.0],
|
||||
"D": [1.0, 2.0, 3.0, 4.0],
|
||||
}
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": np.array([1, 2, 3, 4], dtype="float64"),
|
||||
"B": np.array([1, 2, 3, 4], dtype="int64"),
|
||||
"C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
|
||||
"D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
|
||||
}
|
||||
)
|
||||
|
||||
msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.interpolate(downcast=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# all good
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df[["B", "D"]].interpolate(downcast=None)
|
||||
tm.assert_frame_equal(result, df[["B", "D"]])
|
||||
|
||||
def test_interp_time_inplace_axis(self):
|
||||
# GH 9687
|
||||
periods = 5
|
||||
idx = date_range(start="2014-01-01", periods=periods)
|
||||
data = np.random.default_rng(2).random((periods, periods))
|
||||
data[data < 0.5] = np.nan
|
||||
expected = DataFrame(index=idx, columns=idx, data=data)
|
||||
|
||||
result = expected.interpolate(axis=0, method="time")
|
||||
return_value = expected.interpolate(axis=0, method="time", inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)])
|
||||
def test_interp_string_axis(self, axis_name, axis_number):
|
||||
# https://github.com/pandas-dev/pandas/issues/25190
|
||||
x = np.linspace(0, 100, 1000)
|
||||
y = np.sin(x)
|
||||
df = DataFrame(
|
||||
data=np.tile(y, (10, 1)), index=np.arange(10), columns=x
|
||||
).reindex(columns=x * 1.005)
|
||||
result = df.interpolate(method="linear", axis=axis_name)
|
||||
expected = df.interpolate(method="linear", axis=axis_number)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("multiblock", [True, False])
|
||||
@pytest.mark.parametrize("method", ["ffill", "bfill", "pad"])
|
||||
def test_interp_fillna_methods(
|
||||
self, request, axis, multiblock, method, using_array_manager
|
||||
):
|
||||
# GH 12918
|
||||
if using_array_manager and axis in (1, "columns"):
|
||||
# TODO(ArrayManager) support axis=1
|
||||
td.mark_array_manager_not_yet_implemented(request)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0],
|
||||
"B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0],
|
||||
"C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0],
|
||||
}
|
||||
)
|
||||
if multiblock:
|
||||
df["D"] = np.nan
|
||||
df["E"] = 1.0
|
||||
|
||||
method2 = method if method != "pad" else "ffill"
|
||||
expected = getattr(df, method2)(axis=axis)
|
||||
msg = f"DataFrame.interpolate with method={method} is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.interpolate(method=method, axis=axis)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_interpolate_empty_df(self):
|
||||
# GH#53199
|
||||
df = DataFrame()
|
||||
expected = df.copy()
|
||||
result = df.interpolate(inplace=True)
|
||||
assert result is None
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_interpolate_ea(self, any_int_ea_dtype):
|
||||
# GH#55347
|
||||
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype)
|
||||
orig = df.copy()
|
||||
result = df.interpolate(limit=2)
|
||||
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(df, orig)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"Float64",
|
||||
"Float32",
|
||||
pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
],
|
||||
)
|
||||
def test_interpolate_ea_float(self, dtype):
|
||||
# GH#55347
|
||||
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype)
|
||||
orig = df.copy()
|
||||
result = df.interpolate(limit=2)
|
||||
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(df, orig)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"],
|
||||
)
|
||||
def test_interpolate_arrow(self, dtype):
|
||||
# GH#55347
|
||||
pytest.importorskip("pyarrow")
|
||||
df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]")
|
||||
result = df.interpolate(limit=2)
|
||||
expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]")
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,58 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
)
|
||||
|
||||
# _is_homogeneous_type always returns True for ArrayManager
|
||||
pytestmark = td.skip_array_manager_invalid_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected",
    [
        # empty
        (DataFrame(), True),
        # multi-same
        (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
        # multi-object
        (
            DataFrame(
                {
                    "A": np.array([1, 2], dtype=object),
                    "B": np.array(["a", "b"], dtype=object),
                },
                dtype="object",
            ),
            True,
        ),
        # multi-extension
        (
            DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["a", "b"])}),
            True,
        ),
        # differ types
        (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
        # differ sizes
        (
            DataFrame(
                {
                    "A": np.array([1, 2], dtype=np.int32),
                    "B": np.array([1, 2], dtype=np.int64),
                }
            ),
            False,
        ),
        # multi-extension differ
        (
            DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["b", "c"])}),
            False,
        ),
    ],
)
def test_is_homogeneous_type(data, expected):
    """``_is_homogeneous_type`` equals the flag expected for each frame."""
    observed = data._is_homogeneous_type
    assert observed is expected
|
@ -0,0 +1,50 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameSetItem:
|
||||
def test_isetitem_ea_df(self):
|
||||
# GH#49922
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
rhs = DataFrame([[11, 12], [13, 14]], dtype="Int64")
|
||||
|
||||
df.isetitem([0, 1], rhs)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series([11, 13], dtype="Int64"),
|
||||
1: Series([12, 14], dtype="Int64"),
|
||||
2: [3, 6],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_isetitem_ea_df_scalar_indexer(self):
|
||||
# GH#49922
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
rhs = DataFrame([[11], [13]], dtype="Int64")
|
||||
|
||||
df.isetitem(2, rhs)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: [1, 4],
|
||||
1: [2, 5],
|
||||
2: Series([11, 13], dtype="Int64"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_isetitem_dimension_mismatch(self):
|
||||
# GH#51701
|
||||
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
|
||||
value = df.copy()
|
||||
with pytest.raises(ValueError, match="Got 2 positions but value has 3 columns"):
|
||||
df.isetitem([1, 2], value)
|
||||
|
||||
value = df.copy()
|
||||
with pytest.raises(ValueError, match="Got 2 positions but value has 1 columns"):
|
||||
df.isetitem([1, 2], value[["a"]])
|
@ -0,0 +1,227 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameIsIn:
|
||||
def test_isin(self):
|
||||
# GH#4211
|
||||
df = DataFrame(
|
||||
{
|
||||
"vals": [1, 2, 3, 4],
|
||||
"ids": ["a", "b", "f", "n"],
|
||||
"ids2": ["a", "n", "c", "n"],
|
||||
},
|
||||
index=["foo", "bar", "baz", "qux"],
|
||||
)
|
||||
other = ["a", "b", "c"]
|
||||
|
||||
result = df.isin(other)
|
||||
expected = DataFrame([df.loc[s].isin(other) for s in df.index])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
|
||||
def test_isin_empty(self, empty):
|
||||
# GH#16991
|
||||
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
|
||||
expected = DataFrame(False, df.index, df.columns)
|
||||
|
||||
result = df.isin(empty)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_dict(self):
|
||||
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
|
||||
d = {"A": ["a"]}
|
||||
|
||||
expected = DataFrame(False, df.index, df.columns)
|
||||
expected.loc[0, "A"] = True
|
||||
|
||||
result = df.isin(d)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# non unique columns
|
||||
df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
|
||||
df.columns = ["A", "A"]
|
||||
expected = DataFrame(False, df.index, df.columns)
|
||||
expected.loc[0, "A"] = True
|
||||
result = df.isin(d)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_with_string_scalar(self):
|
||||
# GH#4763
|
||||
df = DataFrame(
|
||||
{
|
||||
"vals": [1, 2, 3, 4],
|
||||
"ids": ["a", "b", "f", "n"],
|
||||
"ids2": ["a", "n", "c", "n"],
|
||||
},
|
||||
index=["foo", "bar", "baz", "qux"],
|
||||
)
|
||||
msg = (
|
||||
r"only list-like or dict-like objects are allowed "
|
||||
r"to be passed to DataFrame.isin\(\), you passed a 'str'"
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.isin("a")
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.isin("aaa")
|
||||
|
||||
def test_isin_df(self):
|
||||
df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
|
||||
df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
|
||||
expected = DataFrame(False, df1.index, df1.columns)
|
||||
result = df1.isin(df2)
|
||||
expected.loc[[1, 3], "A"] = True
|
||||
expected.loc[[0, 2], "B"] = True
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# partial overlapping columns
|
||||
df2.columns = ["A", "C"]
|
||||
result = df1.isin(df2)
|
||||
expected["B"] = False
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_tuples(self):
|
||||
# GH#16394
|
||||
df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]})
|
||||
df["C"] = list(zip(df["A"], df["B"]))
|
||||
result = df["C"].isin([(1, "a")])
|
||||
tm.assert_series_equal(result, Series([True, False, False], name="C"))
|
||||
|
||||
def test_isin_df_dupe_values(self):
|
||||
df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
|
||||
# just cols duped
|
||||
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"])
|
||||
msg = r"cannot compute isin with a duplicate axis\."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df1.isin(df2)
|
||||
|
||||
# just index duped
|
||||
df2 = DataFrame(
|
||||
[[0, 2], [12, 4], [2, np.nan], [4, 5]],
|
||||
columns=["A", "B"],
|
||||
index=[0, 0, 1, 1],
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df1.isin(df2)
|
||||
|
||||
# cols and index:
|
||||
df2.columns = ["B", "B"]
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df1.isin(df2)
|
||||
|
||||
def test_isin_dupe_self(self):
|
||||
other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]})
|
||||
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"])
|
||||
result = df.isin(other)
|
||||
expected = DataFrame(False, index=df.index, columns=df.columns)
|
||||
expected.loc[0] = True
|
||||
expected.iloc[1, 1] = True
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_against_series(self):
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"]
|
||||
)
|
||||
s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
|
||||
expected = DataFrame(False, index=df.index, columns=df.columns)
|
||||
expected.loc["a", "A"] = True
|
||||
expected.loc["d"] = True
|
||||
result = df.isin(s)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_multiIndex(self):
|
||||
idx = MultiIndex.from_tuples(
|
||||
[
|
||||
(0, "a", "foo"),
|
||||
(0, "a", "bar"),
|
||||
(0, "b", "bar"),
|
||||
(0, "b", "baz"),
|
||||
(2, "a", "foo"),
|
||||
(2, "a", "bar"),
|
||||
(2, "c", "bar"),
|
||||
(2, "c", "baz"),
|
||||
(1, "b", "foo"),
|
||||
(1, "b", "bar"),
|
||||
(1, "c", "bar"),
|
||||
(1, "c", "baz"),
|
||||
]
|
||||
)
|
||||
df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx)
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
|
||||
"B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
|
||||
}
|
||||
)
|
||||
# against regular index
|
||||
expected = DataFrame(False, index=df1.index, columns=df1.columns)
|
||||
result = df1.isin(df2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df2.index = idx
|
||||
expected = df2.values.astype(bool)
|
||||
expected[:, 1] = ~expected[:, 1]
|
||||
expected = DataFrame(expected, columns=["A", "B"], index=idx)
|
||||
|
||||
result = df1.isin(df2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_empty_datetimelike(self):
|
||||
# GH#15473
|
||||
df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])})
|
||||
df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]})
|
||||
df2 = DataFrame({"date": []})
|
||||
df3 = DataFrame()
|
||||
|
||||
expected = DataFrame({"date": [False, False]})
|
||||
|
||||
result = df1_ts.isin(df2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df1_ts.isin(df3)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df1_td.isin(df2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df1_td.isin(df3)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values",
|
||||
[
|
||||
DataFrame({"a": [1, 2, 3]}, dtype="category"),
|
||||
Series([1, 2, 3], dtype="category"),
|
||||
],
|
||||
)
|
||||
def test_isin_category_frame(self, values):
|
||||
# GH#34256
|
||||
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
expected = DataFrame({"a": [True, True, True], "b": [False, False, False]})
|
||||
|
||||
result = df.isin(values)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_read_only(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/37174
|
||||
arr = np.array([1, 2, 3])
|
||||
arr.setflags(write=False)
|
||||
df = DataFrame([1, 2, 3])
|
||||
result = df.isin(arr)
|
||||
expected = DataFrame([True, True, True])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_isin_not_lossy(self):
|
||||
# GH 53514
|
||||
val = 1666880195890293744
|
||||
df = DataFrame({"a": [val], "b": [1.0]})
|
||||
result = df.isin([val])
|
||||
expected = DataFrame({"a": [True], "b": [False]})
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,16 @@
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timedelta,
|
||||
)
|
||||
|
||||
|
||||
def test_no_overflow_of_freq_and_time_in_dataframe():
    """Rows mixing a string and a Timedelta iterate as object-dtype Series."""
    # GH 35665
    frame = DataFrame(
        {
            "some_string": ["2222Y3"],
            "time": [Timedelta("0 days 00:00:00.990000")],
        }
    )
    row_dtypes = [row.dtype for _, row in frame.iterrows()]
    assert all(dtype == "object" for dtype in row_dtypes)
|
@ -0,0 +1,576 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import MergeError
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
|
||||
@pytest.fixture
def frame_with_period_index():
    """4x5 integer frame indexed by four yearly periods starting in 2000."""
    yearly = period_range(start="2000", freq="Y", periods=4)
    values = np.arange(20).reshape(4, 5)
    return DataFrame(data=values, columns=list("abcde"), index=yearly)
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    """Left operand for the join tests: column ``a`` on a descending index."""
    frame = DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
    return frame
|
||||
|
||||
|
||||
@pytest.fixture
def right():
    """Right operand for the join tests: column ``b`` on index [3, 1, 2]."""
    frame = DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
    return frame
|
||||
|
||||
|
||||
@pytest.fixture
def left_no_dup():
    """Left frame with unique keys in column ``a``."""
    columns = {
        "a": ["a", "b", "c", "d"],
        "b": ["cat", "dog", "weasel", "horse"],
    }
    return DataFrame(columns, index=range(4))
|
||||
|
||||
|
||||
@pytest.fixture
def right_no_dup():
    """Right frame with unique keys, indexed by ``a``."""
    frame = DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
        },
        index=range(5),
    )
    return frame.set_index("a")
|
||||
|
||||
|
||||
@pytest.fixture
def left_w_dups(left_no_dup):
    """``left_no_dup`` plus one extra row that repeats key "a"."""
    extra_row = DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])
    return concat([left_no_dup, extra_row], sort=True)
|
||||
|
||||
|
||||
@pytest.fixture
def right_w_dups(right_no_dup):
    """``right_no_dup`` plus one extra row, re-indexed on column ``a``."""
    extra_row = DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])
    stacked = concat([right_no_dup, extra_row])
    return stacked.set_index("a")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"how, sort, expected",
|
||||
[
|
||||
("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
|
||||
("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
|
||||
(
|
||||
"left",
|
||||
False,
|
||||
DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
|
||||
),
|
||||
(
|
||||
"left",
|
||||
True,
|
||||
DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
|
||||
),
|
||||
(
|
||||
"right",
|
||||
False,
|
||||
DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
|
||||
),
|
||||
(
|
||||
"right",
|
||||
True,
|
||||
DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
|
||||
),
|
||||
(
|
||||
"outer",
|
||||
False,
|
||||
DataFrame(
|
||||
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
|
||||
index=[0, 1, 2, 3],
|
||||
),
|
||||
),
|
||||
(
|
||||
"outer",
|
||||
True,
|
||||
DataFrame(
|
||||
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
|
||||
index=[0, 1, 2, 3],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_join(left, right, how, sort, expected):
|
||||
result = left.join(right, how=how, sort=sort, validate="1:1")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_suffix_on_list_join():
    """Suffixes are rejected when joining a list of frames; a one-item list works."""
    first = DataFrame({"key": [1, 2, 3, 4, 5]})
    second = DataFrame({"key": [1, 8, 3, 2, 5], "v1": [1, 2, 3, 4, 5]})
    third = DataFrame({"keys": [5, 2, 3, 4, 1], "v2": [1, 2, 3, 4, 5]})

    # check proper errors are raised
    msg = "Suffixes not supported when joining multiple DataFrames"
    with pytest.raises(ValueError, match=msg):
        first.join([second], lsuffix="y")
    with pytest.raises(ValueError, match=msg):
        first.join([second, third], rsuffix="x")
    with pytest.raises(ValueError, match=msg):
        first.join([second, third], lsuffix="y", rsuffix="x")
    with pytest.raises(ValueError, match="Indexes have overlapping values"):
        first.join([second, third])

    # no errors should be raised
    arr_joined = first.join([third])
    norm_joined = first.join(third)
    tm.assert_frame_equal(arr_joined, norm_joined)
|
||||
|
||||
|
||||
def test_join_invalid_validate(left_no_dup, right_no_dup):
    """An unrecognised ``validate`` value raises ValueError listing the valid options."""
    # GH 46622
    # Check invalid arguments
    msg = (
        '"invalid" is not a valid argument. '
        "Valid arguments are:\n"
        '- "1:1"\n'
        '- "1:m"\n'
        '- "m:1"\n'
        '- "m:m"\n'
        '- "one_to_one"\n'
        '- "one_to_many"\n'
        '- "many_to_one"\n'
        '- "many_to_many"'
    )
    # NOTE(review): despite the test name this exercises ``merge``, not ``join``.
    with pytest.raises(ValueError, match=msg):
        left_no_dup.merge(right_no_dup, on="a", validate="invalid")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"])
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype):
    """Right-side duplicate keys pass ``one_to_many`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on right allowed by one_to_many constraint
    if dtype == "string[pyarrow]":
        pytest.importorskip("pyarrow")
    left_no_dup = left_no_dup.astype(dtype)
    right_w_dups.index = right_w_dups.index.astype(dtype)
    left_no_dup.join(
        right_w_dups,
        on="a",
        validate="one_to_many",
    )

    # Dups on right not allowed by one_to_one constraint
    msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_no_dup.join(
            right_w_dups,
            on="a",
            validate="one_to_one",
        )
|
||||
|
||||
|
||||
def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup):
    """Left-side duplicate keys pass ``many_to_one`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on left allowed by many_to_one constraint
    left_w_dups.join(
        right_no_dup,
        on="a",
        validate="many_to_one",
    )

    # Dups on left not allowed by one_to_one constraint
    msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_no_dup,
            on="a",
            validate="one_to_one",
        )
|
||||
|
||||
|
||||
def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups):
    """Duplicates on both sides pass only the ``many_to_many`` constraint."""
    # GH 46622
    # Dups on both allowed by many_to_many constraint
    left_w_dups.join(right_w_dups, on="a", validate="many_to_many")

    # Dups on both not allowed by many_to_one constraint
    msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_w_dups,
            on="a",
            validate="many_to_one",
        )

    # Dups on both not allowed by one_to_many constraint
    msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_w_dups,
            on="a",
            validate="one_to_many",
        )
|
||||
|
||||
|
||||
def test_join_on_multi_col_check_dup():
    """A two-level key that is unique only jointly satisfies ``validate="1:1"``."""
    # GH 46622
    # Two column join, dups in both, but jointly no dups
    left = DataFrame(
        {
            "a": ["a", "a", "b", "b"],
            "b": [0, 1, 0, 1],
            "c": ["cat", "dog", "weasel", "horse"],
        },
        index=range(4),
    ).set_index(["a", "b"])

    right = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    ).set_index(["a", "b"])

    expected_multi = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "c": ["cat", "dog", "weasel"],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    ).set_index(["a", "b"])

    # Jointly no dups allowed by one_to_one constraint
    result = left.join(right, how="inner", validate="1:1")
    tm.assert_frame_equal(result, expected_multi)
|
||||
|
||||
|
||||
def test_join_index(float_frame):
    """Check the result index and columns of index-on-index joins for every ``how``.

    ``float_frame`` is a fixture defined outside this file chunk.
    """
    # left / right

    f = float_frame.loc[float_frame.index[:10], ["A", "B"]]
    f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]

    joined = f.join(f2)
    tm.assert_index_equal(f.index, joined.index)
    expected_columns = Index(["A", "B", "C", "D"])
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how="left")
    tm.assert_index_equal(joined.index, f.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how="right")
    tm.assert_index_equal(joined.index, f2.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    # inner

    joined = f.join(f2, how="inner")
    tm.assert_index_equal(joined.index, f.index[5:10])
    tm.assert_index_equal(joined.columns, expected_columns)

    # outer

    joined = f.join(f2, how="outer")
    tm.assert_index_equal(joined.index, float_frame.index.sort_values())
    tm.assert_index_equal(joined.columns, expected_columns)

    with pytest.raises(ValueError, match="join method"):
        f.join(f2, how="foo")

    # corner case - overlapping columns
    msg = "columns overlap but no suffix"
    for how in ("outer", "left", "inner"):
        with pytest.raises(ValueError, match=msg):
            float_frame.join(float_frame, how=how)
|
||||
|
||||
|
||||
def test_join_index_more(float_frame):
    """Join a full-index frame with an every-other-row frame (default and right joins)."""
    af = float_frame.loc[:, ["A", "B"]]
    bf = float_frame.loc[::2, ["C", "D"]]

    expected = af.copy()
    expected["C"] = float_frame["C"][::2]
    expected["D"] = float_frame["D"][::2]

    result = af.join(bf)
    tm.assert_frame_equal(result, expected)

    result = af.join(bf, how="right")
    tm.assert_frame_equal(result, expected[::2])

    result = bf.join(af, how="right")
    tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
|
||||
def test_join_index_series(float_frame):
    """Joining a named Series restores the popped column; an unnamed Series raises."""
    df = float_frame.copy()
    ser = df.pop(float_frame.columns[-1])
    joined = df.join(ser)

    tm.assert_frame_equal(joined, float_frame)

    ser.name = None
    with pytest.raises(ValueError, match="must have a name"):
        df.join(ser)
|
||||
|
||||
|
||||
def test_join_overlap(float_frame):
    """Overlapping columns receive ``lsuffix``/``rsuffix``; others keep their names."""
    df1 = float_frame.loc[:, ["A", "B", "C"]]
    df2 = float_frame.loc[:, ["B", "C", "D"]]

    joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")
    df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
    df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")

    no_overlap = float_frame.loc[:, ["A", "D"]]
    expected = df1_suf.join(df2_suf).join(no_overlap)

    # column order not necessarily sorted
    tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
|
||||
|
||||
|
||||
def test_join_period_index(frame_with_period_index):
    """Joining a frame with a column-renamed copy of itself concatenates the values."""
    # Double every column label so the two frames do not overlap.
    other = frame_with_period_index.rename(columns=lambda key: f"{key}{key}")

    joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1)

    joined_cols = frame_with_period_index.columns.append(other.columns)

    joined = frame_with_period_index.join(other)
    expected = DataFrame(
        data=joined_values, columns=joined_cols, index=frame_with_period_index.index
    )

    tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_left_sequence_non_unique_index():
    """Left-join a list of frames where one of them has a repeated index label."""
    # https://github.com/pandas-dev/pandas/issues/19607
    df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
    df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
    df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])

    joined = df1.join([df2, df3], how="left")

    expected = DataFrame(
        {
            "a": [0, 10, 10, 20],
            "b": [np.nan, 300, 300, 200],
            "c": [np.nan, 400, 500, np.nan],
        },
        index=[1, 2, 2, 3],
    )

    tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_list_series(float_frame):
    """Joining with a list holding a Series and a DataFrame rebuilds ``float_frame``."""
    # GH#46850
    # Join a DataFrame with a list containing both a Series and a DataFrame
    left = float_frame.A.to_frame()
    right = [float_frame.B, float_frame[["C", "D"]]]
    result = left.join(right)
    tm.assert_frame_equal(result, float_frame)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sort_kw", [True, False])
def test_suppress_future_warning_with_sort_kw(sort_kw):
    """An outer join of a list of frames with an explicit ``sort`` emits no warning."""
    a = DataFrame({"col1": [1, 2]}, index=["c", "a"])

    b = DataFrame({"col2": [4, 5]}, index=["b", "a"])

    c = DataFrame({"col3": [7, 8]}, index=["a", "b"])

    expected = DataFrame(
        {
            "col1": {"a": 2.0, "b": float("nan"), "c": 1.0},
            "col2": {"a": 5.0, "b": 4.0, "c": float("nan")},
            "col3": {"a": 7.0, "b": 8.0, "c": float("nan")},
        }
    )
    if sort_kw is False:
        expected = expected.reindex(index=["c", "a", "b"])

    with tm.assert_produces_warning(None):
        result = a.join([b, c], how="outer", sort=sort_kw)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameJoin:
    """Class-grouped ``DataFrame.join`` tests (MultiIndex, mixed keys, tz-aware)."""

    def test_join(self, multiindex_dataframe_random_data):
        """Outer-join two overlapping slices and compare against the source frame."""
        frame = multiindex_dataframe_random_data

        a = frame.loc[frame.index[:5], ["A"]]
        b = frame.loc[frame.index[2:], ["B", "C"]]

        joined = a.join(b, how="outer").reindex(frame.index)
        expected = frame.copy().values.copy()
        # Wherever the join produced NaN, the expectation must be NaN too.
        expected[np.isnan(joined.values)] = np.nan
        expected = DataFrame(expected, index=frame.index, columns=frame.columns)

        assert not np.isnan(joined.values).all()

        tm.assert_frame_equal(joined, expected)

    def test_join_segfault(self):
        """Joining two MultiIndexed frames runs to completion for each ``how``."""
        # GH#1532
        df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
        df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
        df1 = df1.set_index(["a", "b"])
        df2 = df2.set_index(["a", "b"])
        # it works!
        for how in ["left", "right", "outer"]:
            df1.join(df2, how=how)

    def test_join_str_datetime(self):
        """Join on a string column against a frame whose columns are datetimes."""
        str_dates = ["20120209", "20120222"]
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=range(2), columns=["aa"])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on="aa")

        assert len(tst.columns) == 3

    def test_join_multiindex_leftright(self):
        """Left/right joins between a two-level and a one-level index are symmetric."""
        # GH 10741
        df1 = DataFrame(
            [
                ["a", "x", 0.471780],
                ["a", "y", 0.774908],
                ["a", "z", 0.563634],
                ["b", "x", -0.353756],
                ["b", "y", 0.368062],
                ["b", "z", -1.721840],
                ["c", "x", 1],
                ["c", "y", 2],
                ["c", "z", 3],
            ],
            columns=["first", "second", "value1"],
        ).set_index(["first", "second"])

        df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index(
            ["first"]
        )

        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
                [1.000000, np.nan],
                [2.000000, np.nan],
                [3.000000, np.nan],
            ],
            index=df1.index,
            columns=["value1", "value2"],
        )

        # these must be the same results (but columns are flipped)
        tm.assert_frame_equal(df1.join(df2, how="left"), exp)
        tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])

        exp_idx = MultiIndex.from_product(
            [["a", "b"], ["x", "y", "z"]], names=["first", "second"]
        )
        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
            ],
            index=exp_idx,
            columns=["value1", "value2"],
        )

        tm.assert_frame_equal(df1.join(df2, how="right"), exp)
        tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])

    def test_join_multiindex_dates(self):
        """Join a list of frames whose MultiIndex holds ``datetime.date`` values."""
        # GH 33692
        date = pd.Timestamp(2000, 1, 1).date()

        df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df1 = DataFrame({"col1": [0]}, index=df1_index)
        df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df2 = DataFrame({"col2": [0]}, index=df2_index)
        df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df3 = DataFrame({"col3": [0]}, index=df3_index)

        result = df1.join([df2, df3])

        expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        expected = DataFrame(
            {"col1": [0], "col2": [0], "col3": [0]}, index=expected_index
        )

        tm.assert_equal(result, expected)

    def test_merge_join_different_levels_raises(self):
        """Merging/joining flat columns with MultiIndex columns raises MergeError."""
        # GH#9455
        # GH 40993: For raising, enforced in 2.0

        # first dataframe
        df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])

        # second dataframe
        columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")])
        df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])

        # merge
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            pd.merge(df1, df2, on="a")

        # join, see discussion in GH#12219
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            df1.join(df2, on="a")

    def test_frame_join_tzaware(self):
        """An outer join of tz-aware indexes yields their union and keeps the zone."""
        test1 = DataFrame(
            np.zeros((6, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central"
            ),
        )
        test2 = DataFrame(
            np.zeros((3, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central"
            ),
            columns=range(3, 6),
        )

        result = test1.join(test2, how="outer")
        expected = test1.index.union(test2.index)

        tm.assert_index_equal(result.index, expected)
        # ``.zone`` exists only on pytz objects; ``str()`` gives the zone name
        # for both pytz and zoneinfo tzinfo implementations.
        assert str(result.index.tz) == "US/Central"
|
@ -0,0 +1,216 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.tseries.offsets import BDay
|
||||
|
||||
|
||||
def test_map(float_frame):
    """Element-wise ``map`` doubles values and can return tuples per element."""
    result = float_frame.map(lambda x: x * 2)
    tm.assert_frame_equal(result, float_frame * 2)
    # Only checks that mapping ``type`` runs; the result is not inspected.
    float_frame.map(type)

    # GH 465: function returning tuples
    result = float_frame.map(lambda x: (x, x))["A"].iloc[0]
    assert isinstance(result, tuple)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val", [1, 1.0])
def test_map_float_object_conversion(val):
    """An identity ``map`` over a mixed number/string column keeps object dtype."""
    # GH 2909: object conversion to float in constructor?
    df = DataFrame(data=[val, "a"])
    result = df.map(lambda x: x).dtypes[0]
    assert result == object
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map_keeps_dtype(na_action):
    """``map`` preserves object and sparse-object dtypes, including on empty frames."""
    # GH52219
    arr = Series(["a", np.nan, "b"])
    sparse_arr = arr.astype(pd.SparseDtype(object))
    df = DataFrame(data={"a": arr, "b": sparse_arr})

    def func(x):
        # Upper-case real values and pass missing values through untouched.
        return str.upper(x) if not pd.isna(x) else x

    result = df.map(func, na_action=na_action)

    expected_sparse = pd.array(["A", np.nan, "B"], dtype=pd.SparseDtype(object))
    expected_arr = expected_sparse.astype(object)
    expected = DataFrame({"a": expected_arr, "b": expected_sparse})

    tm.assert_frame_equal(result, expected)

    result_empty = df.iloc[:0, :].map(func, na_action=na_action)
    expected_empty = expected.iloc[:0, :]
    tm.assert_frame_equal(result_empty, expected_empty)
|
||||
|
||||
|
||||
def test_map_str():
|
||||
# GH 2786
|
||||
df = DataFrame(np.random.default_rng(2).random((3, 4)))
|
||||
df2 = df.copy()
|
||||
cols = ["a", "a", "a", "a"]
|
||||
df.columns = cols
|
||||
|
||||
expected = df2.map(str)
|
||||
expected.columns = cols
|
||||
result = df.map(str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "col, val",
    [["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]],
)
def test_map_datetimelike(col, val):
    """``map(str)`` on a datetime/timedelta column matches ``str`` of the scalar."""
    # datetime/timedelta
    df = DataFrame(np.random.default_rng(2).random((3, 4)))
    df[col] = val
    result = df.map(str)
    assert result.loc[0, col] == str(df.loc[0, col])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "expected",
    [
        DataFrame(),
        DataFrame(columns=list("ABC")),
        DataFrame(index=list("ABC")),
        DataFrame({"A": [], "B": [], "C": []}),
    ],
)
@pytest.mark.parametrize("func", [round, lambda x: x])
def test_map_empty(expected, func):
    """Mapping over an empty frame returns an equal frame."""
    # GH 8222
    result = expected.map(func)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_map_kwargs():
|
||||
# GH 40652
|
||||
result = DataFrame([[1, 2], [3, 4]]).map(lambda x, y: x + y, y=2)
|
||||
expected = DataFrame([[3, 4], [5, 6]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_map_na_ignore(float_frame):
    """With ``na_action="ignore"`` masked cells stay NA instead of being mapped."""
    # GH 23803
    strlen_frame = float_frame.map(lambda x: len(str(x)))
    float_frame_with_na = float_frame.copy()
    mask = np.random.default_rng(2).integers(0, 2, size=float_frame.shape, dtype=bool)
    float_frame_with_na[mask] = pd.NA
    strlen_frame_na_ignore = float_frame_with_na.map(
        lambda x: len(str(x)), na_action="ignore"
    )
    # Set float64 type to avoid upcast when setting NA below
    strlen_frame_with_na = strlen_frame.copy().astype("float64")
    strlen_frame_with_na[mask] = pd.NA
    tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na)
|
||||
|
||||
|
||||
def test_map_box_timestamps():
|
||||
# GH 2689, GH 2627
|
||||
ser = Series(date_range("1/1/2000", periods=10))
|
||||
|
||||
def func(x):
|
||||
return (x.hour, x.day, x.month)
|
||||
|
||||
# it works!
|
||||
DataFrame(ser).map(func)
|
||||
|
||||
|
||||
def test_map_box():
|
||||
# ufunc will not be boxed. Same test cases as the test_map_box
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")],
|
||||
"b": [
|
||||
Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
],
|
||||
"c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")],
|
||||
"d": [
|
||||
pd.Period("2011-01-01", freq="M"),
|
||||
pd.Period("2011-01-02", freq="M"),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
result = df.map(lambda x: type(x).__name__)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": ["Timestamp", "Timestamp"],
|
||||
"b": ["Timestamp", "Timestamp"],
|
||||
"c": ["Timedelta", "Timedelta"],
|
||||
"d": ["Period", "Period"],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_map_dont_convert_datetime64():
|
||||
df = DataFrame({"x1": [datetime(1996, 1, 1)]})
|
||||
|
||||
df = df.map(lambda x: x + BDay())
|
||||
df = df.map(lambda x: x + BDay())
|
||||
|
||||
result = df.x1.dtype
|
||||
assert result == "M8[ns]"
|
||||
|
||||
|
||||
def test_map_function_runs_once():
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
values = [] # Save values function is applied to
|
||||
|
||||
def reducing_function(val):
|
||||
values.append(val)
|
||||
|
||||
def non_reducing_function(val):
|
||||
values.append(val)
|
||||
return val
|
||||
|
||||
for func in [reducing_function, non_reducing_function]:
|
||||
del values[:]
|
||||
|
||||
df.map(func)
|
||||
assert values == df.a.to_list()
|
||||
|
||||
|
||||
def test_map_type():
|
||||
# GH 46719
|
||||
df = DataFrame(
|
||||
{"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]},
|
||||
index=["a", "b", "c"],
|
||||
)
|
||||
|
||||
result = df.map(type)
|
||||
expected = DataFrame(
|
||||
{"col1": [int, str, type], "col2": [float, datetime, float]},
|
||||
index=["a", "b", "c"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_map_invalid_na_action(float_frame):
    """An unsupported ``na_action`` value raises ValueError naming the bad value."""
    # GH 23803
    with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"):
        float_frame.map(lambda x: len(str(x)), na_action="abc")
|
||||
|
||||
|
||||
def test_applymap_deprecated():
|
||||
# GH52353
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
msg = "DataFrame.applymap has been deprecated. Use DataFrame.map instead."
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.applymap(lambda x: x)
|
@ -0,0 +1,98 @@
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestMatMul:
    """Tests for the ``@`` operator (``operator.matmul``) with DataFrame operands."""

    def test_matmul(self):
        """Check ``@`` against ``np.dot`` for several operand types and dtypes."""
        # matmul test is for GH#10259
        a = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=["a", "b", "c"],
            columns=["p", "q", "r", "s"],
        )
        b = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)),
            index=["p", "q", "r", "s"],
            columns=["one", "two"],
        )

        # DataFrame @ DataFrame
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # DataFrame @ Series
        result = operator.matmul(a, b.one)
        expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

        # np.array @ DataFrame
        result = operator.matmul(a.values, b)
        assert isinstance(result, DataFrame)
        assert result.columns.equals(b.columns)
        assert result.index.equals(Index(range(3)))
        expected = np.dot(a.values, b.values)
        tm.assert_almost_equal(result.values, expected)

        # nested list @ DataFrame (__rmatmul__)
        result = operator.matmul(a.values.tolist(), b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_almost_equal(result.values, expected.values)

        # mixed dtype DataFrame @ DataFrame
        a["q"] = a.q.round().astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # different dtypes DataFrame @ DataFrame
        a = a.astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # unaligned
        df = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=[1, 2, 3],
            columns=range(4),
        )
        df2 = DataFrame(
            np.random.default_rng(2).standard_normal((5, 3)),
            index=range(5),
            columns=[1, 2, 3],
        )

        with pytest.raises(ValueError, match="aligned"):
            operator.matmul(df, df2)

    def test_matmul_message_shapes(self):
        """The misalignment error reports the operands' original shapes."""
        # GH#21581 exception message should reflect original shapes,
        # not transposed shapes
        a = np.random.default_rng(2).random((10, 4))
        b = np.random.default_rng(2).random((5, 3))

        df = DataFrame(b)

        msg = r"shapes \(10, 4\) and \(5, 3\) not aligned"
        with pytest.raises(ValueError, match=msg):
            a @ df
        with pytest.raises(ValueError, match=msg):
            a.tolist() @ df
|
@ -0,0 +1,250 @@
|
||||
"""
|
||||
Note: for naming purposes, most tests are titled e.g. "test_nlargest_foo"
|
||||
but are implicitly also testing nsmallest_foo.
|
||||
"""
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.util.version import Version
|
||||
|
||||
|
||||
@pytest.fixture
def df_duplicates():
    """Return a five-row integer frame whose index labels repeat (0, 0, 1, 1, 1)."""
    return pd.DataFrame(
        {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]},
        index=[0, 0, 1, 1, 1],
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df_strings():
    """Return a ten-row frame with an int, a string and a float64 column."""
    return pd.DataFrame(
        {
            "a": np.random.default_rng(2).permutation(10),
            "b": list(ascii_lowercase[:10]),
            "c": np.random.default_rng(2).permutation(10).astype("float64"),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df_main_dtypes():
    """Return a three-row frame with one column per commonly used dtype."""
    return pd.DataFrame(
        {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": pd.Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": pd.date_range("20130101", periods=3),
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        },
        columns=[
            "group",
            "int",
            "float",
            "string",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ],
    )
|
||||
|
||||
|
||||
class TestNLargestNSmallest:
    """Tests for ``DataFrame.nlargest`` and ``DataFrame.nsmallest``."""

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize(
        "order",
        [
            ["a"],
            ["c"],
            ["a", "b"],
            ["a", "c"],
            ["b", "a"],
            ["b", "c"],
            ["a", "b", "c"],
            ["c", "a", "b"],
            ["c", "b", "a"],
            ["b", "c", "a"],
            ["b", "a", "c"],
            # dups!
            ["b", "c", "c"],
        ],
    )
    @pytest.mark.parametrize("n", range(1, 11))
    def test_nlargest_n(self, df_strings, nselect_method, n, order):
        """Selection matches ``sort_values(...).head(n)``; string column ``b`` raises.

        ``nselect_method`` is a fixture defined outside this file chunk; the
        body treats it as the method name to look up on the frame.
        """
        # GH#10393
        df = df_strings
        if "b" in order:
            error_msg = (
                "Column 'b' has dtype (object|string), "
                f"cannot use method '{nselect_method}' with this dtype"
            )
            with pytest.raises(TypeError, match=error_msg):
                getattr(df, nselect_method)(n, order)
        else:
            ascending = nselect_method == "nsmallest"
            result = getattr(df, nselect_method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns", [["group", "category_string"], ["group", "string"]]
    )
    def test_nlargest_error(self, df_main_dtypes, nselect_method, columns):
        """Selecting on a string or categorical column raises a descriptive TypeError."""
        df = df_main_dtypes
        col = columns[1]
        error_msg = (
            f"Column '{col}' has dtype {df[col].dtype}, "
            f"cannot use method '{nselect_method}' with this dtype"
        )
        # escape some characters that may be in the repr
        error_msg = (
            error_msg.replace("(", "\\(")
            .replace(")", "\\)")
            .replace("[", "\\[")
            .replace("]", "\\]")
        )
        with pytest.raises(TypeError, match=error_msg):
            getattr(df, nselect_method)(2, columns)

    def test_nlargest_all_dtypes(self, df_main_dtypes):
        """Every column except the two string-like ones can be used for selection."""
        df = df_main_dtypes
        # Keep the frame's column order so the run does not depend on set ordering.
        excluded = {"category_string", "string"}
        columns = [col for col in df if col not in excluded]
        df.nsmallest(2, columns)
        df.nlargest(2, columns)

    def test_nlargest_duplicates_on_starter_columns(self):
        """Ties in the first column are broken by the second column."""
        # regression test for GH#22752

        df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]})

        result = df.nlargest(4, columns=["a", "b"])
        expected = pd.DataFrame(
            {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3]
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(4, columns=["a", "b"])
        expected = pd.DataFrame(
            {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0]
        )
        tm.assert_frame_equal(result, expected)

    def test_nlargest_n_identical_values(self):
        """With all-equal sort values the first ``n`` rows are returned."""
        # GH#15297
        df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]})

        result = df.nlargest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2])
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "order",
        [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]],
    )
    @pytest.mark.parametrize("n", range(1, 6))
    def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request):
        """Selection on a frame with repeated index labels matches sort + head."""
        # GH#13412

        df = df_duplicates
        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        # Same truth table as the original condition, with the grouping explicit.
        unstable_case = (order == ["a"] and n in (1, 2, 3, 4)) or (
            order == ["a", "b"] and n == 5
        )
        if Version(np.__version__) >= Version("1.25") and unstable_case:
            request.applymarker(
                pytest.mark.xfail(
                    reason=(
                        "pandas default unstable sorting of duplicates "
                        "issue with numpy>=1.25 with AVX instructions"
                    ),
                    strict=False,
                )
            )
        tm.assert_frame_equal(result, expected)

    def test_nlargest_duplicate_keep_all_ties(self):
        """``keep="all"`` returns every row tied with the last selected value."""
        # GH#16818
        df = pd.DataFrame(
            {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]}
        )
        result = df.nlargest(4, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(2, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_nlargest_multiindex_column_lookup(self):
        """A tuple passed as the column is treated as one MultiIndex column key."""
        # Check whether tuples are correctly treated as multi-level lookups.
        # GH#23033
        df = pd.DataFrame(
            columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]),
            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]],
        )

        # nsmallest
        result = df.nsmallest(3, ("x", "a"))
        expected = df.iloc[[2, 0, 3]]
        tm.assert_frame_equal(result, expected)

        # nlargest
        result = df.nlargest(3, ("x", "b"))
        expected = df.iloc[[3, 2, 1]]
        tm.assert_frame_equal(result, expected)

    def test_nlargest_nan(self):
        """``nlargest`` on a column containing NaN matches a descending sort + head."""
        # GH#43060
        df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3])
        result = df.nlargest(5, 0)
        expected = df.sort_values(0, ascending=False).head(5)
        tm.assert_frame_equal(result, expected)

    def test_nsmallest_nan_after_n_element(self):
        """A missing value in the sort column is not among the ``n`` smallest rows."""
        # GH#46589
        df = pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5, None, 7],
                "b": [7, 6, 5, 4, 3, 2, 1],
                "c": [1, 1, 2, 2, 3, 3, 3],
            },
            index=range(7),
        )
        result = df.nsmallest(5, columns=["a", "b"])
        expected = pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5],
                "b": [7, 6, 5, 4, 3],
                "c": [1, 1, 2, 2, 3],
            },
            index=range(5),
        ).astype({"a": "float"})
        tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,180 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFramePctChange:
    """Tests for ``pct_change`` on DataFrame (and Series where parametrized)."""

    @pytest.mark.parametrize(
        "periods, fill_method, limit, exp",
        [
            (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]),
            (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]),
            (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]),
            (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]),
            (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]),
            (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]),
            (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]),
            (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]),
        ],
    )
    def test_pct_change_with_nas(
        self, periods, fill_method, limit, exp, frame_or_series
    ):
        """Check each fill_method/limit combination on data with NaN at both ends."""
        vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
        obj = frame_or_series(vals)

        # The warning text names the concrete class, so build it from ``obj``.
        msg = (
            "The 'fill_method' keyword being not None and the 'limit' keyword in "
            f"{type(obj).__name__}.pct_change are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res = obj.pct_change(periods=periods, fill_method=fill_method, limit=limit)
        tm.assert_equal(res, frame_or_series(exp))

    def test_pct_change_numeric(self):
        """Compare ``fill_method="pad"`` with an explicit ffill/shift on both axes."""
        # GH#11150
        pnl = DataFrame(
            [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)]
        ).astype(np.float64)
        pnl.iat[1, 0] = np.nan
        pnl.iat[1, 1] = np.nan
        pnl.iat[2, 3] = 60

        msg = (
            "The 'fill_method' keyword being not None and the 'limit' keyword in "
            "DataFrame.pct_change are deprecated"
        )

        for axis in range(2):
            expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1

            with tm.assert_produces_warning(FutureWarning, match=msg):
                result = pnl.pct_change(axis=axis, fill_method="pad")
            tm.assert_frame_equal(result, expected)

    def test_pct_change(self, datetime_frame):
        """Check ``pct_change`` with no fill, a period count, bfill and a freq."""
        msg = (
            "The 'fill_method' keyword being not None and the 'limit' keyword in "
            "DataFrame.pct_change are deprecated"
        )

        rs = datetime_frame.pct_change(fill_method=None)
        tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1)

        rs = datetime_frame.pct_change(2)
        filled = datetime_frame.ffill()
        tm.assert_frame_equal(rs, filled / filled.shift(2) - 1)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            rs = datetime_frame.pct_change(fill_method="bfill", limit=1)
        filled = datetime_frame.bfill(limit=1)
        tm.assert_frame_equal(rs, filled / filled.shift(1) - 1)

        rs = datetime_frame.pct_change(freq="5D")
        filled = datetime_frame.ffill()
        tm.assert_frame_equal(
            rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled)
        )

    def test_pct_change_shift_over_nas(self):
        """Check the default call on data with an interior NaN, and its warning."""
        s = Series([1.0, 1.5, np.nan, 2.5, 3.0])

        df = DataFrame({"a": s, "b": s})

        msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            chg = df.pct_change()

        expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2])
        edf = DataFrame({"a": expected, "b": expected})
        tm.assert_frame_equal(chg, edf)

    @pytest.mark.parametrize(
        "freq, periods, fill_method, limit",
        [
            ("5B", 5, None, None),
            ("3B", 3, None, None),
            ("3B", 3, "bfill", None),
            ("7B", 7, "pad", 1),
            ("7B", 7, "bfill", 3),
            ("14B", 14, None, None),
        ],
    )
    def test_pct_change_periods_freq(
        self, datetime_frame, freq, periods, fill_method, limit
    ):
        """Check that ``freq=`` and the matching ``periods`` give equal results."""
        msg = (
            "The 'fill_method' keyword being not None and the 'limit' keyword in "
            "DataFrame.pct_change are deprecated"
        )

        # GH#7292
        with tm.assert_produces_warning(FutureWarning, match=msg):
            rs_freq = datetime_frame.pct_change(
                freq=freq, fill_method=fill_method, limit=limit
            )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            rs_periods = datetime_frame.pct_change(
                periods, fill_method=fill_method, limit=limit
            )
        tm.assert_frame_equal(rs_freq, rs_periods)

        # Same comparison on a frame that has the labels but no data.
        empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            rs_freq = empty_ts.pct_change(
                freq=freq, fill_method=fill_method, limit=limit
            )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            rs_periods = empty_ts.pct_change(
                periods, fill_method=fill_method, limit=limit
            )
        tm.assert_frame_equal(rs_freq, rs_periods)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fill_method", ["pad", "ffill", None])
def test_pct_change_with_duplicated_indices(fill_method):
    """Check ``pct_change`` on a frame whose index labels repeat."""
    # GH30463
    data = DataFrame(
        {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3
    )

    # A warning is expected only when a fill method is actually passed.
    warn = None if fill_method is None else FutureWarning
    msg = (
        "The 'fill_method' keyword being not None and the 'limit' keyword in "
        "DataFrame.pct_change are deprecated"
    )
    with tm.assert_produces_warning(warn, match=msg):
        result = data.pct_change(fill_method=fill_method)

    if fill_method is None:
        second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0]
    else:
        second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0]
    expected = DataFrame(
        {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column},
        index=["a", "b"] * 3,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_pct_change_none_beginning_no_warning():
    """Check the default ``pct_change`` result when only the first row has None."""
    # GH#54481
    df = DataFrame(
        [
            [1, None],
            [2, 1],
            [3, 2],
            [4, 3],
            [5, 4],
        ]
    )
    result = df.pct_change()
    expected = DataFrame(
        {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]}
    )
    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,39 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestPipe:
|
||||
def test_pipe(self, frame_or_series):
|
||||
obj = DataFrame({"A": [1, 2, 3]})
|
||||
expected = DataFrame({"A": [1, 4, 9]})
|
||||
if frame_or_series is Series:
|
||||
obj = obj["A"]
|
||||
expected = expected["A"]
|
||||
|
||||
f = lambda x, y: x**y
|
||||
result = obj.pipe(f, 2)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_pipe_tuple(self, frame_or_series):
|
||||
obj = DataFrame({"A": [1, 2, 3]})
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
f = lambda x, y: y
|
||||
result = obj.pipe((f, "y"), 0)
|
||||
tm.assert_equal(result, obj)
|
||||
|
||||
def test_pipe_tuple_error(self, frame_or_series):
|
||||
obj = DataFrame({"A": [1, 2, 3]})
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
f = lambda x, y: y
|
||||
|
||||
msg = "y is both the pipe target and a keyword argument"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.pipe((f, "y"), x=1, y=0)
|
@ -0,0 +1,72 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFramePop:
|
||||
def test_pop(self, float_frame, warn_copy_on_write):
|
||||
float_frame.columns.name = "baz"
|
||||
|
||||
float_frame.pop("A")
|
||||
assert "A" not in float_frame
|
||||
|
||||
float_frame["foo"] = "bar"
|
||||
float_frame.pop("foo")
|
||||
assert "foo" not in float_frame
|
||||
assert float_frame.columns.name == "baz"
|
||||
|
||||
# gh-10912: inplace ops cause caching issue
|
||||
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"])
|
||||
b = a.pop("B")
|
||||
with tm.assert_cow_warning(warn_copy_on_write):
|
||||
b += 1
|
||||
|
||||
# original frame
|
||||
expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"])
|
||||
tm.assert_frame_equal(a, expected)
|
||||
|
||||
# result
|
||||
expected = Series([2, 5], index=["X", "Y"], name="B") + 1
|
||||
tm.assert_series_equal(b, expected)
|
||||
|
||||
def test_pop_non_unique_cols(self):
|
||||
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
|
||||
df.columns = ["a", "b", "a"]
|
||||
|
||||
res = df.pop("a")
|
||||
assert type(res) == DataFrame
|
||||
assert len(res) == 2
|
||||
assert len(df.columns) == 1
|
||||
assert "b" in df.columns
|
||||
assert "a" not in df.columns
|
||||
assert len(df.index) == 2
|
||||
|
||||
def test_mixed_depth_pop(self):
|
||||
arrays = [
|
||||
["a", "top", "top", "routine1", "routine1", "routine2"],
|
||||
["", "OD", "OD", "result1", "result2", "result1"],
|
||||
["", "wx", "wy", "", "", ""],
|
||||
]
|
||||
|
||||
tuples = sorted(zip(*arrays))
|
||||
index = MultiIndex.from_tuples(tuples)
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
|
||||
|
||||
df1 = df.copy()
|
||||
df2 = df.copy()
|
||||
result = df1.pop("a")
|
||||
expected = df2.pop(("a", "", ""))
|
||||
tm.assert_series_equal(expected, result, check_names=False)
|
||||
tm.assert_frame_equal(df1, df2)
|
||||
assert result.name == "a"
|
||||
|
||||
expected = df1["top"]
|
||||
df1 = df1.drop(["top"], axis=1)
|
||||
result = df2.pop("top")
|
||||
tm.assert_frame_equal(expected, result)
|
||||
tm.assert_frame_equal(df1, df2)
|
@ -0,0 +1,972 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[["linear", "single"], ["nearest", "table"]], ids=lambda x: "-".join(x)
)
def interp_method(request):
    """(interpolation, method) arguments for quantile"""
    # Each param is a two-item list; the test id joins the items with "-".
    return request.param
|
||||
|
||||
|
||||
class TestDataFrameQuantile:
|
||||
@pytest.mark.parametrize(
    "df,expected",
    [
        [
            DataFrame(
                {
                    0: Series(pd.arrays.SparseArray([1, 2])),
                    1: Series(pd.arrays.SparseArray([3, 4])),
                }
            ),
            Series([1.5, 3.5], name=0.5),
        ],
        [
            DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")),
            Series([1.0], name=0.5),
        ],
    ],
)
def test_quantile_sparse(self, df, expected):
    """Check the default ``quantile()`` on frames with sparse columns."""
    # GH#17198
    # GH#24600
    result = df.quantile()
    # The parametrized expectation is dense; cast it before comparing.
    expected = expected.astype("Sparse[float]")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_quantile(
|
||||
self, datetime_frame, interp_method, using_array_manager, request
|
||||
):
|
||||
interpolation, method = interp_method
|
||||
df = datetime_frame
|
||||
result = df.quantile(
|
||||
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = Series(
|
||||
[np.percentile(df[col], 10) for col in df.columns],
|
||||
index=df.columns,
|
||||
name=0.1,
|
||||
)
|
||||
if interpolation == "linear":
|
||||
# np.percentile values only comparable to linear interpolation
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
tm.assert_index_equal(result.index, expected.index)
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
using_array_manager, reason="Name set incorrectly for arraymanager"
|
||||
)
|
||||
)
|
||||
assert result.name == expected.name
|
||||
|
||||
result = df.quantile(
|
||||
0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = Series(
|
||||
[np.percentile(df.loc[date], 90) for date in df.index],
|
||||
index=df.index,
|
||||
name=0.9,
|
||||
)
|
||||
if interpolation == "linear":
|
||||
# np.percentile values only comparable to linear interpolation
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
tm.assert_index_equal(result.index, expected.index)
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
using_array_manager, reason="Name set incorrectly for arraymanager"
|
||||
)
|
||||
)
|
||||
assert result.name == expected.name
|
||||
|
||||
def test_empty(self, interp_method):
|
||||
interpolation, method = interp_method
|
||||
q = DataFrame({"x": [], "y": []}).quantile(
|
||||
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
||||
|
||||
def test_non_numeric_exclusion(self, interp_method, request, using_array_manager):
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
|
||||
rs = df.quantile(
|
||||
0.5, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
xp = df.median(numeric_only=True).rename(0.5)
|
||||
if interpolation == "nearest":
|
||||
xp = (xp + 0.5).astype(np.int64)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
tm.assert_series_equal(rs, xp)
|
||||
|
||||
def test_axis(self, interp_method, request, using_array_manager):
|
||||
# axis
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
|
||||
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
||||
if interpolation == "nearest":
|
||||
expected = expected.astype(np.int64)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(
|
||||
[0.5, 0.75], axis=1, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = DataFrame(
|
||||
{1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
|
||||
)
|
||||
if interpolation == "nearest":
|
||||
expected.iloc[0, :] -= 0.5
|
||||
expected.iloc[1, :] += 0.25
|
||||
expected = expected.astype(np.int64)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
def test_axis_numeric_only_true(self, interp_method, request, using_array_manager):
|
||||
# We may want to break API in the future to change this
|
||||
# so that we exclude non-numeric along the same axis
|
||||
# See GH #7312
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame([[1, 2, 3], ["a", "b", 4]])
|
||||
result = df.quantile(
|
||||
0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
|
||||
if interpolation == "nearest":
|
||||
expected = expected.astype(np.int64)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_quantile_date_range(self, interp_method, request, using_array_manager):
|
||||
# GH 2460
|
||||
interpolation, method = interp_method
|
||||
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
|
||||
ser = Series(dti)
|
||||
df = DataFrame(ser)
|
||||
|
||||
result = df.quantile(
|
||||
numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = Series(
|
||||
["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
|
||||
)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_quantile_axis_mixed(self, interp_method, request, using_array_manager):
    """Check ``axis=1`` on a mixed-dtype frame, with and without ``numeric_only``."""
    # mixed on axis=1
    interpolation, method = interp_method
    df = DataFrame(
        {
            "A": [1, 2, 3],
            "B": [2.0, 3.0, 4.0],
            "C": pd.date_range("20130101", periods=3),
            "D": ["foo", "bar", "baz"],
        }
    )
    result = df.quantile(
        0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method
    )
    expected = Series([1.5, 2.5, 3.5], name=0.5)
    if interpolation == "nearest":
        expected -= 0.5
    if method == "table" and using_array_manager:
        request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
    tm.assert_series_equal(result, expected)

    # must raise
    msg = "'<' not supported between instances of 'Timestamp' and 'float'"
    with pytest.raises(TypeError, match=msg):
        df.quantile(0.5, axis=1, numeric_only=False)
|
||||
|
||||
def test_quantile_axis_parameter(self, interp_method, request, using_array_manager):
    """Check integer and string ``axis`` values, plus two invalid ones."""
    # GH 9543/9544
    interpolation, method = interp_method
    if method == "table" and using_array_manager:
        request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method)

    expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
    if interpolation == "nearest":
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    # axis="index" must agree with the axis=0 result above.
    expected = df.quantile(
        0.5, axis="index", interpolation=interpolation, method=method
    )
    if interpolation == "nearest":
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)

    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    if interpolation == "nearest":
        expected = expected.astype(np.int64)
    tm.assert_series_equal(result, expected)

    # axis="columns" is compared against the same expectation as axis=1.
    result = df.quantile(
        0.5, axis="columns", interpolation=interpolation, method=method
    )
    tm.assert_series_equal(result, expected)

    msg = "No axis named -1 for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        df.quantile(0.1, axis=-1, interpolation=interpolation, method=method)
    msg = "No axis named column for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        df.quantile(0.1, axis="column")
|
||||
|
||||
def test_quantile_interpolation(self):
|
||||
# see gh-10174
|
||||
|
||||
# interpolation method other than default linear
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
||||
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# cross-check interpolation=nearest results in original dtype
|
||||
exp = np.percentile(
|
||||
np.array([[1, 2, 3], [2, 3, 4]]),
|
||||
0.5,
|
||||
axis=0,
|
||||
method="nearest",
|
||||
)
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# float
|
||||
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
||||
expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
exp = np.percentile(
|
||||
np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
|
||||
0.5,
|
||||
axis=0,
|
||||
method="nearest",
|
||||
)
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# axis
|
||||
result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
|
||||
expected = DataFrame(
|
||||
{1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# test degenerate case
|
||||
df = DataFrame({"x": [], "y": []})
|
||||
q = df.quantile(0.1, axis=0, interpolation="higher")
|
||||
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
||||
|
||||
# multi
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
||||
result = df.quantile([0.25, 0.5], interpolation="midpoint")
|
||||
|
||||
# https://github.com/numpy/numpy/issues/7163
|
||||
expected = DataFrame(
|
||||
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
||||
index=[0.25, 0.5],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_interpolation_datetime(self, datetime_frame):
|
||||
# see gh-10174
|
||||
|
||||
# interpolation = linear (default case)
|
||||
df = datetime_frame
|
||||
q = df.quantile(0.1, axis=0, numeric_only=True, interpolation="linear")
|
||||
assert q["A"] == np.percentile(df["A"], 10)
|
||||
|
||||
def test_quantile_interpolation_int(self, int_frame):
|
||||
# see gh-10174
|
||||
|
||||
df = int_frame
|
||||
# interpolation = linear (default case)
|
||||
q = df.quantile(0.1)
|
||||
assert q["A"] == np.percentile(df["A"], 10)
|
||||
|
||||
# test with and without interpolation keyword
|
||||
q1 = df.quantile(0.1, axis=0, interpolation="linear")
|
||||
assert q1["A"] == np.percentile(df["A"], 10)
|
||||
tm.assert_series_equal(q, q1)
|
||||
|
||||
def test_quantile_multi(self, interp_method, request, using_array_manager):
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
||||
result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method)
|
||||
expected = DataFrame(
|
||||
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
||||
index=[0.25, 0.5],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
if interpolation == "nearest":
|
||||
expected = expected.astype(np.int64)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager):
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
||||
result = df.quantile(
|
||||
[0.25, 0.5], axis=1, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2]
|
||||
)
|
||||
if interpolation == "nearest":
|
||||
expected = expected.astype(np.int64)
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_multi_empty(self, interp_method):
|
||||
interpolation, method = interp_method
|
||||
result = DataFrame({"x": [], "y": []}).quantile(
|
||||
[0.1, 0.9], axis=0, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_datetime(self, unit):
|
||||
dti = pd.to_datetime(["2010", "2011"]).as_unit(unit)
|
||||
df = DataFrame({"a": dti, "b": [0, 5]})
|
||||
|
||||
# exclude datetime
|
||||
result = df.quantile(0.5, numeric_only=True)
|
||||
expected = Series([2.5], index=["b"], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# datetime
|
||||
result = df.quantile(0.5, numeric_only=False)
|
||||
expected = Series(
|
||||
[Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# datetime w/ multi
|
||||
result = df.quantile([0.5], numeric_only=False)
|
||||
expected = DataFrame(
|
||||
{"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5},
|
||||
index=[0.5],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit)
|
||||
result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
|
||||
expected = Series(
|
||||
[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
|
||||
index=[0, 1],
|
||||
name=0.5,
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
|
||||
expected = DataFrame(
|
||||
[[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
|
||||
index=[0.5],
|
||||
columns=[0, 1],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# empty when numeric_only=True
|
||||
result = df[["a", "c"]].quantile(0.5, numeric_only=True)
|
||||
expected = Series([], index=[], dtype=np.float64, name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df[["a", "c"]].quantile([0.5], numeric_only=True)
|
||||
expected = DataFrame(index=[0.5], columns=[])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        "datetime64[ns]",
        "datetime64[ns, US/Pacific]",
        "timedelta64[ns]",
        "Period[D]",
    ],
)
def test_quantile_dt64_empty(self, dtype, interp_method):
    """Check ``axis=1`` quantiles on a zero-row frame of datetime-like dtype."""
    # GH#41544
    interpolation, method = interp_method
    df = DataFrame(columns=["a", "b"], dtype=dtype)

    res = df.quantile(
        0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method
    )
    expected = Series([], index=[], name=0.5, dtype=dtype)
    tm.assert_series_equal(res, expected)

    # no columns in result, so no dtype preservation
    res = df.quantile(
        [0.5],
        axis=1,
        numeric_only=False,
        interpolation=interpolation,
        method=method,
    )
    expected = DataFrame(index=[0.5], columns=[])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
@pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]])
def test_quantile_invalid(self, invalid, datetime_frame, interp_method):
    """Check that quantiles outside [0, 1] raise ``ValueError``."""
    # ``msg`` is used as a regex, hence the escaped brackets.
    msg = "percentiles should all be in the interval \\[0, 1\\]"
    interpolation, method = interp_method
    with pytest.raises(ValueError, match=msg):
        datetime_frame.quantile(invalid, interpolation=interpolation, method=method)
|
||||
|
||||
def test_quantile_box(self, interp_method, request, using_array_manager):
|
||||
interpolation, method = interp_method
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
Timestamp("2011-01-01"),
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-03"),
|
||||
],
|
||||
"B": [
|
||||
Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"C": [
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
|
||||
exp = Series(
|
||||
[
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
],
|
||||
name=0.5,
|
||||
index=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile(
|
||||
[0.5], numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = DataFrame(
|
||||
[
|
||||
[
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
]
|
||||
],
|
||||
index=[0.5],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_box_nat(self):
|
||||
# DatetimeLikeBlock may be consolidated and contain NaT in different loc
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
Timestamp("2011-01-01"),
|
||||
pd.NaT,
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-03"),
|
||||
],
|
||||
"a": [
|
||||
Timestamp("2011-01-01"),
|
||||
Timestamp("2011-01-02"),
|
||||
pd.NaT,
|
||||
Timestamp("2011-01-03"),
|
||||
],
|
||||
"B": [
|
||||
Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
pd.NaT,
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"b": [
|
||||
Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.NaT,
|
||||
Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"C": [
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
pd.NaT,
|
||||
],
|
||||
"c": [
|
||||
pd.NaT,
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
],
|
||||
},
|
||||
columns=list("AaBbCc"),
|
||||
)
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series(
|
||||
[
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
],
|
||||
name=0.5,
|
||||
index=list("AaBbCc"),
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame(
|
||||
[
|
||||
[
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
]
|
||||
],
|
||||
index=[0.5],
|
||||
columns=list("AaBbCc"),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nan(self, interp_method, request, using_array_manager):
|
||||
interpolation, method = interp_method
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
# GH 14357 - float block where some cols have missing values
|
||||
df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
|
||||
df.iloc[-1, 1] = np.nan
|
||||
|
||||
res = df.quantile(0.5, interpolation=interpolation, method=method)
|
||||
exp = Series(
|
||||
[3.0, 2.5 if interpolation == "linear" else 3.0], index=["a", "b"], name=0.5
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
|
||||
exp = DataFrame(
|
||||
{
|
||||
"a": [3.0, 4.0],
|
||||
"b": [2.5, 3.25] if interpolation == "linear" else [3.0, 4.0],
|
||||
},
|
||||
index=[0.5, 0.75],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
|
||||
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile(
|
||||
[0.5, 0.75], axis=1, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
||||
if interpolation == "nearest":
|
||||
exp.iloc[1, -1] = np.nan
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# full-nan column
|
||||
df["b"] = np.nan
|
||||
|
||||
res = df.quantile(0.5, interpolation=interpolation, method=method)
|
||||
exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
|
||||
exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nat(self, interp_method, request, using_array_manager, unit):
|
||||
interpolation, method = interp_method
|
||||
if method == "table" and using_array_manager:
|
||||
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
|
||||
# full NaT column
|
||||
df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]")
|
||||
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]")
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile(
|
||||
[0.5], numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# mixed non-null / full null column
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [
|
||||
Timestamp("2012-01-01"),
|
||||
Timestamp("2012-01-02"),
|
||||
Timestamp("2012-01-03"),
|
||||
],
|
||||
"b": [pd.NaT, pd.NaT, pd.NaT],
|
||||
},
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = Series(
|
||||
[Timestamp("2012-01-02"), pd.NaT],
|
||||
index=["a", "b"],
|
||||
name=0.5,
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile(
|
||||
[0.5], numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = DataFrame(
|
||||
[[Timestamp("2012-01-02"), pd.NaT]],
|
||||
index=[0.5],
|
||||
columns=["a", "b"],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_empty_no_rows_floats(self, interp_method):
|
||||
interpolation, method = interp_method
|
||||
|
||||
df = DataFrame(columns=["a", "b"], dtype="float64")
|
||||
|
||||
res = df.quantile(0.5, interpolation=interpolation, method=method)
|
||||
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], interpolation=interpolation, method=method)
|
||||
exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
|
||||
exp = Series([], index=[], dtype="float64", name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method)
|
||||
exp = DataFrame(columns=[], index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_empty_no_rows_ints(self, interp_method):
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame(columns=["a", "b"], dtype="int64")
|
||||
|
||||
res = df.quantile(0.5, interpolation=interpolation, method=method)
|
||||
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_quantile_empty_no_rows_dt64(self, interp_method):
|
||||
interpolation, method = interp_method
|
||||
# datetimes
|
||||
df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
|
||||
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = Series(
|
||||
[pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# Mixed dt64/dt64tz
|
||||
df["a"] = df["a"].dt.tz_localize("US/Central")
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = exp.astype(object)
|
||||
if interpolation == "nearest":
|
||||
# GH#18463 TODO: would we prefer NaTs here?
|
||||
msg = "The 'downcast' keyword in fillna is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
exp = exp.fillna(np.nan, downcast=False)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# both dt64tz
|
||||
df["b"] = df["b"].dt.tz_localize("US/Central")
|
||||
res = df.quantile(
|
||||
0.5, numeric_only=False, interpolation=interpolation, method=method
|
||||
)
|
||||
exp = exp.astype(df["b"].dtype)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_quantile_empty_no_columns(self, interp_method):
|
||||
# GH#23925 _get_numeric_data may drop all columns
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame(pd.date_range("1/1/18", periods=5))
|
||||
df.columns.name = "captain tightpants"
|
||||
result = df.quantile(
|
||||
0.5, numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = Series([], index=[], name=0.5, dtype=np.float64)
|
||||
expected.index.name = "captain tightpants"
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(
|
||||
[0.5], numeric_only=True, interpolation=interpolation, method=method
|
||||
)
|
||||
expected = DataFrame([], index=[0.5], columns=[])
|
||||
expected.columns.name = "captain tightpants"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_item_cache(
|
||||
self, using_array_manager, interp_method, using_copy_on_write
|
||||
):
|
||||
# previous behavior incorrect retained an invalid _item_cache entry
|
||||
interpolation, method = interp_method
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
|
||||
)
|
||||
df["D"] = df["A"] * 2
|
||||
ser = df["A"]
|
||||
if not using_array_manager:
|
||||
assert len(df._mgr.blocks) == 2
|
||||
|
||||
df.quantile(numeric_only=False, interpolation=interpolation, method=method)
|
||||
|
||||
if using_copy_on_write:
|
||||
ser.iloc[0] = 99
|
||||
assert df.iloc[0, 0] == df["A"][0]
|
||||
assert df.iloc[0, 0] != 99
|
||||
else:
|
||||
ser.values[0] = 99
|
||||
assert df.iloc[0, 0] == df["A"][0]
|
||||
assert df.iloc[0, 0] == 99
|
||||
|
||||
def test_invalid_method(self):
    """An unknown ``method`` value makes ``quantile`` raise ValueError."""
    frame = DataFrame(range(1))
    with pytest.raises(ValueError, match="Invalid method: foo"):
        frame.quantile(0.5, method="foo")
|
||||
|
||||
def test_table_invalid_interpolation(self):
    """With ``method="table"`` an unknown interpolation raises ValueError."""
    frame = DataFrame(range(1))
    with pytest.raises(ValueError, match="Invalid interpolation: foo"):
        frame.quantile(0.5, method="table", interpolation="foo")
|
||||
|
||||
|
||||
class TestQuantileExtensionDtype:
|
||||
# TODO: tests for axis=1?
|
||||
# TODO: empty case?
|
||||
|
||||
@pytest.fixture(
    params=[
        pytest.param(
            pd.IntervalIndex.from_breaks(range(10)),
            marks=pytest.mark.xfail(reason="raises when trying to add Intervals"),
        ),
        pd.period_range("2016-01-01", periods=9, freq="D"),
        pd.date_range("2016-01-01", periods=9, tz="US/Pacific"),
        pd.timedelta_range("1 Day", periods=9),
        pd.array(np.arange(9), dtype="Int64"),
        pd.array(np.arange(9), dtype="Float64"),
    ],
    ids=lambda param: str(param.dtype),
)
def index(self, request):
    """Fixture yielding each extension-dtype container with ``.name`` set to "A"."""
    # NB: not actually an Index object
    values = request.param
    values.name = "A"
    return values
|
||||
|
||||
@pytest.fixture
def obj(self, index, frame_or_series):
    """Fixture wrapping ``index`` in a Series or DataFrame labelled "A"."""
    # bc index is not always an Index (yet), we need to re-patch .name
    wrapped = frame_or_series(index).copy()

    if frame_or_series is not Series:
        wrapped.columns = ["A"]
    else:
        wrapped.name = "A"
    return wrapped
|
||||
|
||||
def compute_quantile(self, obj, qs):
    """Return ``obj.quantile(qs)``, passing ``numeric_only=False`` when obj is not a Series."""
    if isinstance(obj, Series):
        return obj.quantile(qs)
    return obj.quantile(qs, numeric_only=False)
|
||||
|
||||
def test_quantile_ea(self, request, obj, index):
    """Median, minimum and maximum quantiles of a shuffled extension-dtype object."""
    # result should be invariant to shuffling
    order = np.arange(len(index), dtype=np.intp)
    np.random.default_rng(2).shuffle(order)
    shuffled = obj.iloc[order]

    qs = [0.5, 0, 1]
    result = self.compute_quantile(shuffled, qs)

    # match non-nullable casting behavior
    exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

    # expected here assumes len(index) == 9
    picked = [index[4], index[0], index[-1]]
    expected = Series(picked, dtype=exp_dtype, index=qs, name="A")
    expected = type(shuffled)(expected)

    tm.assert_equal(result, expected)
|
||||
|
||||
def test_quantile_ea_with_na(self, obj, index):
    """Quantiles after the first and last entries are replaced by the NA value."""
    na_value = index._na_value
    obj.iloc[0] = na_value
    obj.iloc[-1] = na_value

    # result should be invariant to shuffling
    order = np.arange(len(index), dtype=np.intp)
    np.random.default_rng(2).shuffle(order)
    shuffled = obj.iloc[order]

    qs = [0.5, 0, 1]
    result = self.compute_quantile(shuffled, qs)

    # expected here assumes len(index) == 9
    picked = [index[4], index[1], index[-2]]
    expected = Series(picked, dtype=index.dtype, index=qs, name="A")
    expected = type(shuffled)(expected)
    tm.assert_equal(result, expected)
|
||||
|
||||
def test_quantile_ea_all_na(self, request, obj, index):
    """Quantiles of an object whose every entry was set to the NA value."""
    na_value = index._na_value
    obj.iloc[:] = na_value
    # Check dtypes were preserved; this was once a problem see GH#39763
    assert np.all(obj.dtypes == index.dtype)

    # result should be invariant to shuffling
    order = np.arange(len(index), dtype=np.intp)
    np.random.default_rng(2).shuffle(order)
    shuffled = obj.iloc[order]

    qs = [0.5, 0, 1]
    result = self.compute_quantile(shuffled, qs)

    # three NA entries with the original dtype
    all_na = index.take([-1, -1, -1], allow_fill=True, fill_value=na_value)
    expected = Series(all_na, index=qs, name="A")
    expected = type(shuffled)(expected)
    tm.assert_equal(result, expected)
|
||||
|
||||
def test_quantile_ea_scalar(self, request, obj, index):
    """Median of a shuffled extension-dtype object requested with a scalar quantile."""
    # scalar qs

    # result should be invariant to shuffling
    order = np.arange(len(index), dtype=np.intp)
    np.random.default_rng(2).shuffle(order)
    shuffled = obj.iloc[order]

    qs = 0.5
    result = self.compute_quantile(shuffled, qs)

    exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

    expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
    if not isinstance(shuffled, Series):
        tm.assert_series_equal(result, expected)
    else:
        # a Series input reduces to a single scalar
        expected = expected["A"]
        assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype, expected_data, expected_index, axis",
    [
        ["float64", [], [], 1],
        ["int64", [], [], 1],
        ["float64", [np.nan, np.nan], ["a", "b"], 0],
        ["int64", [np.nan, np.nan], ["a", "b"], 0],
    ],
)
def test_empty_numeric(self, dtype, expected_data, expected_index, axis):
    """Median of a row-less numeric frame along either axis is a float64 Series."""
    # GH 14564
    empty = DataFrame(columns=["a", "b"], dtype=dtype)
    expected = Series(
        expected_data, name=0.5, index=Index(expected_index), dtype="float64"
    )
    result = empty.quantile(0.5, axis=axis)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype, expected_data, expected_index, axis, expected_dtype",
    [
        ["datetime64[ns]", [], [], 1, "datetime64[ns]"],
        ["datetime64[ns]", [pd.NaT, pd.NaT], ["a", "b"], 0, "datetime64[ns]"],
    ],
)
def test_empty_datelike(
    self, dtype, expected_data, expected_index, axis, expected_dtype
):
    """Median of a row-less datetime frame with ``numeric_only=False`` keeps the datetime dtype."""
    # GH 14564
    empty = DataFrame(columns=["a", "b"], dtype=dtype)
    expected = Series(
        expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype
    )
    result = empty.quantile(0.5, axis=axis, numeric_only=False)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "expected_data, expected_index, axis",
    [
        [[np.nan, np.nan], range(2), 1],
        [[], [], 0],
    ],
)
def test_datelike_numeric_only(self, expected_data, expected_index, axis):
    """``numeric_only=True`` on a selection of datetime columns only."""
    # GH 14564
    frame = DataFrame(
        {
            "a": pd.to_datetime(["2010", "2011"]),
            "b": [0, 5],
            "c": pd.to_datetime(["2011", "2012"]),
        }
    )
    datelike = frame[["a", "c"]]
    expected = Series(
        expected_data, name=0.5, index=Index(expected_index), dtype=np.float64
    )
    result = datelike.quantile(0.5, axis=axis, numeric_only=True)
    tm.assert_series_equal(result, expected)
|
@ -0,0 +1,510 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.algos import (
|
||||
Infinity,
|
||||
NegInfinity,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestRank:
    # Shared input: ten values with ties and two NaNs, used as both columns.
    s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
    df = DataFrame(dict(A=s, B=s))

    # Expected ranks of ``s`` keyed by tie-breaking method name.
    results = dict(
        average=np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
        min=np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
        max=np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
        first=np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
        dense=np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
    )
|
||||
|
||||
@pytest.fixture(params=["average", "min", "max", "first", "dense"])
def method(self, request):
    """
    Parametrized fixture returning each rank method name in turn
    """
    return request.param
|
||||
|
||||
def test_rank(self, float_frame):
    """Compare ``DataFrame.rank`` on both axes against ``scipy.stats.rankdata``."""
    sp_stats = pytest.importorskip("scipy.stats")

    # blank out every 2nd/3rd/4th/5th row of columns A/B/C/D respectively
    for step, column in zip((2, 3, 4, 5), "ABCD"):
        float_frame.loc[::step, column] = np.nan

    ranks0 = float_frame.rank()
    ranks1 = float_frame.rank(1)
    nan_mask = np.isnan(float_frame.values)

    # NaN becomes +inf for scipy and is masked back to NaN afterwards
    filled = float_frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(sp_stats.rankdata, 0, filled)
    exp0[nan_mask] = np.nan

    exp1 = np.apply_along_axis(sp_stats.rankdata, 1, filled)
    exp1[nan_mask] = np.nan

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # integers
    int_frame = DataFrame(
        np.random.default_rng(2).integers(0, 5, size=40).reshape((10, 4))
    )
    as_float = int_frame.astype(float)

    tm.assert_frame_equal(int_frame.rank(), as_float.rank())
    tm.assert_frame_equal(int_frame.rank(1), as_float.rank(1))
|
||||
|
||||
def test_rank2(self):
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
|
||||
result = df.rank(1, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = df.rank(0) / 2.0
|
||||
result = df.rank(0, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
|
||||
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [
|
||||
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
|
||||
]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check the rank
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
|
||||
result = df.rank(1, numeric_only=False, ascending=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
|
||||
result = df.rank(1, numeric_only=False, ascending=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
|
||||
exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
|
||||
tm.assert_frame_equal(df.rank(), exp)
|
||||
|
||||
def test_rank_does_not_mutate(self):
|
||||
# GH#18521
|
||||
# Check rank does not mutate DataFrame
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 3)), dtype="float64"
|
||||
)
|
||||
expected = df.copy()
|
||||
df.rank()
|
||||
result = df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rank_mixed_frame(self, float_string_frame):
    """Column-wise rank of a mixed frame runs; row-wise rank raises TypeError."""
    extra_columns = {
        "datetime": datetime.now(),
        "timedelta": timedelta(days=1, seconds=1),
    }
    for name, value in extra_columns.items():
        float_string_frame[name] = value

    float_string_frame.rank(numeric_only=False)
    with pytest.raises(TypeError, match="not supported between instances of"):
        float_string_frame.rank(axis=1)
|
||||
|
||||
def test_rank_na_option(self, float_frame):
    """Check ``na_option`` "top"/"bottom" in both sort directions against scipy."""
    sp_stats = pytest.importorskip("scipy.stats")
    rankdata = sp_stats.rankdata

    # blank out every 2nd/3rd/4th/5th row of columns A/B/C/D respectively
    for step, column in zip((2, 3, 4, 5), "ABCD"):
        float_frame.loc[::step, column] = np.nan

    def _nan_below_minimum():
        # Replace NaN by (minimum - 1) along columns and along rows.
        by_column = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        by_row = float_frame.T
        by_row = by_row.fillna((by_row.min() - 1).to_dict()).T
        by_row = by_row.fillna(np.inf).values
        return by_column, by_row

    # bottom
    ranks0 = float_frame.rank(na_option="bottom")
    ranks1 = float_frame.rank(1, na_option="bottom")

    fvals = float_frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(rankdata, 0, fvals)
    exp1 = np.apply_along_axis(rankdata, 1, fvals)

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # top
    ranks0 = float_frame.rank(na_option="top")
    ranks1 = float_frame.rank(1, na_option="top")

    fval0, fval1 = _nan_below_minimum()

    exp0 = np.apply_along_axis(rankdata, 0, fval0)
    exp1 = np.apply_along_axis(rankdata, 1, fval1)

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # descending

    # bottom
    ranks0 = float_frame.rank(na_option="top", ascending=False)
    ranks1 = float_frame.rank(1, na_option="top", ascending=False)

    fvals = float_frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(rankdata, 0, -fvals)
    exp1 = np.apply_along_axis(rankdata, 1, -fvals)

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # descending

    # top
    ranks0 = float_frame.rank(na_option="bottom", ascending=False)
    ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)

    fval0, fval1 = _nan_below_minimum()

    exp0 = np.apply_along_axis(rankdata, 0, -fval0)
    exp1 = np.apply_along_axis(rankdata, 1, -fval1)

    tm.assert_numpy_array_equal(ranks0.values, exp0)
    tm.assert_numpy_array_equal(ranks1.values, exp1)

    # bad values throw error
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        float_frame.rank(na_option="bad", ascending=False)

    # invalid type
    with pytest.raises(ValueError, match=msg):
        float_frame.rank(na_option=True, ascending=False)
|
||||
|
||||
def test_rank_axis(self):
|
||||
# check if using axes' names gives the same result
|
||||
df = DataFrame([[2, 1], [4, 3]])
|
||||
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
|
||||
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
|
||||
|
||||
@pytest.mark.parametrize("ax", [0, 1])
@pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"])
def test_rank_methods_frame(self, ax, m):
    """Compare every rank method on both axes with ``scipy.stats.rankdata``."""
    sp_stats = pytest.importorskip("scipy.stats")
    # scipy calls pandas' "first" method "ordinal"
    scipy_method = "ordinal" if m == "first" else m

    base = np.random.default_rng(2).integers(0, 21, (100, 26))
    base = (base - 10.0) / 10.0
    cols = [chr(ord("z") - i) for i in range(base.shape[1])]

    # same data at three different magnitudes
    for vals in [base, base + 1e6, base * 1e-6]:
        frame = DataFrame(vals, columns=cols)

        result = frame.rank(axis=ax, method=m)
        sprank = np.apply_along_axis(sp_stats.rankdata, ax, vals, scipy_method)
        sprank = sprank.astype(np.float64)
        expected = DataFrame(sprank, columns=cols).astype("float64")
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
def test_rank_descending(self, method, dtype):
    """Descending rank must equal the ascending rank of ``max - values``."""
    # integer dtypes cannot hold NaN, so drop the missing rows first
    source = self.df.dropna() if "i" in dtype else self.df
    frame = source.astype(dtype)

    result = frame.rank(ascending=False)
    expected = (frame.max() - frame).rank()
    tm.assert_frame_equal(result, expected)

    expected = (frame.max() - frame).rank(method=method)

    if dtype != "O":
        numeric_result = frame.rank(
            method=method, ascending=False, numeric_only=True
        )
        tm.assert_frame_equal(numeric_result, expected)

    full_result = frame.rank(method=method, ascending=False, numeric_only=False)
    tm.assert_frame_equal(full_result, expected)
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("dtype", [None, object])
def test_rank_2d_tie_methods(self, method, axis, dtype):
    """Each tie method yields the class-level expected ranks on either axis."""
    frame = self.df if dtype is None else self.df.astype(dtype)
    ranks = self.results[method]
    expected = DataFrame({"A": ranks, "B": ranks})

    if axis == 1:
        # rank along rows of the transposed data instead
        frame = frame.T
        expected = expected.T

    result = frame.rank(method=method, axis=axis)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "method,exp",
    [
        ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
        (
            "min",
            [
                [1.0 / 3, 1.0, 1.0],
                [1.0 / 3, 1.0 / 3, 2.0 / 3],
                [1.0 / 3, 1.0 / 3, 1.0 / 3],
            ],
        ),
        (
            "max",
            [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
        ),
        (
            "average",
            [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
        ),
        (
            "first",
            [
                [1.0 / 3, 1.0, 1.0],
                [2.0 / 3, 1.0 / 3, 2.0 / 3],
                [3.0 / 3, 2.0 / 3, 1.0 / 3],
            ],
        ),
    ],
)
def test_rank_pct_true(self, method, exp):
    """Percentage ranks of a small frame with ties, for every tie method."""
    # see gh-15630.
    frame = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
    expected = DataFrame(exp)

    result = frame.rank(method=method, pct=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.single_cpu
def test_pct_max_many_rows(self):
    """The largest percentage rank is exactly 1 for a frame of 2**24 + 1 rows."""
    # GH 18271
    n_rows = 2**24 + 1
    frame = DataFrame({"A": np.arange(n_rows), "B": np.arange(n_rows, 0, -1)})
    top_ranks = frame.rank(pct=True).max()
    assert (top_ranks == 1).all()
|
||||
|
||||
@pytest.mark.parametrize(
    "contents,dtype",
    [
        (
            [
                -np.inf,
                -50,
                -1,
                -1e-20,
                -1e-25,
                -1e-50,
                0,
                1e-40,
                1e-20,
                1e-10,
                2,
                40,
                np.inf,
            ],
            "float64",
        ),
        (
            [
                -np.inf,
                -50,
                -1,
                -1e-20,
                -1e-25,
                -1e-45,
                0,
                1e-40,
                1e-20,
                1e-10,
                2,
                40,
                np.inf,
            ],
            "float32",
        ),
        ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
        (
            [
                np.iinfo(np.int64).min,
                -100,
                0,
                1,
                9999,
                100000,
                1e10,
                np.iinfo(np.int64).max,
            ],
            "int64",
        ),
        ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
        (
            [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
            "datetime64",
        ),
    ],
)
def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
    """Rank already-sorted values after inserting missing values and shuffling."""
    # missing-value marker for each dtype that can hold one
    dtype_na_map = {
        "float64": np.nan,
        "float32": np.nan,
        "object": None,
        "datetime64": np.datetime64("nat"),
    }
    # Insert nans at random positions if underlying dtype has missing
    # value. Then adjust the expected order by adding nans accordingly
    # This is for testing whether rank calculation is affected
    # when values are intertwined with nan values.
    values = np.array(contents, dtype=dtype)
    exp_order = np.array(range(len(values)), dtype="float64") + 1.0
    if dtype in dtype_na_map:
        na_value = dtype_na_map[dtype]
        positions = np.random.default_rng(2).choice(range(len(values)), 5)
        values = np.insert(values, positions, na_value)
        exp_order = np.insert(exp_order, positions, np.nan)

    # Shuffle the testing array and expected results in the same way
    shuffle = np.random.default_rng(2).permutation(len(values))
    obj = frame_or_series(values[shuffle])
    expected = frame_or_series(exp_order[shuffle], dtype="float64")
    result = obj.rank()
    tm.assert_equal(result, expected)
|
||||
|
||||
def test_df_series_inf_nan_consistency(self):
|
||||
# GH#32593
|
||||
index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
|
||||
col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
|
||||
col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
|
||||
df = DataFrame(
|
||||
data={
|
||||
"col1": col1,
|
||||
"col2": col2,
|
||||
},
|
||||
index=index,
|
||||
dtype="f8",
|
||||
)
|
||||
df_result = df.rank()
|
||||
|
||||
series_result = df.copy()
|
||||
series_result["col1"] = df["col1"].rank()
|
||||
series_result["col2"] = df["col2"].rank()
|
||||
|
||||
tm.assert_frame_equal(df_result, series_result)
|
||||
|
||||
def test_rank_both_inf(self):
|
||||
# GH#32593
|
||||
df = DataFrame({"a": [-np.inf, 0, np.inf]})
|
||||
expected = DataFrame({"a": [1.0, 2.0, 3.0]})
|
||||
result = df.rank()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "na_option,ascending,expected",
    [
        ("top", True, [3.0, 1.0, 2.0]),
        ("top", False, [2.0, 1.0, 3.0]),
        ("bottom", True, [2.0, 3.0, 1.0]),
        ("bottom", False, [1.0, 3.0, 2.0]),
    ],
)
def test_rank_inf_nans_na_option(
    self, frame_or_series, method, na_option, ascending, expected
):
    """Rank ``[inf, nan, -inf]`` for each ``na_option`` and sort direction."""
    values = frame_or_series([np.inf, np.nan, -np.inf])
    result = values.rank(method=method, na_option=na_option, ascending=ascending)
    tm.assert_equal(result, frame_or_series(expected))
|
||||
|
||||
@pytest.mark.parametrize(
    "na_option,ascending,expected",
    [
        ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
        ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
        ("top", True, [2.0, 3.0, 1.0, 4.0]),
        ("top", False, [2.0, 3.0, 1.0, 4.0]),
    ],
)
def test_rank_object_first(
    self, frame_or_series, na_option, ascending, expected, using_infer_string
):
    """``method="first"`` on tied strings with one missing entry."""
    values = frame_or_series(["foo", "foo", None, "foo"])
    result = values.rank(method="first", na_option=na_option, ascending=ascending)
    expected = frame_or_series(expected)
    if isinstance(values, Series) and using_infer_string:
        expected = expected.astype("uint64")
    tm.assert_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            {"a": [1, 2, "a"], "b": [4, 5, 6]},
            DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)),
        ),
        ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])),
    ],
)
def test_rank_mixed_axis_zero(self, data, expected):
    """Ranking a mixed int/str column raises; ``numeric_only=True`` ranks the rest."""
    column_labels = Index(list(data.keys()), dtype=object)
    frame = DataFrame(data, columns=column_labels)
    with pytest.raises(TypeError, match="'<' not supported between instances of"):
        frame.rank()
    result = frame.rank(numeric_only=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype, exp_dtype",
    [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
)
def test_rank_string_dtype(self, dtype, exp_dtype):
    """``method="first"`` on a pyarrow-backed string Series with one missing entry."""
    # GH#55362
    pytest.importorskip("pyarrow")
    strings = Series(["foo", "foo", None, "foo"], dtype=dtype)
    expected = Series([1, 2, None, 3], dtype=exp_dtype)
    result = strings.rank(method="first")
    tm.assert_series_equal(result, expected)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,39 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameReindexLike:
|
||||
def test_reindex_like(self, float_frame):
|
||||
other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"])
|
||||
|
||||
tm.assert_frame_equal(other, float_frame.reindex_like(other))
|
||||
|
||||
@pytest.mark.parametrize(
    "method,expected_values",
    [
        ("nearest", [0, 1, 1, 2]),
        ("pad", [np.nan, 0, 1, 1]),
        ("backfill", [0, 1, 2, 2]),
    ],
)
def test_reindex_like_methods(self, method, expected_values):
    """Reindexing a frame like itself returns an equal frame for each fill method."""
    frame = DataFrame({"x": list(range(5))})

    # scalar tolerance first, then a list-like tolerance
    for tolerance in (0, [0, 0, 0, 0]):
        result = frame.reindex_like(frame, method=method, tolerance=tolerance)
        tm.assert_frame_equal(frame, result)
|
||||
|
||||
def test_reindex_like_subclass(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/31925
|
||||
class MyDataFrame(DataFrame):
|
||||
pass
|
||||
|
||||
expected = DataFrame()
|
||||
df = MyDataFrame()
|
||||
result = df.reindex_like(expected)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,415 @@
|
||||
from collections import ChainMap
|
||||
import inspect
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
merge,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestRename:
|
||||
def test_rename_signature(self):
|
||||
sig = inspect.signature(DataFrame.rename)
|
||||
parameters = set(sig.parameters)
|
||||
assert parameters == {
|
||||
"self",
|
||||
"mapper",
|
||||
"index",
|
||||
"columns",
|
||||
"axis",
|
||||
"inplace",
|
||||
"copy",
|
||||
"level",
|
||||
"errors",
|
||||
}
|
||||
|
||||
def test_rename_mi(self, frame_or_series):
|
||||
obj = frame_or_series(
|
||||
[11, 21, 31],
|
||||
index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]),
|
||||
)
|
||||
obj.rename(str.lower)
|
||||
|
||||
def test_rename(self, float_frame):
|
||||
mapping = {"A": "a", "B": "b", "C": "c", "D": "d"}
|
||||
|
||||
renamed = float_frame.rename(columns=mapping)
|
||||
renamed2 = float_frame.rename(columns=str.lower)
|
||||
|
||||
tm.assert_frame_equal(renamed, renamed2)
|
||||
tm.assert_frame_equal(
|
||||
renamed2.rename(columns=str.upper), float_frame, check_names=False
|
||||
)
|
||||
|
||||
# index
|
||||
data = {"A": {"foo": 0, "bar": 1}}
|
||||
|
||||
df = DataFrame(data)
|
||||
renamed = df.rename(index={"foo": "bar", "bar": "foo"})
|
||||
tm.assert_index_equal(renamed.index, Index(["bar", "foo"]))
|
||||
|
||||
renamed = df.rename(index=str.upper)
|
||||
tm.assert_index_equal(renamed.index, Index(["FOO", "BAR"]))
|
||||
|
||||
# have to pass something
|
||||
with pytest.raises(TypeError, match="must pass an index to rename"):
|
||||
float_frame.rename()
|
||||
|
||||
# partial columns
|
||||
renamed = float_frame.rename(columns={"C": "foo", "D": "bar"})
|
||||
tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"]))
|
||||
|
||||
# other axis
|
||||
renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"})
|
||||
tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"]))
|
||||
|
||||
# index with name
|
||||
index = Index(["foo", "bar"], name="name")
|
||||
renamer = DataFrame(data, index=index)
|
||||
renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
|
||||
tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name"))
|
||||
assert renamed.index.name == renamer.index.name
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"args,kwargs",
|
||||
[
|
||||
((ChainMap({"A": "a"}, {"B": "b"}),), {"axis": "columns"}),
|
||||
((), {"columns": ChainMap({"A": "a"}, {"B": "b"})}),
|
||||
],
|
||||
)
|
||||
def test_rename_chainmap(self, args, kwargs):
|
||||
# see gh-23859
|
||||
colAData = range(1, 11)
|
||||
colBdata = np.random.default_rng(2).standard_normal(10)
|
||||
|
||||
df = DataFrame({"A": colAData, "B": colBdata})
|
||||
result = df.rename(*args, **kwargs)
|
||||
|
||||
expected = DataFrame({"a": colAData, "b": colBdata})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_multiindex(self):
|
||||
tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
|
||||
tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
|
||||
index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
|
||||
columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])
|
||||
df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
|
||||
|
||||
#
|
||||
# without specifying level -> across all levels
|
||||
|
||||
renamed = df.rename(
|
||||
index={"foo1": "foo3", "bar2": "bar3"},
|
||||
columns={"fizz1": "fizz3", "buzz2": "buzz3"},
|
||||
)
|
||||
new_index = MultiIndex.from_tuples(
|
||||
[("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"]
|
||||
)
|
||||
new_columns = MultiIndex.from_tuples(
|
||||
[("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
|
||||
)
|
||||
tm.assert_index_equal(renamed.index, new_index)
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
assert renamed.index.names == df.index.names
|
||||
assert renamed.columns.names == df.columns.names
|
||||
|
||||
#
|
||||
# with specifying a level (GH13766)
|
||||
|
||||
# dict
|
||||
new_columns = MultiIndex.from_tuples(
|
||||
[("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"]
|
||||
)
|
||||
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
|
||||
new_columns = MultiIndex.from_tuples(
|
||||
[("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
|
||||
)
|
||||
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
|
||||
# function
|
||||
func = str.upper
|
||||
new_columns = MultiIndex.from_tuples(
|
||||
[("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"]
|
||||
)
|
||||
renamed = df.rename(columns=func, level=0)
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
renamed = df.rename(columns=func, level="fizz")
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
|
||||
new_columns = MultiIndex.from_tuples(
|
||||
[("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"]
|
||||
)
|
||||
renamed = df.rename(columns=func, level=1)
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
renamed = df.rename(columns=func, level="buzz")
|
||||
tm.assert_index_equal(renamed.columns, new_columns)
|
||||
|
||||
# index
|
||||
new_index = MultiIndex.from_tuples(
|
||||
[("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"]
|
||||
)
|
||||
renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
|
||||
tm.assert_index_equal(renamed.index, new_index)
|
||||
|
||||
def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write):
|
||||
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
|
||||
|
||||
assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)
|
||||
|
||||
with tm.assert_cow_warning(warn_copy_on_write):
|
||||
renamed.loc[:, "foo"] = 1.0
|
||||
if using_copy_on_write:
|
||||
assert not (float_frame["C"] == 1.0).all()
|
||||
else:
|
||||
assert (float_frame["C"] == 1.0).all()
|
||||
|
||||
def test_rename_inplace(self, float_frame):
|
||||
float_frame.rename(columns={"C": "foo"})
|
||||
assert "C" in float_frame
|
||||
assert "foo" not in float_frame
|
||||
|
||||
c_values = float_frame["C"]
|
||||
float_frame = float_frame.copy()
|
||||
return_value = float_frame.rename(columns={"C": "foo"}, inplace=True)
|
||||
assert return_value is None
|
||||
|
||||
assert "C" not in float_frame
|
||||
assert "foo" in float_frame
|
||||
# GH 44153
|
||||
# Used to be id(float_frame["foo"]) != c_id, but flaky in the CI
|
||||
assert float_frame["foo"] is not c_values
|
||||
|
||||
def test_rename_bug(self):
|
||||
# GH 5344
|
||||
# rename set ref_locs, and set_index was not resetting
|
||||
df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]})
|
||||
df = df.rename(columns={0: "a"})
|
||||
df = df.rename(columns={1: "b"})
|
||||
df = df.set_index(["a", "b"])
|
||||
df.columns = ["2001-01-01"]
|
||||
expected = DataFrame(
|
||||
[[1], [2]],
|
||||
index=MultiIndex.from_tuples(
|
||||
[("foo", "bah"), ("bar", "bas")], names=["a", "b"]
|
||||
),
|
||||
columns=["2001-01-01"],
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_rename_bug2(self):
|
||||
# GH 19497
|
||||
# rename was changing Index to MultiIndex if Index contained tuples
|
||||
|
||||
df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"])
|
||||
df = df.rename({(1, 1): (5, 4)}, axis="index")
|
||||
expected = DataFrame(
|
||||
data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"]
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_rename_errors_raises(self):
|
||||
df = DataFrame(columns=["A", "B", "C", "D"])
|
||||
with pytest.raises(KeyError, match="'E'] not found in axis"):
|
||||
df.rename(columns={"A": "a", "E": "e"}, errors="raise")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"mapper, errors, expected_columns",
|
||||
[
|
||||
({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]),
|
||||
({"A": "a"}, "raise", ["a", "B", "C", "D"]),
|
||||
(str.lower, "raise", ["a", "b", "c", "d"]),
|
||||
],
|
||||
)
|
||||
def test_rename_errors(self, mapper, errors, expected_columns):
|
||||
# GH 13473
|
||||
# rename now works with errors parameter
|
||||
df = DataFrame(columns=["A", "B", "C", "D"])
|
||||
result = df.rename(columns=mapper, errors=errors)
|
||||
expected = DataFrame(columns=expected_columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_objects(self, float_string_frame):
|
||||
renamed = float_string_frame.rename(columns=str.upper)
|
||||
|
||||
assert "FOO" in renamed
|
||||
assert "foo" not in renamed
|
||||
|
||||
def test_rename_axis_style(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/12392
|
||||
df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"])
|
||||
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"])
|
||||
|
||||
result = df.rename(str.lower, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename(str.lower, axis="columns")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename({"A": "a", "B": "b"}, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename({"A": "a", "B": "b"}, axis="columns")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Index
|
||||
expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"])
|
||||
result = df.rename(str.lower, axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename(str.lower, axis="index")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename({"X": "x", "Y": "y"}, axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename({"X": "x", "Y": "y"}, axis="index")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.rename(mapper=str.lower, axis="index")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_mapper_multi(self):
|
||||
df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index(
|
||||
["A", "B"]
|
||||
)
|
||||
result = df.rename(str.upper)
|
||||
expected = df.rename(index=str.upper)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_positional_named(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/12392
|
||||
df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"])
|
||||
result = df.rename(index=str.lower, columns=str.upper)
|
||||
expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_axis_style_raises(self):
|
||||
# see gh-12392
|
||||
df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"])
|
||||
|
||||
# Named target and axis
|
||||
over_spec_msg = "Cannot specify both 'axis' and any of 'index' or 'columns'"
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(index=str.lower, axis=1)
|
||||
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(index=str.lower, axis="columns")
|
||||
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(columns=str.lower, axis="columns")
|
||||
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(index=str.lower, axis=0)
|
||||
|
||||
# Multiple targets and axis
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(str.lower, index=str.lower, axis="columns")
|
||||
|
||||
# Too many targets
|
||||
over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'"
|
||||
with pytest.raises(TypeError, match=over_spec_msg):
|
||||
df.rename(str.lower, index=str.lower, columns=str.lower)
|
||||
|
||||
# Duplicates
|
||||
with pytest.raises(TypeError, match="multiple values"):
|
||||
df.rename(id, mapper=id)
|
||||
|
||||
def test_rename_positional_raises(self):
|
||||
# GH 29136
|
||||
df = DataFrame(columns=["A", "B"])
|
||||
msg = r"rename\(\) takes from 1 to 2 positional arguments"
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename(None, str.lower)
|
||||
|
||||
def test_rename_no_mappings_raises(self):
|
||||
# GH 29136
|
||||
df = DataFrame([[1]])
|
||||
msg = "must pass an index to rename"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename()
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename(None, index=None)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename(None, columns=None)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename(None, columns=None, index=None)
|
||||
|
||||
def test_rename_mapper_and_positional_arguments_raises(self):
|
||||
# GH 29136
|
||||
df = DataFrame([[1]])
|
||||
msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename({}, index={})
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename({}, columns={})
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.rename({}, columns={}, index={})
|
||||
|
||||
def test_rename_with_duplicate_columns(self):
|
||||
# GH#4403
|
||||
df4 = DataFrame(
|
||||
{"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(600809, 20130331)], names=["STK_ID", "RPT_Date"]
|
||||
),
|
||||
)
|
||||
|
||||
df5 = DataFrame(
|
||||
{
|
||||
"RPT_Date": [20120930, 20121231, 20130331],
|
||||
"STK_ID": [600809] * 3,
|
||||
"STK_Name": ["饡驦", "饡驦", "饡驦"],
|
||||
"TClose": [38.05, 41.66, 30.01],
|
||||
},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(600809, 20120930), (600809, 20121231), (600809, 20130331)],
|
||||
names=["STK_ID", "RPT_Date"],
|
||||
),
|
||||
)
|
||||
# TODO: can we construct this without merge?
|
||||
k = merge(df4, df5, how="inner", left_index=True, right_index=True)
|
||||
result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
|
||||
|
||||
expected = DataFrame(
|
||||
[[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
|
||||
columns=[
|
||||
"RT",
|
||||
"TClose",
|
||||
"TExg",
|
||||
"RPT_Date",
|
||||
"STK_ID",
|
||||
"STK_Name",
|
||||
"QT_Close",
|
||||
],
|
||||
).set_index(["STK_ID", "RPT_Date"], drop=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_boolean_index(self):
|
||||
df = DataFrame(np.arange(15).reshape(3, 5), columns=[False, True, 2, 3, 4])
|
||||
mapper = {0: "foo", 1: "bar", 2: "bah"}
|
||||
res = df.rename(index=mapper)
|
||||
exp = DataFrame(
|
||||
np.arange(15).reshape(3, 5),
|
||||
columns=[False, True, 2, 3, 4],
|
||||
index=["foo", "bar", "bah"],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
@ -0,0 +1,111 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameRenameAxis:
|
||||
def test_rename_axis_inplace(self, float_frame):
|
||||
# GH#15704
|
||||
expected = float_frame.rename_axis("foo")
|
||||
result = float_frame.copy()
|
||||
return_value = no_return = result.rename_axis("foo", inplace=True)
|
||||
assert return_value is None
|
||||
|
||||
assert no_return is None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = float_frame.rename_axis("bar", axis=1)
|
||||
result = float_frame.copy()
|
||||
return_value = no_return = result.rename_axis("bar", axis=1, inplace=True)
|
||||
assert return_value is None
|
||||
|
||||
assert no_return is None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rename_axis_raises(self):
|
||||
# GH#17833
|
||||
df = DataFrame({"A": [1, 2], "B": [1, 2]})
|
||||
with pytest.raises(ValueError, match="Use `.rename`"):
|
||||
df.rename_axis(id, axis=0)
|
||||
|
||||
with pytest.raises(ValueError, match="Use `.rename`"):
|
||||
df.rename_axis({0: 10, 1: 20}, axis=0)
|
||||
|
||||
with pytest.raises(ValueError, match="Use `.rename`"):
|
||||
df.rename_axis(id, axis=1)
|
||||
|
||||
with pytest.raises(ValueError, match="Use `.rename`"):
|
||||
df["A"].rename_axis(id)
|
||||
|
||||
def test_rename_axis_mapper(self):
|
||||
# GH#19978
|
||||
mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"])
|
||||
df = DataFrame(
|
||||
{"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi
|
||||
)
|
||||
|
||||
# Test for rename of the Index object of columns
|
||||
result = df.rename_axis("cols", axis=1)
|
||||
tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols"))
|
||||
|
||||
# Test for rename of the Index object of columns using dict
|
||||
result = result.rename_axis(columns={"cols": "new"}, axis=1)
|
||||
tm.assert_index_equal(result.columns, Index(["x", "y"], name="new"))
|
||||
|
||||
# Test for renaming index using dict
|
||||
result = df.rename_axis(index={"ll": "foo"})
|
||||
assert result.index.names == ["foo", "nn"]
|
||||
|
||||
# Test for renaming index using a function
|
||||
result = df.rename_axis(index=str.upper, axis=0)
|
||||
assert result.index.names == ["LL", "NN"]
|
||||
|
||||
# Test for renaming index providing complete list
|
||||
result = df.rename_axis(index=["foo", "goo"])
|
||||
assert result.index.names == ["foo", "goo"]
|
||||
|
||||
# Test for changing index and columns at same time
|
||||
sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"])
|
||||
result = sdf.rename_axis(index="foo", columns="meh")
|
||||
assert result.index.name == "foo"
|
||||
assert result.columns.name == "meh"
|
||||
|
||||
# Test different error cases
|
||||
with pytest.raises(TypeError, match="Must pass"):
|
||||
df.rename_axis(index="wrong")
|
||||
|
||||
with pytest.raises(ValueError, match="Length of names"):
|
||||
df.rename_axis(index=["wrong"])
|
||||
|
||||
with pytest.raises(TypeError, match="bogus"):
|
||||
df.rename_axis(bogus=None)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs, rename_index, rename_columns",
|
||||
[
|
||||
({"mapper": None, "axis": 0}, True, False),
|
||||
({"mapper": None, "axis": 1}, False, True),
|
||||
({"index": None}, True, False),
|
||||
({"columns": None}, False, True),
|
||||
({"index": None, "columns": None}, True, True),
|
||||
({}, False, False),
|
||||
],
|
||||
)
|
||||
def test_rename_axis_none(self, kwargs, rename_index, rename_columns):
|
||||
# GH 25034
|
||||
index = Index(list("abc"), name="foo")
|
||||
columns = Index(["col1", "col2"], name="bar")
|
||||
data = np.arange(6).reshape(3, 2)
|
||||
df = DataFrame(data, index, columns)
|
||||
|
||||
result = df.rename_axis(**kwargs)
|
||||
expected_index = index.rename(None) if rename_index else index
|
||||
expected_columns = columns.rename(None) if rename_columns else columns
|
||||
expected = DataFrame(data, expected_index, expected_columns)
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,74 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestReorderLevels:
|
||||
def test_reorder_levels(self, frame_or_series):
|
||||
index = MultiIndex(
|
||||
levels=[["bar"], ["one", "two", "three"], [0, 1]],
|
||||
codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
|
||||
names=["L0", "L1", "L2"],
|
||||
)
|
||||
df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index)
|
||||
obj = tm.get_obj(df, frame_or_series)
|
||||
|
||||
# no change, position
|
||||
result = obj.reorder_levels([0, 1, 2])
|
||||
tm.assert_equal(obj, result)
|
||||
|
||||
# no change, labels
|
||||
result = obj.reorder_levels(["L0", "L1", "L2"])
|
||||
tm.assert_equal(obj, result)
|
||||
|
||||
# rotate, position
|
||||
result = obj.reorder_levels([1, 2, 0])
|
||||
e_idx = MultiIndex(
|
||||
levels=[["one", "two", "three"], [0, 1], ["bar"]],
|
||||
codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]],
|
||||
names=["L1", "L2", "L0"],
|
||||
)
|
||||
expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
|
||||
expected = tm.get_obj(expected, frame_or_series)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
result = obj.reorder_levels([0, 0, 0])
|
||||
e_idx = MultiIndex(
|
||||
levels=[["bar"], ["bar"], ["bar"]],
|
||||
codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
|
||||
names=["L0", "L0", "L0"],
|
||||
)
|
||||
expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx)
|
||||
expected = tm.get_obj(expected, frame_or_series)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
result = obj.reorder_levels(["L0", "L0", "L0"])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_reorder_levels_swaplevel_equivalence(
|
||||
self, multiindex_year_month_day_dataframe_random_data
|
||||
):
|
||||
ymd = multiindex_year_month_day_dataframe_random_data
|
||||
|
||||
result = ymd.reorder_levels(["month", "day", "year"])
|
||||
expected = ymd.swaplevel(0, 1).swaplevel(1, 2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = ymd["A"].reorder_levels(["month", "day", "year"])
|
||||
expected = ymd["A"].swaplevel(0, 1).swaplevel(1, 2)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = ymd.T.reorder_levels(["month", "day", "year"], axis=1)
|
||||
expected = ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(TypeError, match="hierarchical axis"):
|
||||
ymd.reorder_levels([1, 2], axis=1)
|
||||
|
||||
with pytest.raises(IndexError, match="Too many levels"):
|
||||
ymd.index.reorder_levels([1, 2, 3])
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,782 @@
|
||||
from datetime import datetime
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_float_dtype,
|
||||
is_integer_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
RangeIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
cut,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture()
def multiindex_df():
    """2x2 integer frame with two-level columns ``("A", "")`` and ``("B", "b")``."""
    column_tuples = [["A", ""], ["B", "b"]]
    return DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(column_tuples))
|
||||
|
||||
|
||||
class TestResetIndex:
|
||||
def test_reset_index_empty_rangeindex(self):
|
||||
# GH#45230
|
||||
df = DataFrame(
|
||||
columns=["brand"], dtype=np.int64, index=RangeIndex(0, 0, 1, name="foo")
|
||||
)
|
||||
|
||||
df2 = df.set_index([df.index, "brand"])
|
||||
|
||||
result = df2.reset_index([1], drop=True)
|
||||
tm.assert_frame_equal(result, df[[]], check_index_type=True)
|
||||
|
||||
def test_set_reset(self):
|
||||
idx = Index([2**63, 2**63 + 5, 2**63 + 10], name="foo")
|
||||
|
||||
# set/reset
|
||||
df = DataFrame({"A": [0, 1, 2]}, index=idx)
|
||||
result = df.reset_index()
|
||||
assert result["foo"].dtype == np.dtype("uint64")
|
||||
|
||||
df = result.set_index("foo")
|
||||
tm.assert_index_equal(df.index, idx)
|
||||
|
||||
def test_set_index_reset_index_dt64tz(self):
|
||||
idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
|
||||
|
||||
# set/reset
|
||||
df = DataFrame({"A": [0, 1, 2]}, index=idx)
|
||||
result = df.reset_index()
|
||||
assert result["foo"].dtype == "datetime64[ns, US/Eastern]"
|
||||
|
||||
df = result.set_index("foo")
|
||||
tm.assert_index_equal(df.index, idx)
|
||||
|
||||
def test_reset_index_tz(self, tz_aware_fixture):
|
||||
# GH 3950
|
||||
# reset_index with single level
|
||||
tz = tz_aware_fixture
|
||||
idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx")
|
||||
df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"idx": idx,
|
||||
"a": range(5),
|
||||
"b": ["A", "B", "C", "D", "E"],
|
||||
},
|
||||
columns=["idx", "a", "b"],
|
||||
)
|
||||
result = df.reset_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_frame_reset_index_tzaware_index(self, tz):
|
||||
dr = date_range("2012-06-02", periods=10, tz=tz)
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal(len(dr)), dr)
|
||||
roundtripped = df.reset_index().set_index("index")
|
||||
xp = df.index.tz
|
||||
rs = roundtripped.index.tz
|
||||
assert xp == rs
|
||||
|
||||
def test_reset_index_with_intervals(self):
|
||||
idx = IntervalIndex.from_breaks(np.arange(11), name="x")
|
||||
original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]]
|
||||
|
||||
result = original.set_index("x")
|
||||
expected = DataFrame({"y": np.arange(10)}, index=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result2 = result.reset_index()
|
||||
tm.assert_frame_equal(result2, original)
|
||||
|
||||
def test_reset_index(self, float_frame):
|
||||
stacked = float_frame.stack(future_stack=True)[::2]
|
||||
stacked = DataFrame({"foo": stacked, "bar": stacked})
|
||||
|
||||
names = ["first", "second"]
|
||||
stacked.index.names = names
|
||||
deleveled = stacked.reset_index()
|
||||
for i, (lev, level_codes) in enumerate(
|
||||
zip(stacked.index.levels, stacked.index.codes)
|
||||
):
|
||||
values = lev.take(level_codes)
|
||||
name = names[i]
|
||||
tm.assert_index_equal(values, Index(deleveled[name]))
|
||||
|
||||
stacked.index.names = [None, None]
|
||||
deleveled2 = stacked.reset_index()
|
||||
tm.assert_series_equal(
|
||||
deleveled["first"], deleveled2["level_0"], check_names=False
|
||||
)
|
||||
tm.assert_series_equal(
|
||||
deleveled["second"], deleveled2["level_1"], check_names=False
|
||||
)
|
||||
|
||||
# default name assigned
|
||||
rdf = float_frame.reset_index()
|
||||
exp = Series(float_frame.index.values, name="index")
|
||||
tm.assert_series_equal(rdf["index"], exp)
|
||||
|
||||
# default name assigned, corner case
|
||||
df = float_frame.copy()
|
||||
df["index"] = "foo"
|
||||
rdf = df.reset_index()
|
||||
exp = Series(float_frame.index.values, name="level_0")
|
||||
tm.assert_series_equal(rdf["level_0"], exp)
|
||||
|
||||
# but this is ok
|
||||
float_frame.index.name = "index"
|
||||
deleveled = float_frame.reset_index()
|
||||
tm.assert_series_equal(deleveled["index"], Series(float_frame.index))
|
||||
tm.assert_index_equal(deleveled.index, Index(range(len(deleveled))), exact=True)
|
||||
|
||||
# preserve column names
|
||||
float_frame.columns.name = "columns"
|
||||
reset = float_frame.reset_index()
|
||||
assert reset.columns.name == "columns"
|
||||
|
||||
# only remove certain columns
|
||||
df = float_frame.reset_index().set_index(["index", "A", "B"])
|
||||
rs = df.reset_index(["A", "B"])
|
||||
|
||||
tm.assert_frame_equal(rs, float_frame)
|
||||
|
||||
rs = df.reset_index(["index", "A", "B"])
|
||||
tm.assert_frame_equal(rs, float_frame.reset_index())
|
||||
|
||||
rs = df.reset_index(["index", "A", "B"])
|
||||
tm.assert_frame_equal(rs, float_frame.reset_index())
|
||||
|
||||
rs = df.reset_index("A")
|
||||
xp = float_frame.reset_index().set_index(["index", "B"])
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
# test resetting in place
|
||||
df = float_frame.copy()
|
||||
reset = float_frame.reset_index()
|
||||
return_value = df.reset_index(inplace=True)
|
||||
assert return_value is None
|
||||
tm.assert_frame_equal(df, reset)
|
||||
|
||||
df = float_frame.reset_index().set_index(["index", "A", "B"])
|
||||
rs = df.reset_index("A", drop=True)
|
||||
xp = float_frame.copy()
|
||||
del xp["A"]
|
||||
xp = xp.set_index(["B"], append=True)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_reset_index_name(self):
|
||||
df = DataFrame(
|
||||
[[1, 2, 3, 4], [5, 6, 7, 8]],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=Index(range(2), name="x"),
|
||||
)
|
||||
assert df.reset_index().index.name is None
|
||||
assert df.reset_index(drop=True).index.name is None
|
||||
return_value = df.reset_index(inplace=True)
|
||||
assert return_value is None
|
||||
assert df.index.name is None
|
||||
|
||||
@pytest.mark.parametrize("levels", [["A", "B"], [0, 1]])
|
||||
def test_reset_index_level(self, levels):
|
||||
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
|
||||
|
||||
# With MultiIndex
|
||||
result = df.set_index(["A", "B"]).reset_index(level=levels[0])
|
||||
tm.assert_frame_equal(result, df.set_index("B"))
|
||||
|
||||
result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
|
||||
tm.assert_frame_equal(result, df.set_index("B"))
|
||||
|
||||
result = df.set_index(["A", "B"]).reset_index(level=levels)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
|
||||
tm.assert_frame_equal(result, df[["C", "D"]])
|
||||
|
||||
# With single-level Index (GH 16263)
|
||||
result = df.set_index("A").reset_index(level=levels[0])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = df.set_index("A").reset_index(level=levels[:1])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
|
||||
tm.assert_frame_equal(result, df[["B", "C", "D"]])
|
||||
|
||||
@pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]])
|
||||
def test_reset_index_level_missing(self, idx_lev):
|
||||
# Missing levels - for both MultiIndex and single-level Index:
|
||||
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])
|
||||
|
||||
with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
|
||||
df.set_index(idx_lev).reset_index(level=["A", "E"])
|
||||
with pytest.raises(IndexError, match="Too many levels"):
|
||||
df.set_index(idx_lev).reset_index(level=[0, 1, 2])
|
||||
|
||||
def test_reset_index_right_dtype(self):
|
||||
time = np.arange(0.0, 10, np.sqrt(2) / 2)
|
||||
s1 = Series(
|
||||
(9.81 * time**2) / 2, index=Index(time, name="time"), name="speed"
|
||||
)
|
||||
df = DataFrame(s1)
|
||||
|
||||
reset = s1.reset_index()
|
||||
assert reset["time"].dtype == np.float64
|
||||
|
||||
reset = df.reset_index()
|
||||
assert reset["time"].dtype == np.float64
|
||||
|
||||
def test_reset_index_multiindex_col(self):
|
||||
vals = np.random.default_rng(2).standard_normal((3, 3)).astype(object)
|
||||
idx = ["x", "y", "z"]
|
||||
full = np.hstack(([[x] for x in idx], vals))
|
||||
df = DataFrame(
|
||||
vals,
|
||||
Index(idx, name="a"),
|
||||
columns=[["b", "b", "c"], ["mean", "median", "mean"]],
|
||||
)
|
||||
rs = df.reset_index()
|
||||
xp = DataFrame(
|
||||
full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]]
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
rs = df.reset_index(col_fill=None)
|
||||
xp = DataFrame(
|
||||
full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]]
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
rs = df.reset_index(col_level=1, col_fill="blah")
|
||||
xp = DataFrame(
|
||||
full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]]
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
df = DataFrame(
|
||||
vals,
|
||||
MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]),
|
||||
columns=[["b", "b", "c"], ["mean", "median", "mean"]],
|
||||
)
|
||||
rs = df.reset_index("a")
|
||||
xp = DataFrame(
|
||||
full,
|
||||
Index([0, 1, 2], name="d"),
|
||||
columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]],
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
rs = df.reset_index("a", col_fill=None)
|
||||
xp = DataFrame(
|
||||
full,
|
||||
Index(range(3), name="d"),
|
||||
columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]],
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
rs = df.reset_index("a", col_fill="blah", col_level=1)
|
||||
xp = DataFrame(
|
||||
full,
|
||||
Index(range(3), name="d"),
|
||||
columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]],
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_reset_index_multiindex_nan(self):
|
||||
# GH#6322, testing reset_index on MultiIndexes
|
||||
# when we have a nan or all nan
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "c"],
|
||||
"B": [0, 1, np.nan],
|
||||
"C": np.random.default_rng(2).random(3),
|
||||
}
|
||||
)
|
||||
rs = df.set_index(["A", "B"]).reset_index()
|
||||
tm.assert_frame_equal(rs, df)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [np.nan, "b", "c"],
|
||||
"B": [0, 1, 2],
|
||||
"C": np.random.default_rng(2).random(3),
|
||||
}
|
||||
)
|
||||
rs = df.set_index(["A", "B"]).reset_index()
|
||||
tm.assert_frame_equal(rs, df)
|
||||
|
||||
df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]})
|
||||
rs = df.set_index(["A", "B"]).reset_index()
|
||||
tm.assert_frame_equal(rs, df)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "c"],
|
||||
"B": [np.nan, np.nan, np.nan],
|
||||
"C": np.random.default_rng(2).random(3),
|
||||
}
|
||||
)
|
||||
rs = df.set_index(["A", "B"]).reset_index()
|
||||
tm.assert_frame_equal(rs, df)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name",
|
||||
[
|
||||
None,
|
||||
"foo",
|
||||
2,
|
||||
3.0,
|
||||
pd.Timedelta(6),
|
||||
Timestamp("2012-12-30", tz="UTC"),
|
||||
"2012-12-31",
|
||||
],
|
||||
)
|
||||
def test_reset_index_with_datetimeindex_cols(self, name):
|
||||
# GH#5818
|
||||
df = DataFrame(
|
||||
[[1, 2], [3, 4]],
|
||||
columns=date_range("1/1/2013", "1/2/2013"),
|
||||
index=["A", "B"],
|
||||
)
|
||||
df.index.name = name
|
||||
|
||||
result = df.reset_index()
|
||||
|
||||
item = name if name is not None else "index"
|
||||
columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)])
|
||||
if isinstance(item, str) and item == "2012-12-31":
|
||||
columns = columns.astype("datetime64[ns]")
|
||||
else:
|
||||
assert columns.dtype == object
|
||||
|
||||
expected = DataFrame(
|
||||
[["A", 1, 2], ["B", 3, 4]],
|
||||
columns=columns,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_reset_index_range(self):
|
||||
# GH#12071
|
||||
df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2))
|
||||
result = df.reset_index()
|
||||
assert isinstance(result.index, RangeIndex)
|
||||
expected = DataFrame(
|
||||
[[0, 0, 0], [1, 1, 1]],
|
||||
columns=["index", "A", "B"],
|
||||
index=RangeIndex(stop=2),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_reset_index_multiindex_columns(self, multiindex_df):
|
||||
result = multiindex_df[["B"]].rename_axis("A").reset_index()
|
||||
tm.assert_frame_equal(result, multiindex_df)
|
||||
|
||||
# GH#16120: already existing column
|
||||
msg = r"cannot insert \('A', ''\), already exists"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
multiindex_df.rename_axis("A").reset_index()
|
||||
|
||||
# GH#16164: multiindex (tuple) full key
|
||||
result = multiindex_df.set_index([("A", "")]).reset_index()
|
||||
tm.assert_frame_equal(result, multiindex_df)
|
||||
|
||||
# with additional (unnamed) index level
|
||||
idx_col = DataFrame(
|
||||
[[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
|
||||
)
|
||||
expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1)
|
||||
result = multiindex_df.set_index([("B", "b")], append=True).reset_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# with index name which is a too long tuple...
|
||||
msg = "Item must have length equal to number of levels."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
multiindex_df.rename_axis([("C", "c", "i")]).reset_index()
|
||||
|
||||
# or too short...
|
||||
levels = [["A", "a", ""], ["B", "b", "i"]]
|
||||
df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
|
||||
idx_col = DataFrame(
|
||||
[[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")])
|
||||
)
|
||||
expected = pd.concat([idx_col, df2], axis=1)
|
||||
result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# ... which is incompatible with col_fill=None
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=(
|
||||
"col_fill=None is incompatible with "
|
||||
r"incomplete column name \('C', 'c'\)"
|
||||
),
|
||||
):
|
||||
df2.rename_axis([("C", "c")]).reset_index(col_fill=None)
|
||||
|
||||
# with col_level != 0
|
||||
result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("flag", [False, True])
|
||||
@pytest.mark.parametrize("allow_duplicates", [False, True])
|
||||
def test_reset_index_duplicate_columns_allow(
|
||||
self, multiindex_df, flag, allow_duplicates
|
||||
):
|
||||
# GH#44755 reset_index with duplicate column labels
|
||||
df = multiindex_df.rename_axis("A")
|
||||
df = df.set_flags(allows_duplicate_labels=flag)
|
||||
|
||||
if flag and allow_duplicates:
|
||||
result = df.reset_index(allow_duplicates=allow_duplicates)
|
||||
levels = [["A", ""], ["A", ""], ["B", "b"]]
|
||||
expected = DataFrame(
|
||||
[[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels)
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
if not flag and allow_duplicates:
|
||||
msg = (
|
||||
"Cannot specify 'allow_duplicates=True' when "
|
||||
"'self.flags.allows_duplicate_labels' is False"
|
||||
)
|
||||
else:
|
||||
msg = r"cannot insert \('A', ''\), already exists"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.reset_index(allow_duplicates=allow_duplicates)
|
||||
|
||||
@pytest.mark.parametrize("flag", [False, True])
|
||||
def test_reset_index_duplicate_columns_default(self, multiindex_df, flag):
|
||||
df = multiindex_df.rename_axis("A")
|
||||
df = df.set_flags(allows_duplicate_labels=flag)
|
||||
|
||||
msg = r"cannot insert \('A', ''\), already exists"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.reset_index()
|
||||
|
||||
@pytest.mark.parametrize("allow_duplicates", ["bad value"])
|
||||
def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates):
|
||||
with pytest.raises(ValueError, match="expected type bool"):
|
||||
multiindex_df.reset_index(allow_duplicates=allow_duplicates)
|
||||
|
||||
def test_reset_index_datetime(self, tz_naive_fixture):
|
||||
# GH#3950
|
||||
tz = tz_naive_fixture
|
||||
idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
|
||||
idx2 = Index(range(5), name="idx2", dtype="int64")
|
||||
idx = MultiIndex.from_arrays([idx1, idx2])
|
||||
df = DataFrame(
|
||||
{"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
|
||||
index=idx,
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"idx1": idx1,
|
||||
"idx2": np.arange(5, dtype="int64"),
|
||||
"a": np.arange(5, dtype="int64"),
|
||||
"b": ["A", "B", "C", "D", "E"],
|
||||
},
|
||||
columns=["idx1", "idx2", "a", "b"],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(df.reset_index(), expected)
|
||||
|
||||
def test_reset_index_datetime2(self, tz_naive_fixture):
|
||||
tz = tz_naive_fixture
|
||||
idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1")
|
||||
idx2 = Index(range(5), name="idx2", dtype="int64")
|
||||
idx3 = date_range(
|
||||
"1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3"
|
||||
)
|
||||
idx = MultiIndex.from_arrays([idx1, idx2, idx3])
|
||||
df = DataFrame(
|
||||
{"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]},
|
||||
index=idx,
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"idx1": idx1,
|
||||
"idx2": np.arange(5, dtype="int64"),
|
||||
"idx3": idx3,
|
||||
"a": np.arange(5, dtype="int64"),
|
||||
"b": ["A", "B", "C", "D", "E"],
|
||||
},
|
||||
columns=["idx1", "idx2", "idx3", "a", "b"],
|
||||
)
|
||||
result = df.reset_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_reset_index_datetime3(self, tz_naive_fixture):
|
||||
# GH#7793
|
||||
tz = tz_naive_fixture
|
||||
dti = date_range("20130101", periods=3, tz=tz)
|
||||
idx = MultiIndex.from_product([["a", "b"], dti])
|
||||
df = DataFrame(
|
||||
np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"level_0": "a a a b b b".split(),
|
||||
"level_1": dti.append(dti),
|
||||
"a": np.arange(6, dtype="int64"),
|
||||
},
|
||||
columns=["level_0", "level_1", "a"],
|
||||
)
|
||||
result = df.reset_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_reset_index_period(self):
|
||||
# GH#7746
|
||||
idx = MultiIndex.from_product(
|
||||
[pd.period_range("20130101", periods=3, freq="M"), list("abc")],
|
||||
names=["month", "feature"],
|
||||
)
|
||||
|
||||
df = DataFrame(
|
||||
np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"]
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"month": (
|
||||
[pd.Period("2013-01", freq="M")] * 3
|
||||
+ [pd.Period("2013-02", freq="M")] * 3
|
||||
+ [pd.Period("2013-03", freq="M")] * 3
|
||||
),
|
||||
"feature": ["a", "b", "c"] * 3,
|
||||
"a": np.arange(9, dtype="int64"),
|
||||
},
|
||||
columns=["month", "feature", "a"],
|
||||
)
|
||||
result = df.reset_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_reset_index_delevel_infer_dtype(self):
|
||||
tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1]))
|
||||
index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((8, 3)),
|
||||
columns=["A", "B", "C"],
|
||||
index=index,
|
||||
)
|
||||
deleveled = df.reset_index()
|
||||
assert is_integer_dtype(deleveled["prm1"])
|
||||
assert is_float_dtype(deleveled["prm2"])
|
||||
|
||||
def test_reset_index_with_drop(
|
||||
self, multiindex_year_month_day_dataframe_random_data
|
||||
):
|
||||
ymd = multiindex_year_month_day_dataframe_random_data
|
||||
|
||||
deleveled = ymd.reset_index(drop=True)
|
||||
assert len(deleveled.columns) == len(ymd.columns)
|
||||
assert deleveled.index.name == ymd.index.name
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ix_data, exp_data",
|
||||
[
|
||||
(
|
||||
[(pd.NaT, 1), (pd.NaT, 2)],
|
||||
{"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]},
|
||||
),
|
||||
(
|
||||
[(pd.NaT, 1), (Timestamp("2020-01-01"), 2)],
|
||||
{"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]},
|
||||
),
|
||||
(
|
||||
[(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)],
|
||||
{"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_reset_index_nat_multiindex(self, ix_data, exp_data):
|
||||
# GH#36541: that reset_index() does not raise ValueError
|
||||
ix = MultiIndex.from_tuples(ix_data, names=["a", "b"])
|
||||
result = DataFrame({"x": [11, 12]}, index=ix)
|
||||
result = result.reset_index()
|
||||
|
||||
expected = DataFrame(exp_data)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]])
|
||||
)
|
||||
def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
|
||||
# GH#24206
|
||||
|
||||
index = MultiIndex(
|
||||
[CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes
|
||||
)
|
||||
data = {"col": range(len(index))}
|
||||
df = DataFrame(data=data, index=index)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"level_0": Categorical.from_codes(codes[0], categories=["A", "B"]),
|
||||
"level_1": Categorical.from_codes(codes[1], categories=["a", "b"]),
|
||||
"col": range(4),
|
||||
}
|
||||
)
|
||||
|
||||
res = df.reset_index()
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
# roundtrip
|
||||
res = expected.set_index(["level_0", "level_1"]).reset_index()
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array, dtype",
|
||||
[
|
||||
(["a", "b"], object),
|
||||
(
|
||||
pd.period_range("12-1-2000", periods=2, freq="Q-DEC"),
|
||||
pd.PeriodDtype(freq="Q-DEC"),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_reset_index_dtypes_on_empty_frame_with_multiindex(
|
||||
array, dtype, using_infer_string
|
||||
):
|
||||
# GH 19602 - Preserve dtype on empty DataFrame with MultiIndex
|
||||
idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array])
|
||||
result = DataFrame(index=idx)[:0].reset_index().dtypes
|
||||
if using_infer_string and dtype == object:
|
||||
dtype = "string"
|
||||
expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_reset_index_empty_frame_with_datetime64_multiindex():
|
||||
# https://github.com/pandas-dev/pandas/issues/35606
|
||||
dti = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]")
|
||||
idx = MultiIndex.from_product([dti, [3, 4]], names=["a", "b"])[:0]
|
||||
df = DataFrame(index=idx, columns=["c", "d"])
|
||||
result = df.reset_index()
|
||||
expected = DataFrame(
|
||||
columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1)
|
||||
)
|
||||
expected["a"] = expected["a"].astype("datetime64[ns]")
|
||||
expected["b"] = expected["b"].astype("int64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(
|
||||
using_infer_string,
|
||||
):
|
||||
# https://github.com/pandas-dev/pandas/issues/35657
|
||||
dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]")
|
||||
df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti})
|
||||
df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum()
|
||||
result = df.reset_index()
|
||||
expected = DataFrame(
|
||||
columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1)
|
||||
)
|
||||
expected["c3"] = expected["c3"].astype("datetime64[ns]")
|
||||
expected["c1"] = expected["c1"].astype("float64")
|
||||
if using_infer_string:
|
||||
expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_reset_index_multiindex_nat():
    """GH 11479: resetting one level keeps a NaT in the remaining datetime level."""
    letters = list("abc")
    frame = DataFrame(
        {
            "id": range(3),
            "tstamp": date_range("2015-07-01", freq="D", periods=3),
            "a": letters,
        }
    )
    # Blank out the last timestamp so the index level contains a missing value.
    frame.loc[2, "tstamp"] = pd.NaT

    outcome = frame.set_index(["id", "tstamp"]).reset_index("id")

    remaining_level = pd.DatetimeIndex(
        ["2015-07-01", "2015-07-02", "NaT"], dtype="M8[ns]", name="tstamp"
    )
    wanted = DataFrame({"id": range(3), "a": list("abc")}, index=remaining_level)
    tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
def test_reset_index_interval_columns_object_cast():
    """GH 19136: reset_index puts a string label in front of interval column labels."""
    interval_columns = cut([1, 2], [0, 1, 2])
    year_index = Index([1, 2], name="Year")
    frame = DataFrame(np.eye(2), index=year_index, columns=interval_columns)

    # Expected labels mix the index name with the two Interval objects.
    mixed_labels = Index(["Year", Interval(0, 1), Interval(1, 2)])
    wanted = DataFrame([[1, 1.0, 0.0], [2, 0.0, 1.0]], columns=mixed_labels)

    tm.assert_frame_equal(frame.reset_index(), wanted)
|
||||
|
||||
|
||||
def test_reset_index_rename(float_frame):
|
||||
# GH 6878
|
||||
result = float_frame.reset_index(names="new_name")
|
||||
expected = Series(float_frame.index.values, name="new_name")
|
||||
tm.assert_series_equal(result["new_name"], expected)
|
||||
|
||||
result = float_frame.reset_index(names=123)
|
||||
expected = Series(float_frame.index.values, name=123)
|
||||
tm.assert_series_equal(result[123], expected)
|
||||
|
||||
|
||||
def test_reset_index_rename_multiindex(float_frame):
|
||||
# GH 6878
|
||||
stacked_df = float_frame.stack(future_stack=True)[::2]
|
||||
stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df})
|
||||
|
||||
names = ["first", "second"]
|
||||
stacked_df.index.names = names
|
||||
|
||||
result = stacked_df.reset_index()
|
||||
expected = stacked_df.reset_index(names=["new_first", "new_second"])
|
||||
tm.assert_series_equal(result["first"], expected["new_first"], check_names=False)
|
||||
tm.assert_series_equal(result["second"], expected["new_second"], check_names=False)
|
||||
|
||||
|
||||
def test_errorreset_index_rename(float_frame):
|
||||
# GH 6878
|
||||
stacked_df = float_frame.stack(future_stack=True)[::2]
|
||||
stacked_df = DataFrame({"first": stacked_df, "second": stacked_df})
|
||||
|
||||
with pytest.raises(
|
||||
ValueError, match="Index names must be str or 1-dimensional list"
|
||||
):
|
||||
stacked_df.reset_index(names={"first": "new_first", "second": "new_second"})
|
||||
|
||||
with pytest.raises(IndexError, match="list index out of range"):
|
||||
stacked_df.reset_index(names=["new_first"])
|
||||
|
||||
|
||||
def test_reset_index_false_index_name():
    """Call reset_index on a Series and a DataFrame whose index is named ``False``.

    The DataFrame case is GH 38147.
    """
    # NOTE(review): the value returned by reset_index() is discarded, so the
    # comparisons below only look at the object it was called on -- confirm that
    # a "does not raise" smoke test is all that is intended here.
    checks = (
        (Series, tm.assert_series_equal),
        (DataFrame, tm.assert_frame_equal),
    )
    for constructor, assert_same in checks:
        subject = constructor(data=range(5, 10), index=range(5))
        subject.index.name = False
        subject.reset_index()

        untouched = constructor(range(5, 10), RangeIndex(range(5), name=False))
        assert_same(subject, untouched)
|
@ -0,0 +1,225 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameRound:
    """Tests for ``DataFrame.round`` and the ``np.round`` / ``round()`` entry points."""

    def test_round(self):
        """Exercise the accepted and rejected forms of the ``decimals`` argument."""
        # GH#2665

        # Test that rounding an empty DataFrame does nothing
        df = DataFrame()
        tm.assert_frame_equal(df, df.round())

        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(df.round(), expected_rounded)

        # Round with an integer
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # This should also work with np.round (since np.round dispatches to
        # df.round)
        tm.assert_frame_equal(np.round(df, decimals), expected_rounded)

        # Round with a list
        round_list = [1, 2]
        msg = "decimals must be an integer, a dict-like or a Series"
        with pytest.raises(TypeError, match=msg):
            df.round(round_list)

        # Round with a dictionary
        expected_rounded = DataFrame(
            {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]}
        )
        round_dict = {"col1": 1, "col2": 2}
        tm.assert_frame_equal(df.round(round_dict), expected_rounded)

        # Incomplete dict
        expected_partially_rounded = DataFrame(
            {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]}
        )
        partial_round_dict = {"col2": 1}
        tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded)

        # Dict with unknown elements
        wrong_round_dict = {"col3": 2, "col2": 1}
        tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded)

        # float input to `decimals`
        non_int_round_dict = {"col1": 1, "col2": 0.5}
        msg = "Values in decimals must be integers"
        with pytest.raises(TypeError, match=msg):
            df.round(non_int_round_dict)

        # String and list inputs are rejected both as a dict and as a Series.
        # (Each case is checked once; the earlier copy-pasted repeats of the
        # Series-with-list check added no coverage.)
        for bad_value in ("foo", [1, 2]):
            non_int_round_dict = {"col1": 1, "col2": bad_value}
            with pytest.raises(TypeError, match=msg):
                df.round(non_int_round_dict)

            non_int_round_Series = Series(non_int_round_dict)
            with pytest.raises(TypeError, match=msg):
                df.round(non_int_round_Series)

        # Negative numbers
        negative_round_dict = {"col1": -1, "col2": -2}
        big_df = df * 100
        expected_neg_rounded = DataFrame(
            {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]}
        )
        tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded)

        # nan in Series round
        nan_round_Series = Series({"col1": np.nan, "col2": 1})

        with pytest.raises(TypeError, match=msg):
            df.round(nan_round_Series)

        # Make sure this doesn't break existing Series.round
        # (expected_rounded is still the dict-rounded frame: col1 at 1 decimal)
        tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"])

        # named columns
        # GH#11986
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        df.columns.name = "cols"
        expected_rounded.columns.name = "cols"
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # interaction of named columns & series
        tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"])
        tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"])

    def test_round_numpy(self):
        """``np.round`` works on a frame but rejects the ``out`` argument."""
        # GH#12600
        df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
        out = np.round(df, decimals=0)
        expected = DataFrame([[2.0, 1.0], [0.0, 7.0]])
        tm.assert_frame_equal(out, expected)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.round(df, decimals=0, out=df)

    def test_round_numpy_with_nan(self):
        """Rounding a frame that contains NaN emits no warning and keeps the NaN."""
        # See GH#14197
        df = Series([1.53, np.nan, 0.06]).to_frame()
        with tm.assert_produces_warning(None):
            result = df.round()
        expected = Series([2.0, np.nan, 0.0]).to_frame()
        tm.assert_frame_equal(result, expected)

    def test_round_mixed_type(self):
        """Only the float column changes when string/datetime columns are present."""
        # GH#11885
        df = DataFrame(
            {
                "col1": [1.1, 2.2, 3.3, 4.4],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        round_0 = DataFrame(
            {
                "col1": [1.0, 2.0, 3.0, 4.0],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        tm.assert_frame_equal(df.round(), round_0)
        tm.assert_frame_equal(df.round(1), df)
        tm.assert_frame_equal(df.round({"col1": 1}), df)
        tm.assert_frame_equal(df.round({"col1": 0}), round_0)
        tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0)
        tm.assert_frame_equal(df.round({"col3": 1}), df)

    def test_round_with_duplicate_columns(self):
        """Duplicate column labels round fine; a duplicated decimals index raises."""
        # GH#11611

        df = DataFrame(
            np.random.default_rng(2).random([3, 3]),
            columns=["A", "B", "C"],
            index=["first", "second", "third"],
        )

        dfs = pd.concat((df, df), axis=1)
        rounded = dfs.round()
        tm.assert_index_equal(rounded.index, dfs.index)

        decimals = Series([1, 0, 2], index=["A", "B", "A"])
        msg = "Index of decimals must be unique"
        with pytest.raises(ValueError, match=msg):
            df.round(decimals)

    def test_round_builtin(self):
        """The builtin ``round()`` gives the same result as rounding to 0 decimals."""
        # GH#11763
        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(round(df), expected_rounded)

    def test_round_nonunique_categorical(self):
        """Rounding keeps the shape when the index is a non-unique CategoricalIndex."""
        # See GH#21809
        idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3)
        df = DataFrame(np.random.default_rng(2).random((6, 3)), columns=list("abc"))

        expected = df.round(3)
        expected.index = idx

        df_categorical = df.copy().set_index(idx)
        assert df_categorical.shape == (6, 3)
        result = df_categorical.round(3)
        assert result.shape == (6, 3)

        tm.assert_frame_equal(result, expected)

    def test_round_interval_category_columns(self):
        """Rounding works when the columns are a CategoricalIndex of intervals."""
        # GH#30063
        columns = pd.CategoricalIndex(pd.interval_range(0, 2))
        df = DataFrame([[0.66, 1.1], [0.3, 0.25]], columns=columns)

        result = df.round()
        expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns)
        tm.assert_frame_equal(result, expected)

    def test_round_empty_not_input(self):
        """Rounding an empty frame returns an equal but distinct object."""
        # GH#51032
        df = DataFrame()
        result = df.round()
        tm.assert_frame_equal(df, result)
        assert df is not result
|
@ -0,0 +1,372 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
import pandas.core.common as com
|
||||
|
||||
|
||||
class TestSample:
|
||||
@pytest.fixture
|
||||
def obj(self, frame_or_series):
|
||||
if frame_or_series is Series:
|
||||
arr = np.random.default_rng(2).standard_normal(10)
|
||||
else:
|
||||
arr = np.random.default_rng(2).standard_normal((10, 10))
|
||||
return frame_or_series(arr, dtype=None)
|
||||
|
||||
@pytest.mark.parametrize("test", list(range(10)))
|
||||
def test_sample(self, test, obj):
|
||||
# Fixes issue: 2419
|
||||
# Check behavior of random_state argument
|
||||
# Check for stability when receives seed or random state -- run 10
|
||||
# times.
|
||||
|
||||
seed = np.random.default_rng(2).integers(0, 100)
|
||||
tm.assert_equal(
|
||||
obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
|
||||
)
|
||||
|
||||
tm.assert_equal(
|
||||
obj.sample(frac=0.7, random_state=seed),
|
||||
obj.sample(frac=0.7, random_state=seed),
|
||||
)
|
||||
|
||||
tm.assert_equal(
|
||||
obj.sample(n=4, random_state=np.random.default_rng(test)),
|
||||
obj.sample(n=4, random_state=np.random.default_rng(test)),
|
||||
)
|
||||
|
||||
tm.assert_equal(
|
||||
obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
|
||||
obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
|
||||
)
|
||||
|
||||
tm.assert_equal(
|
||||
obj.sample(
|
||||
frac=2,
|
||||
replace=True,
|
||||
random_state=np.random.default_rng(test),
|
||||
),
|
||||
obj.sample(
|
||||
frac=2,
|
||||
replace=True,
|
||||
random_state=np.random.default_rng(test),
|
||||
),
|
||||
)
|
||||
|
||||
os1, os2 = [], []
|
||||
for _ in range(2):
|
||||
os1.append(obj.sample(n=4, random_state=test))
|
||||
os2.append(obj.sample(frac=0.7, random_state=test))
|
||||
tm.assert_equal(*os1)
|
||||
tm.assert_equal(*os2)
|
||||
|
||||
def test_sample_lengths(self, obj):
|
||||
# Check lengths are right
|
||||
assert len(obj.sample(n=4) == 4)
|
||||
assert len(obj.sample(frac=0.34) == 3)
|
||||
assert len(obj.sample(frac=0.36) == 4)
|
||||
|
||||
def test_sample_invalid_random_state(self, obj):
|
||||
# Check for error when random_state argument invalid.
|
||||
msg = (
|
||||
"random_state must be an integer, array-like, a BitGenerator, Generator, "
|
||||
"a numpy RandomState, or None"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(random_state="a_string")
|
||||
|
||||
def test_sample_wont_accept_n_and_frac(self, obj):
|
||||
# Giving both frac and N throws error
|
||||
msg = "Please enter a value for `frac` OR `n`, not both"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(n=3, frac=0.3)
|
||||
|
||||
def test_sample_requires_positive_n_frac(self, obj):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="A negative number of rows requested. Please provide `n` >= 0",
|
||||
):
|
||||
obj.sample(n=-3)
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="A negative number of rows requested. Please provide `frac` >= 0",
|
||||
):
|
||||
obj.sample(frac=-0.3)
|
||||
|
||||
def test_sample_requires_integer_n(self, obj):
|
||||
# Make sure float values of `n` give error
|
||||
with pytest.raises(ValueError, match="Only integers accepted as `n` values"):
|
||||
obj.sample(n=3.2)
|
||||
|
||||
def test_sample_invalid_weight_lengths(self, obj):
|
||||
# Weight length must be right
|
||||
msg = "Weights and axis to be sampled must be of same length"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(n=3, weights=[0, 1])
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
bad_weights = [0.5] * 11
|
||||
obj.sample(n=3, weights=bad_weights)
|
||||
|
||||
with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"):
|
||||
bad_weight_series = Series([0, 0, 0.2])
|
||||
obj.sample(n=4, weights=bad_weight_series)
|
||||
|
||||
def test_sample_negative_weights(self, obj):
|
||||
# Check won't accept negative weights
|
||||
bad_weights = [-0.1] * 10
|
||||
msg = "weight vector many not include negative values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(n=3, weights=bad_weights)
|
||||
|
||||
def test_sample_inf_weights(self, obj):
|
||||
# Check inf and -inf throw errors:
|
||||
|
||||
weights_with_inf = [0.1] * 10
|
||||
weights_with_inf[0] = np.inf
|
||||
msg = "weight vector may not include `inf` values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(n=3, weights=weights_with_inf)
|
||||
|
||||
weights_with_ninf = [0.1] * 10
|
||||
weights_with_ninf[0] = -np.inf
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(n=3, weights=weights_with_ninf)
|
||||
|
||||
def test_sample_zero_weights(self, obj):
|
||||
# All zeros raises errors
|
||||
|
||||
zero_weights = [0] * 10
|
||||
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
|
||||
obj.sample(n=3, weights=zero_weights)
|
||||
|
||||
def test_sample_missing_weights(self, obj):
|
||||
# All missing weights
|
||||
|
||||
nan_weights = [np.nan] * 10
|
||||
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
|
||||
obj.sample(n=3, weights=nan_weights)
|
||||
|
||||
def test_sample_none_weights(self, obj):
|
||||
# Check None are also replaced by zeros.
|
||||
weights_with_None = [None] * 10
|
||||
weights_with_None[5] = 0.5
|
||||
tm.assert_equal(
|
||||
obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"func_str,arg",
|
||||
[
|
||||
("np.array", [2, 3, 1, 0]),
|
||||
("np.random.MT19937", 3),
|
||||
("np.random.PCG64", 11),
|
||||
],
|
||||
)
|
||||
def test_sample_random_state(self, func_str, arg, frame_or_series):
|
||||
# GH#32503
|
||||
obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
result = obj.sample(n=3, random_state=eval(func_str)(arg))
|
||||
expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_sample_generator(self, frame_or_series):
|
||||
# GH#38100
|
||||
obj = frame_or_series(np.arange(100))
|
||||
rng = np.random.default_rng(2)
|
||||
|
||||
# Consecutive calls should advance the seed
|
||||
result1 = obj.sample(n=50, random_state=rng)
|
||||
result2 = obj.sample(n=50, random_state=rng)
|
||||
assert not (result1.index.values == result2.index.values).all()
|
||||
|
||||
# Matching generator initialization must give same result
|
||||
# Consecutive calls should advance the seed
|
||||
result1 = obj.sample(n=50, random_state=np.random.default_rng(11))
|
||||
result2 = obj.sample(n=50, random_state=np.random.default_rng(11))
|
||||
tm.assert_equal(result1, result2)
|
||||
|
||||
def test_sample_upsampling_without_replacement(self, frame_or_series):
|
||||
# GH#27451
|
||||
|
||||
obj = DataFrame({"A": list("abc")})
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = (
|
||||
"Replace has to be set to `True` when "
|
||||
"upsampling the population `frac` > 1."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.sample(frac=2, replace=False)
|
||||
|
||||
|
||||
class TestSampleDataFrame:
    # ``sample`` tests that only make sense for DataFrame input, so they
    # are not parametrized over Series/DataFrame.

    def test_sample(self):
        # GH#2419
        # Object-specific checks: string weights, re-normalization and the
        # ``axis`` argument.

        # Degenerate weights stored in a column: only position 5 is non-zero.
        one_hot = [int(position == 5) for position in range(10)]
        df = DataFrame(
            {
                "col1": range(10, 20),
                "col2": range(20, 30),
                "colString": ["a"] * 10,
                "easyweights": one_hot,
            }
        )
        by_column_name = df.sample(n=1, weights="easyweights")
        tm.assert_frame_equal(by_column_name, df.iloc[5:6])

        # A string weight is an error for a Series ...
        series_msg = "Strings cannot be passed as weights when sampling from a Series."
        with pytest.raises(ValueError, match=series_msg):
            Series(range(10)).sample(n=3, weights="weight_column")

        # ... and for a DataFrame sampled along axis=1.
        columns_msg = (
            "Strings can only be passed to weights "
            "when sampling from rows on a DataFrame"
        )
        with pytest.raises(ValueError, match=columns_msg):
            df.sample(n=1, weights="weight_column", axis=1)

        # A string that is not a column name raises KeyError.
        missing_msg = "'String passed to weights not a valid column'"
        with pytest.raises(KeyError, match=missing_msg):
            df.sample(n=3, weights="not_a_real_column_name")

        # Weights that do not sum to one: only position 0 is non-zero.
        partial_weights = [0.5] + [0] * 9
        tm.assert_frame_equal(df.sample(n=1, weights=partial_weights), df.iloc[:1])

        # ---- axis argument ----
        df = DataFrame({"col1": range(10), "col2": ["a"] * 10})

        # Column sampling, with the axis spelled as an int and as a name.
        only_second_column = [0, 1]
        for column_axis in (1, "columns"):
            tm.assert_frame_equal(
                df.sample(n=1, axis=column_axis, weights=only_second_column),
                df[["col2"]],
            )

        # Row sampling, with both string spellings of the axis.
        only_sixth_row = [0] * 5 + [0.5] + [0] * 4
        for row_axis in ("rows", "index"):
            tm.assert_frame_equal(
                df.sample(n=1, axis=row_axis, weights=only_sixth_row), df.iloc[5:6]
            )

        # Axis values that the object does not have.
        bad_frame_axes = [
            (2, "No axis named 2 for object type DataFrame"),
            ("not_a_name", "No axis named not_a_name for object type DataFrame"),
        ]
        for bad_axis, axis_msg in bad_frame_axes:
            with pytest.raises(ValueError, match=axis_msg):
                df.sample(n=1, axis=bad_axis)

        with pytest.raises(ValueError, match="No axis named 1 for object type Series"):
            Series(range(10)).sample(n=1, axis=1)

        # Ten weights for a two-column frame sampled along axis=1.
        length_msg = "Weights and axis to be sampled must be of same length"
        with pytest.raises(ValueError, match=length_msg):
            df.sample(n=1, axis=1, weights=[0.5] * 10)

    def test_sample_axis1(self):
        # Weights along axis=1: only the third column has non-zero weight.
        df = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        last_column_only = [0, 0, 1]
        picked = df.sample(n=1, axis=1, weights=last_column_only)
        tm.assert_frame_equal(picked, df[["colString"]])

        # Omitting ``axis`` must match axis=0 for the same seed.
        default_axis = df.sample(n=3, random_state=42)
        explicit_axis = df.sample(n=3, axis=0, random_state=42)
        tm.assert_frame_equal(default_axis, explicit_axis)

    def test_sample_aligns_weights_with_frame(self):
        # Series weights are matched to the frame by index label.
        df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3])
        expected = df.loc[[3]]

        aligned_weights = [
            # same labels as the frame, in a different order
            Series([1, 0, 0], index=[3, 5, 9]),
            # label 10 is absent from the frame and carries a huge weight
            Series([0.001, 0, 10000], index=[3, 5, 10]),
            # label 9 of the frame has no weight at all
            Series([0.01, 0], index=[3, 5]),
        ]
        for weights in aligned_weights:
            tm.assert_frame_equal(expected, df.sample(1, weights=weights))

        # No label shared between the weights and the frame.
        disjoint = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
            df.sample(1, weights=disjoint)

    def test_sample_is_copy(self):
        # GH#27357, GH#30784: adding a column to the sampled frame must not
        # emit any warning (e.g. SettingWithCopy).
        values = np.random.default_rng(2).standard_normal((10, 3))
        parent = DataFrame(values, columns=["a", "b", "c"])
        sampled = parent.sample(3)

        with tm.assert_produces_warning(None):
            sampled["d"] = 1

    def test_sample_does_not_modify_weights(self):
        # GH-42843
        # A numpy weight array containing NaN must be left untouched.
        array_weights = np.array([np.nan, 1, np.nan])
        array_before = array_weights.copy()
        Series([1, 2, 3]).sample(weights=array_weights)
        tm.assert_numpy_array_equal(array_weights, array_before)

        # The same holds for a weight column looked up by name.
        df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
        column_before = df["weights"].copy()
        df.sample(frac=1.0, replace=True, weights="weights")
        tm.assert_series_equal(df["weights"], column_before)

    def test_sample_ignore_index(self):
        # GH 38581
        # ignore_index=True must relabel the result 0..n-1.
        df = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        sampled = df.sample(3, ignore_index=True)
        tm.assert_index_equal(sampled.index, Index(range(3)), exact=True)
|
@ -0,0 +1,469 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
|
||||
class DummyDtype(ExtensionDtype):
    # Minimal ExtensionDtype whose ``_is_numeric`` answer is chosen by the
    # caller at construction time.

    type = int

    def __init__(self, numeric) -> None:
        # Value handed back unchanged by ``_is_numeric``.
        self._numeric = numeric

    @property
    def _is_numeric(self):
        return self._numeric

    @property
    def name(self):
        return "Dummy"
|
||||
|
||||
|
||||
class DummyArray(ExtensionArray):
    # Bare-bones ExtensionArray stub: it stores ``data`` and reports
    # whatever ``dtype`` it was constructed with.

    def __init__(self, data, dtype) -> None:
        self._dtype = dtype
        self.data = data

    @property
    def dtype(self):
        return self._dtype

    def __len__(self) -> int:
        return len(self.data)

    def __array__(self, dtype=None, copy=None):
        # Both arguments are ignored; the stored data is handed back as-is.
        return self.data

    def __getitem__(self, item):
        # No indexing is implemented; every lookup evaluates to None.
        return None

    def copy(self):
        # Returns the very same object rather than a new one.
        return self
|
||||
|
||||
|
||||
class TestSelectDtypes:
    # Tests for ``DataFrame.select_dtypes``.

    @staticmethod
    def _wide_frame():
        # Fresh eleven-column frame (``a``..``k``) with one column each of:
        # str, int, uint8, float, bool, categorical, naive datetime, two
        # tz-aware datetimes, period and timedelta.
        return DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

    @staticmethod
    def _narrow_frame(*, second_str_col=False):
        # Fresh frame with columns ``a``..``f`` (str, int, uint8, float,
        # bool, datetime64 values). ``second_str_col=True`` inserts a second
        # str column ``g`` directly after ``a``.
        data = {"a": list("abc")}
        if second_str_col:
            data["g"] = list("abc")
        data.update(
            {
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        return DataFrame(data)

    def test_select_dtypes_include_using_list_like(self):
        df = self._wide_frame()

        # (keyword arguments, columns expected in the result)
        cases = [
            ({"include": [np.number]}, ["b", "c", "d", "k"]),
            ({"include": [np.number], "exclude": ["timedelta"]}, ["b", "c", "d"]),
            (
                {"include": [np.number, "category"], "exclude": ["timedelta"]},
                ["b", "c", "d", "f"],
            ),
            ({"include": ["datetime"]}, ["g"]),
            ({"include": ["datetime64"]}, ["g"]),
            ({"include": ["datetimetz"]}, ["h", "i"]),
        ]
        for kwargs, expected_cols in cases:
            selected = df.select_dtypes(**kwargs)
            tm.assert_frame_equal(selected, df[expected_cols])

        # "period" raises NotImplementedError with an empty message.
        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include=["period"])

    def test_select_dtypes_exclude_using_list_like(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
            }
        )
        non_numeric = df.select_dtypes(exclude=[np.number])
        tm.assert_frame_equal(non_numeric, df[["a", "e"]])

    def test_select_dtypes_exclude_include_using_list_like(self):
        df = self._narrow_frame()

        # (include, exclude, columns expected in the result)
        cases = [
            ((np.bool_, "integer"), (np.datetime64,), ["b", "c", "e"]),
            (("bool", "int64", "int32"), ("datetime",), ["b", "e"]),
        ]
        for include, exclude, expected_cols in cases:
            selected = df.select_dtypes(include=include, exclude=exclude)
            tm.assert_frame_equal(selected, df[expected_cols])

    @pytest.mark.parametrize(
        "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)]
    )
    def test_select_dtypes_exclude_include_int(self, include):
        # Fix select_dtypes(include='int') for Windows, FYI #36596
        columns = {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6, dtype="int32"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("now", periods=3).values,
        }
        df = DataFrame(columns)
        selected = df.select_dtypes(include=include, exclude=(np.datetime64,))
        tm.assert_frame_equal(selected, df[["b", "c", "e"]])

    def test_select_dtypes_include_using_scalars(self):
        df = self._wide_frame()

        # (scalar ``include``, columns expected in the result)
        cases = [
            (np.number, ["b", "c", "d", "k"]),
            ("datetime", ["g"]),
            ("datetime64", ["g"]),
            ("category", ["f"]),
        ]
        for include, expected_cols in cases:
            selected = df.select_dtypes(include=include)
            tm.assert_frame_equal(selected, df[expected_cols])

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include="period")

    def test_select_dtypes_exclude_using_scalars(self):
        df = self._wide_frame()

        # (scalar ``exclude``, columns expected in the result)
        cases = [
            (np.number, ["a", "e", "f", "g", "h", "i", "j"]),
            ("category", ["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]),
        ]
        for exclude, expected_cols in cases:
            selected = df.select_dtypes(exclude=exclude)
            tm.assert_frame_equal(selected, df[expected_cols])

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(exclude="period")

    def test_select_dtypes_include_exclude_using_scalars(self):
        df = self._wide_frame()
        selected = df.select_dtypes(include=np.number, exclude="floating")
        tm.assert_frame_equal(selected, df[["b", "c", "k"]])

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        df = self._wide_frame()

        # Scalar include with list exclude.
        selected = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"])
        tm.assert_frame_equal(selected, df[["b", "c"]])

        # List include with scalar exclude.
        selected = df.select_dtypes(include=[np.number, "category"], exclude="floating")
        tm.assert_frame_equal(selected, df[["b", "c", "f", "k"]])

    def test_select_dtypes_duplicate_columns(self):
        # GH20839
        df = self._narrow_frame()
        # Relabel so the int and uint8 columns end up as the second "a"
        # and the first "b".
        df.columns = ["a", "a", "b", "b", "b", "c"]

        selected = df.select_dtypes(include=[np.number], exclude=["floating"])

        expected = DataFrame(
            {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
        )
        tm.assert_frame_equal(selected, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string):
        df = self._narrow_frame()
        df["g"] = df.f.diff()
        assert not hasattr(np, "u8")

        # With ``using_infer_string`` the str column "a" is not expected
        # under "O".
        leading_cols = [] if using_infer_string else ["a"]

        selected = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
        tm.assert_frame_equal(selected, df[leading_cols + ["b"]])

        selected = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
        tm.assert_frame_equal(selected, df[leading_cols + ["b", "g"]])

    def test_select_dtypes_empty(self):
        # Calling with neither include nor exclude is an error.
        df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
        expected_msg = "at least one of include or exclude must be nonempty"
        with pytest.raises(ValueError, match=expected_msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        df = self._narrow_frame()
        too_specific = ".+ is too specific"

        with pytest.raises(ValueError, match=too_specific):
            df.select_dtypes(include=["datetime64[D]"])

        with pytest.raises(ValueError, match=too_specific):
            df.select_dtypes(exclude=["datetime64[as]"])

    def test_select_dtypes_datetime_with_tz(self):
        aware = DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        )
        combined = pd.concat([aware.A.to_frame(), aware.B.to_frame()], axis=1)

        # Selecting "datetime64[ns]" must yield no columns here.
        selected = combined.select_dtypes(include=["datetime64[ns]"])
        tm.assert_frame_equal(selected, combined.reindex(columns=[]))

    @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        df = self._narrow_frame(second_str_col=True)
        expected_msg = "string dtypes are not allowed"

        with pytest.raises(TypeError, match=expected_msg):
            df.select_dtypes(**{arg: [dtype]})

    def test_select_dtypes_bad_arg_raises(self):
        df = self._narrow_frame(second_str_col=True)

        # The list holds one string that is not a dtype.
        with pytest.raises(TypeError, match="data type.*not understood"):
            df.select_dtypes(["blargy, blarg, blarg"])

    def test_select_dtypes_typecodes(self):
        # GH 11990
        df = DataFrame(np.random.default_rng(2).random((5, 3)))
        float_codes = list(np.typecodes["AllFloat"])
        tm.assert_frame_equal(df.select_dtypes(float_codes), df)

    @pytest.mark.parametrize(
        "arr,expected",
        (
            (np.array([1, 2], dtype=np.int32), True),
            (pd.array([1, 2], dtype="Int32"), True),
            (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True),
            (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False),
        ),
    )
    def test_select_dtypes_numeric(self, arr, expected):
        # GH 35340
        df = DataFrame(arr)
        kept_everything = df.select_dtypes(np.number).shape == df.shape
        assert kept_everything == expected

    def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype):
        values = pd.array(["a", "b"], dtype=nullable_string_dtype)
        df = DataFrame(values)
        kept_everything = df.select_dtypes(np.number).shape == df.shape
        assert not kept_everything

    @pytest.mark.parametrize(
        "expected, float_dtypes",
        [
            [
                DataFrame(
                    {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}
                ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}),
                float,
            ],
            [
                DataFrame(
                    {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}
                ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}),
                "float",
            ],
            [DataFrame({"C": range(10, 7, -1)}, dtype=np.float32), np.float32],
            [
                DataFrame({"A": range(3), "B": range(5, 8)}).astype(
                    dtype={"A": float, "B": np.float64}
                ),
                np.float64,
            ],
        ],
    )
    def test_select_dtypes_float_dtype(self, expected, float_dtypes):
        # GH#42452
        raw = DataFrame({"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)})
        df = raw.astype({"A": float, "B": np.float64, "C": np.float32})
        selected = df.select_dtypes(include=float_dtypes)
        tm.assert_frame_equal(selected, expected)

    def test_np_bool_ea_boolean_include_number(self):
        # GH 46870
        # Of these columns only the plain integer one must be selected by
        # include="number".
        df = DataFrame(
            {
                "a": [1, 2, 3],
                "b": pd.Series([True, False, True], dtype="boolean"),
                "c": np.array([True, False, True]),
                "d": pd.Categorical([True, False, True]),
                "e": pd.arrays.SparseArray([True, False, True]),
            }
        )
        selected = df.select_dtypes(include="number")
        tm.assert_frame_equal(selected, DataFrame({"a": [1, 2, 3]}))

    def test_select_dtypes_no_view(self):
        # https://github.com/pandas-dev/pandas/issues/48090
        # Writing into the result must leave the source frame unchanged.
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        untouched = df.copy()

        selected = df.select_dtypes(include=["number"])
        selected.iloc[0, 0] = 0

        tm.assert_frame_equal(df, untouched)
|
@ -0,0 +1,143 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class SharedSetAxisTests:
    # ``set_axis`` and index-assignment tests written against an ``obj``
    # fixture that subclasses are expected to provide.

    @pytest.fixture
    def obj(self):
        raise NotImplementedError("Implemented by subclasses")

    def test_set_axis(self, obj):
        # GH14636; this tests setting index for both Series and DataFrame
        labels = list("abcd")[: len(obj)]

        expected = obj.copy()
        expected.index = labels

        tm.assert_equal(expected, obj.set_axis(labels, axis=0))

    def test_set_axis_copy(self, obj, using_copy_on_write):
        # Test copy keyword GH#47932
        def shared_flags(left, right):
            # One shares_memory flag for a 1-D object, otherwise one flag
            # per column, compared position by position.
            if right.ndim == 1:
                return [tm.shares_memory(left, right)]
            return [
                tm.shares_memory(left.iloc[:, i], right.iloc[:, i])
                for i in range(right.shape[1])
            ]

        labels = list("abcd")[: len(obj)]
        orig = obj.iloc[:]
        expected = obj.copy()
        expected.index = labels

        # copy=True: without copy-on-write nothing may be shared.
        result = obj.set_axis(labels, axis=0, copy=True)
        tm.assert_equal(expected, result)
        assert result is not obj
        if not using_copy_on_write:
            assert not any(shared_flags(result, obj))

        # copy=False: everything must be shared.
        result = obj.set_axis(labels, axis=0, copy=False)
        tm.assert_equal(expected, result)
        assert result is not obj
        assert all(shared_flags(result, obj))

        # copy defaults to True
        result = obj.set_axis(labels, axis=0)
        tm.assert_equal(expected, result)
        assert result is not obj
        if using_copy_on_write:
            # no copy is made: at least one piece is shared
            assert any(shared_flags(result, obj))
        else:
            # a copy is made: nothing is shared
            assert not any(shared_flags(result, obj))

        # copy=False without an explicit axis, compared against the slice
        # taken before any set_axis call.
        res = obj.set_axis(labels, copy=False)
        tm.assert_equal(expected, res)
        assert all(shared_flags(res, orig))

    def test_set_axis_unnamed_kwarg_warns(self, obj):
        # omitting the "axis" parameter
        labels = list("abcd")[: len(obj)]

        expected = obj.copy()
        expected.index = labels

        tm.assert_equal(obj.set_axis(labels), expected)

    @pytest.mark.parametrize("axis", [3, "foo"])
    def test_set_axis_invalid_axis_name(self, axis, obj):
        # wrong values for the "axis" parameter
        labels = list("abc")
        with pytest.raises(ValueError, match="No axis named"):
            obj.set_axis(labels, axis=axis)

    def test_set_axis_setattr_index_not_collection(self, obj):
        # wrong type: assigning None as the index
        expected_msg = (
            r"Index\(\.\.\.\) must be called with a collection of some "
            r"kind, None was passed"
        )
        with pytest.raises(TypeError, match=expected_msg):
            obj.index = None

    def test_set_axis_setattr_index_wrong_length(self, obj):
        # wrong length: one label fewer than there are rows
        n_rows = len(obj)
        expected_msg = (
            f"Length mismatch: Expected axis has {n_rows} elements, "
            f"new values have {n_rows - 1} elements"
        )
        with pytest.raises(ValueError, match=expected_msg):
            obj.index = np.arange(n_rows - 1)

        if obj.ndim != 2:
            return

        # For 2-D objects, assigning every other column label must fail too.
        with pytest.raises(ValueError, match="Length mismatch"):
            obj.columns = obj.columns[::2]
|
||||
|
||||
|
||||
class TestDataFrameSetAxis(SharedSetAxisTests):
    # Runs the inherited tests against a DataFrame.

    @pytest.fixture
    def obj(self):
        # 3x3 float frame indexed by the integers 2010..2012.
        columns = {
            "A": [1.1, 2.2, 3.3],
            "B": [5.0, 6.1, 7.2],
            "C": [4.4, 5.5, 6.6],
        }
        return DataFrame(columns, index=[2010, 2011, 2012])
|
||||
|
||||
|
||||
class TestSeriesSetAxis(SharedSetAxisTests):
    # Runs the inherited tests against a Series.

    @pytest.fixture
    def obj(self):
        # Four int64 values 0..3 labelled 1, 3, 5, 7.
        values = np.arange(4)
        return Series(values, index=[1, 3, 5, 7], dtype="int64")
|
@ -0,0 +1,734 @@
|
||||
"""
|
||||
See also: test_reindex.py:TestReindexSetIndex
|
||||
"""
|
||||
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def frame_of_index_cols():
    """
    Fixture for DataFrame of columns that can be used for indexing

    Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')].
    'A' & 'B' contain duplicates (but are jointly unique); 'C' holds five
    distinct letters. 'D', 'E' and the tuple-labelled column are each five
    standard-normal draws from a separate generator seeded with 2, so the
    three columns hold the same values.

    Layout (numeric values elided):

         A      B  C    D    E  (tuple, as, label)
    0  foo    one  a  ...  ...                 ...
    1  foo    two  b  ...  ...                 ...
    2  foo  three  c  ...  ...                 ...
    3  bar    one  d  ...  ...                 ...
    4  bar    two  e  ...  ...                 ...
    """

    def draw():
        # New generator on every call, always seeded with 2.
        return np.random.default_rng(2).standard_normal(5)

    columns = {
        "A": ["foo"] * 3 + ["bar"] * 2,
        "B": ["one", "two", "three", "one", "two"],
        "C": list("abcde"),
        "D": draw(),
        "E": draw(),
        ("tuple", "as", "label"): draw(),
    }
    return DataFrame(columns)
|
||||
|
||||
|
||||
class TestSetIndex:
|
||||
def test_set_index_multiindex(self):
|
||||
# segfault in GH#3308
|
||||
d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
|
||||
df = DataFrame(d)
|
||||
tuples = [(0, 1), (0, 2), (1, 2)]
|
||||
df["tuples"] = tuples
|
||||
|
||||
index = MultiIndex.from_tuples(df["tuples"])
|
||||
# it works!
|
||||
df.set_index(index)
|
||||
|
||||
def test_set_index_empty_column(self):
|
||||
# GH#1971
|
||||
df = DataFrame(
|
||||
[
|
||||
{"a": 1, "p": 0},
|
||||
{"a": 2, "m": 10},
|
||||
{"a": 3, "m": 11, "p": 20},
|
||||
{"a": 4, "m": 12, "p": 21},
|
||||
],
|
||||
columns=["a", "m", "p", "x"],
|
||||
)
|
||||
|
||||
result = df.set_index(["a", "x"])
|
||||
|
||||
expected = df[["m", "p"]]
|
||||
expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_set_index_empty_dataframe(self):
|
||||
# GH#38419
|
||||
df1 = DataFrame(
|
||||
{"a": Series(dtype="datetime64[ns]"), "b": Series(dtype="int64"), "c": []}
|
||||
)
|
||||
|
||||
df2 = df1.set_index(["a", "b"])
|
||||
result = df2.index.to_frame().dtypes
|
||||
expected = df1[["a", "b"]].dtypes
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_set_index_multiindexcolumns(self):
|
||||
columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((3, 3)), columns=columns
|
||||
)
|
||||
|
||||
result = df.set_index(df.columns[0])
|
||||
|
||||
expected = df.iloc[:, 1:]
|
||||
expected.index = df.iloc[:, 0].values
|
||||
expected.index.names = [df.columns[0]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_set_index_timezone(self):
|
||||
# GH#12358
|
||||
# tz-aware Series should retain the tz
|
||||
idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")
|
||||
df = DataFrame({"A": idx})
|
||||
assert df.set_index(idx).index[0].hour == 11
|
||||
assert DatetimeIndex(Series(df.A))[0].hour == 11
|
||||
assert df.set_index(df.A).index[0].hour == 11
|
||||
|
||||
def test_set_index_cast_datetimeindex(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
|
||||
"B": np.random.default_rng(2).standard_normal(1000),
|
||||
}
|
||||
)
|
||||
|
||||
idf = df.set_index("A")
|
||||
assert isinstance(idf.index, DatetimeIndex)
|
||||
|
||||
def test_set_index_dst(self):
|
||||
di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific")
|
||||
|
||||
df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()
|
||||
# single level
|
||||
res = df.set_index("index")
|
||||
exp = DataFrame(
|
||||
data={"a": [0, 1, 2], "b": [3, 4, 5]},
|
||||
index=Index(di, name="index"),
|
||||
)
|
||||
exp.index = exp.index._with_freq(None)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# GH#12920
|
||||
res = df.set_index(["index", "a"])
|
||||
exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])
|
||||
exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_set_index(self, float_string_frame):
    """An Index of matching length is accepted; a shorter one raises."""
    reversed_idx = Index(np.arange(len(float_string_frame))[::-1])

    frame = float_string_frame.set_index(reversed_idx)
    tm.assert_index_equal(frame.index, reversed_idx)

    # every second label only -> fewer labels than rows
    too_short = reversed_idx[::2]
    with pytest.raises(ValueError, match="Length mismatch"):
        frame.set_index(too_short)
|
||||
|
||||
def test_set_index_names(self):
|
||||
df = DataFrame(
|
||||
np.ones((10, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(10)], dtype=object),
|
||||
)
|
||||
df.index.name = "name"
|
||||
|
||||
assert df.set_index(df.index).index.names == ["name"]
|
||||
|
||||
mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])
|
||||
mi2 = MultiIndex.from_arrays(
|
||||
df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]
|
||||
)
|
||||
|
||||
df = df.set_index(["A", "B"])
|
||||
|
||||
assert df.set_index(df.index).index.names == ["A", "B"]
|
||||
|
||||
# Check that set_index isn't converting a MultiIndex into an Index
|
||||
assert isinstance(df.set_index(df.index).index, MultiIndex)
|
||||
|
||||
# Check actual equality
|
||||
tm.assert_index_equal(df.set_index(df.index).index, mi)
|
||||
|
||||
idx2 = df.index.rename(["C", "D"])
|
||||
|
||||
# Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
|
||||
# than a pair of tuples
|
||||
assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)
|
||||
|
||||
# Check equality
|
||||
tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
|
||||
|
||||
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):
    """set_index honours ``drop`` and ``inplace`` for label and list keys.

    The expected frame is assembled by hand (drop the key columns if
    requested, then assign the index) and compared with the result of
    ``set_index`` called either in place on a copy or out of place.
    """
    df = frame_of_index_cols

    if isinstance(keys, list):
        idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
    else:
        idx = Index(df[keys], name=keys)
    # copy in the drop=False case: otherwise ``expected`` would alias ``df``
    # and assigning its index below would mutate the very frame that
    # set_index is exercised on
    expected = df.drop(keys, axis=1) if drop else df.copy()
    expected.index = idx

    if inplace:
        result = df.copy()
        return_value = result.set_index(keys, drop=drop, inplace=True)
        # in-place operation returns nothing
        assert return_value is None
    else:
        result = df.set_index(keys, drop=drop)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
    """``append=True`` adds the key columns as levels after the existing index."""
    frame = frame_of_index_cols
    key_list = keys if isinstance(keys, list) else [keys]

    # existing (unnamed) index first, then one level per key
    levels = [frame.index]
    levels.extend(frame[label] for label in key_list)
    appended = MultiIndex.from_arrays(levels, names=[None] + key_list)

    if drop:
        expected = frame.drop(key_list, axis=1)
    else:
        expected = frame.copy()
    expected.index = appended

    result = frame.set_index(key_list, drop=drop, append=True)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
# A has duplicate values, C does not
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):
    """Appending keys in two steps equals appending them all at once."""
    key_list = keys if isinstance(keys, list) else [keys]

    # one shot: "D" together with the parametrized keys
    expected = frame_of_index_cols.set_index(
        ["D"] + key_list, drop=drop, append=True
    )

    # two steps: append to existing multiindex
    with_d = frame_of_index_cols.set_index(["D"], drop=drop, append=True)
    result = with_d.set_index(key_list, drop=drop, append=True)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_set_index_after_mutation(self):
|
||||
# GH#1590
|
||||
df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})
|
||||
expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))
|
||||
|
||||
df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
|
||||
result = df2.set_index("key")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize(
    "box",
    [
        Series,
        Index,
        np.array,
        list,
        lambda x: [list(x)],
        lambda x: MultiIndex.from_arrays([x]),
    ],
)
@pytest.mark.parametrize(
    "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)]
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_single_array(
    self, frame_of_index_cols, drop, append, index_name, box
):
    """set_index accepts column "B" passed as an array-like in various boxes.

    A plain ``list`` of the values is the exception: it is treated as a
    list of column keys and must raise ``KeyError``.
    """
    import re

    df = frame_of_index_cols
    df.index.name = index_name

    key = box(df["B"])
    if box is list:
        # list of strings gets interpreted as list of keys
        # NOTE: the pattern is escaped so the brackets are matched literally;
        # unescaped they form a regex character class that matches almost
        # any message, so the error text was effectively unchecked
        msg = re.escape("['one', 'two', 'three', 'one', 'two']")
        with pytest.raises(KeyError, match=msg):
            df.set_index(key, drop=drop, append=append)
    else:
        # np.array/list-of-list "forget" the name of B
        name_mi = getattr(key, "names", None)
        name = [getattr(key, "name", None)] if name_mi is None else name_mi

        result = df.set_index(key, drop=drop, append=append)

        # only valid column keys are dropped
        # since B is always passed as array above, nothing is dropped
        expected = df.set_index(["B"], drop=False, append=append)
        expected.index.names = [index_name] + name if append else name

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize(
    "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]
)
@pytest.mark.parametrize(
    "append, index_name",
    [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_arrays(
    self, frame_of_index_cols, drop, append, index_name, box
):
    """set_index accepts a mix of a column label and a boxed array."""
    frame = frame_of_index_cols
    frame.index.name = index_name

    boxed_b = box(frame["B"])
    # np.array/list "forget" the name of B
    if box in [np.array, list, tuple, iter]:
        b_name = None
    else:
        b_name = "B"
    level_names = ["A", b_name]

    result = frame.set_index(["A", boxed_b], drop=drop, append=append)

    # only valid column keys are dropped
    # since B is always passed as array above, only A is dropped, if at all
    expected = frame.set_index(["A", "B"], drop=False, append=append)
    if drop:
        expected = expected.drop("A", axis=1)
    if append:
        expected.index.names = [index_name] + level_names
    else:
        expected.index.names = level_names

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize(
    "box2",
    [
        Series,
        Index,
        np.array,
        list,
        iter,
        lambda x: MultiIndex.from_arrays([x]),
        lambda x: x.name,
    ],
)
@pytest.mark.parametrize(
    "box1",
    [
        Series,
        Index,
        np.array,
        list,
        iter,
        lambda x: MultiIndex.from_arrays([x]),
        lambda x: x.name,
    ],
)
@pytest.mark.parametrize(
    "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]
)
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_arrays_duplicate(
    self, frame_of_index_cols, drop, append, index_name, box1, box2
):
    """Passing column "A" twice, in any two boxes, equals adding it twice."""
    frame = frame_of_index_cols
    frame.index.name = index_name

    result = frame.set_index(
        [box1(frame["A"]), box2(frame["A"])], drop=drop, append=append
    )

    # if either box is iter, it has been consumed; re-read
    first_key, second_key = box1(frame["A"]), box2(frame["A"])

    # need to adapt first drop for case that both keys are 'A' --
    # cannot drop the same column twice;
    # plain == would give ambiguous Boolean error for containers
    both_are_label_a = (
        isinstance(first_key, str)
        and first_key == "A"
        and isinstance(second_key, str)
        and second_key == "A"
    )
    first_drop = False if both_are_label_a else drop

    # to test against already-tested behaviour, we add sequentially,
    # hence second append always True; must wrap keys in list, otherwise
    # box = list would be interpreted as keys
    expected = frame.set_index([first_key], drop=first_drop, append=append)
    expected = expected.set_index([second_key], drop=drop, append=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("append", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):
    """A MultiIndex key gives the same result as the labels with drop=False."""
    frame = frame_of_index_cols
    mi_key = MultiIndex.from_arrays([frame["A"], frame["B"]], names=["A", "B"])

    # setting with a MultiIndex will never drop columns
    expected = frame.set_index(["A", "B"], drop=False, append=append)

    result = frame.set_index(mi_key, drop=drop, append=append)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_construction_with_categorical_index(self):
|
||||
ci = CategoricalIndex(list("ab") * 5, name="B")
|
||||
|
||||
# with Categorical
|
||||
df = DataFrame(
|
||||
{"A": np.random.default_rng(2).standard_normal(10), "B": ci.values}
|
||||
)
|
||||
idf = df.set_index("B")
|
||||
tm.assert_index_equal(idf.index, ci)
|
||||
|
||||
# from a CategoricalIndex
|
||||
df = DataFrame({"A": np.random.default_rng(2).standard_normal(10), "B": ci})
|
||||
idf = df.set_index("B")
|
||||
tm.assert_index_equal(idf.index, ci)
|
||||
|
||||
# round-trip
|
||||
idf = idf.reset_index().set_index("B")
|
||||
tm.assert_index_equal(idf.index, ci)
|
||||
|
||||
def test_set_index_preserve_categorical_dtype(self):
|
||||
# GH#13743, GH#13854
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 1, 1, 2],
|
||||
"B": [10, 16, 22, 28, 34],
|
||||
"C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
|
||||
"C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
|
||||
}
|
||||
)
|
||||
for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]:
|
||||
result = df.set_index(cols).reset_index()
|
||||
result = result.reindex(columns=df.columns)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_set_index_datetime(self):
|
||||
# GH#3950
|
||||
df = DataFrame(
|
||||
{
|
||||
"label": ["a", "a", "a", "b", "b", "b"],
|
||||
"datetime": [
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
"value": range(6),
|
||||
}
|
||||
)
|
||||
df.index = to_datetime(df.pop("datetime"), utc=True)
|
||||
df.index = df.index.tz_convert("US/Pacific")
|
||||
|
||||
expected = DatetimeIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
name="datetime",
|
||||
)
|
||||
expected = expected.tz_localize("UTC").tz_convert("US/Pacific")
|
||||
|
||||
df = df.set_index("label", append=True)
|
||||
tm.assert_index_equal(df.index.levels[0], expected)
|
||||
tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
|
||||
assert df.index.names == ["datetime", "label"]
|
||||
|
||||
df = df.swaplevel(0, 1)
|
||||
tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
|
||||
tm.assert_index_equal(df.index.levels[1], expected)
|
||||
assert df.index.names == ["label", "datetime"]
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).random(6))
|
||||
idx1 = DatetimeIndex(
|
||||
[
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
tz="US/Eastern",
|
||||
)
|
||||
idx2 = DatetimeIndex(
|
||||
[
|
||||
"2012-04-01 09:00",
|
||||
"2012-04-01 09:00",
|
||||
"2012-04-01 09:00",
|
||||
"2012-04-02 09:00",
|
||||
"2012-04-02 09:00",
|
||||
"2012-04-02 09:00",
|
||||
],
|
||||
tz="US/Eastern",
|
||||
)
|
||||
idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
|
||||
idx3 = idx3._with_freq(None)
|
||||
|
||||
df = df.set_index(idx1)
|
||||
df = df.set_index(idx2, append=True)
|
||||
df = df.set_index(idx3, append=True)
|
||||
|
||||
expected1 = DatetimeIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
tz="US/Eastern",
|
||||
)
|
||||
expected2 = DatetimeIndex(
|
||||
["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
|
||||
)
|
||||
|
||||
tm.assert_index_equal(df.index.levels[0], expected1)
|
||||
tm.assert_index_equal(df.index.levels[1], expected2)
|
||||
tm.assert_index_equal(df.index.levels[2], idx3)
|
||||
|
||||
# GH#7092
|
||||
tm.assert_index_equal(df.index.get_level_values(0), idx1)
|
||||
tm.assert_index_equal(df.index.get_level_values(1), idx2)
|
||||
tm.assert_index_equal(df.index.get_level_values(2), idx3)
|
||||
|
||||
def test_set_index_period(self):
|
||||
# GH#6631
|
||||
df = DataFrame(np.random.default_rng(2).random(6))
|
||||
idx1 = period_range("2011-01-01", periods=3, freq="M")
|
||||
idx1 = idx1.append(idx1)
|
||||
idx2 = period_range("2013-01-01 09:00", periods=2, freq="h")
|
||||
idx2 = idx2.append(idx2).append(idx2)
|
||||
idx3 = period_range("2005", periods=6, freq="Y")
|
||||
|
||||
df = df.set_index(idx1)
|
||||
df = df.set_index(idx2, append=True)
|
||||
df = df.set_index(idx3, append=True)
|
||||
|
||||
expected1 = period_range("2011-01-01", periods=3, freq="M")
|
||||
expected2 = period_range("2013-01-01 09:00", periods=2, freq="h")
|
||||
|
||||
tm.assert_index_equal(df.index.levels[0], expected1)
|
||||
tm.assert_index_equal(df.index.levels[1], expected2)
|
||||
tm.assert_index_equal(df.index.levels[2], idx3)
|
||||
|
||||
tm.assert_index_equal(df.index.get_level_values(0), idx1)
|
||||
tm.assert_index_equal(df.index.get_level_values(1), idx2)
|
||||
tm.assert_index_equal(df.index.get_level_values(2), idx3)
|
||||
|
||||
|
||||
class TestSetIndexInvalid:
|
||||
def test_set_index_verify_integrity(self, frame_of_index_cols):
    """``verify_integrity=True`` rejects keys that produce duplicate entries."""
    frame = frame_of_index_cols
    duplicate_msg = "Index has duplicate keys"

    with pytest.raises(ValueError, match=duplicate_msg):
        frame.set_index("A", verify_integrity=True)

    # with MultiIndex
    doubled = [frame["A"], frame["A"]]
    with pytest.raises(ValueError, match=duplicate_msg):
        frame.set_index(doubled, verify_integrity=True)
|
||||
|
||||
@pytest.mark.parametrize("append", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
    """Keys that are not column labels raise ``KeyError`` naming them.

    The bracketed patterns are passed through ``re.escape``: unescaped,
    ``[...]`` is a regex character class matching a single character, so
    the error message would effectively go unchecked.
    """
    import re

    df = frame_of_index_cols

    with pytest.raises(KeyError, match=re.escape("['foo', 'bar', 'baz']")):
        # column names are A-E, as well as one tuple
        df.set_index(["foo", "bar", "baz"], drop=drop, append=append)

    # non-existent key in list with arrays
    with pytest.raises(KeyError, match="X"):
        df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)

    msg = re.escape("[('foo', 'foo', 'foo', 'bar', 'bar')]")
    # tuples always raise KeyError
    with pytest.raises(KeyError, match=msg):
        df.set_index(tuple(df["A"]), drop=drop, append=append)

    # also within a list
    with pytest.raises(KeyError, match=msg):
        df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)
|
||||
|
||||
@pytest.mark.parametrize("append", [True, False])
@pytest.mark.parametrize("drop", [True, False])
@pytest.mark.parametrize("box", [set], ids=["set"])
def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):
    """A key of a forbidden container type raises ``TypeError``."""
    frame = frame_of_index_cols
    expected_msg = 'The parameter "keys" may be a column key, .*'

    # forbidden type, e.g. set
    bad_key = box(frame["A"])
    with pytest.raises(TypeError, match=expected_msg):
        frame.set_index(bad_key, drop=drop, append=append)

    # forbidden type in list, e.g. set
    keys_with_bad = ["A", frame["A"], box(frame["A"])]
    with pytest.raises(TypeError, match=expected_msg):
        frame.set_index(keys_with_bad, drop=drop, append=append)
|
||||
|
||||
# MultiIndex constructor does not work directly on Series -> lambda
@pytest.mark.parametrize(
    "box",
    [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],
    ids=["Series", "Index", "np.array", "iter", "MultiIndex"],
)
@pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])
@pytest.mark.parametrize("append", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_set_index_raise_on_len(
    self, frame_of_index_cols, box, length, drop, append
):
    """An array-like key of the wrong length raises ``ValueError`` (GH 24984)."""
    frame = frame_of_index_cols  # has length 5
    values = np.random.default_rng(2).integers(0, 10, (length,))
    mismatch_msg = "Length mismatch: Expected 5 rows, received array of length.*"

    # wrong length directly
    wrong_len_key = box(values)
    with pytest.raises(ValueError, match=mismatch_msg):
        frame.set_index(wrong_len_key, drop=drop, append=append)

    # wrong length in list; box again, since an iterator box is single-use
    keys_with_wrong_len = ["A", frame.A, box(values)]
    with pytest.raises(ValueError, match=mismatch_msg):
        frame.set_index(keys_with_wrong_len, drop=drop, append=append)
|
||||
|
||||
|
||||
class TestSetIndexCustomLabelType:
|
||||
def test_set_index_custom_label_type(self):
    """Instances of a user-defined class work as column labels (GH#24969)."""

    class Thing:
        def __init__(self, name, color) -> None:
            self.name = name
            self.color = color

        def __str__(self) -> str:
            return f"<Thing {self.name!r}>"

        # necessary for pretty KeyError
        __repr__ = __str__

    red_one = Thing("One", "red")
    blue_two = Thing("Two", "blue")
    frame = DataFrame({red_one: [0, 1], blue_two: [2, 3]})
    expected = DataFrame({red_one: [0, 1]}, index=Index([2, 3], name=blue_two))

    # use custom label directly
    tm.assert_frame_equal(frame.set_index(blue_two), expected)

    # custom label wrapped in list
    tm.assert_frame_equal(frame.set_index([blue_two]), expected)

    # missing key
    pink_three = Thing("Three", "pink")
    missing_msg = "<Thing 'Three'>"
    with pytest.raises(KeyError, match=missing_msg):
        # missing label directly
        frame.set_index(pink_three)

    with pytest.raises(KeyError, match=missing_msg):
        # missing label in list
        frame.set_index([pink_three])
|
||||
|
||||
def test_set_index_custom_label_hashable_iterable(self):
    """Labels that are both hashable and iterable are treated as labels.

    GH#24969. The actual example discussed in GH 24984 was e.g. for
    shapely.geometry objects (e.g. a collection of Points) that can be both
    hashable and iterable; a frozenset subclass is the stand-in used here.
    """

    class Thing(frozenset):
        # need to stabilize repr for KeyError (due to random order in sets)
        def __repr__(self) -> str:
            member_reprs = [repr(member) for member in sorted(self)]
            return "frozenset({" + ", ".join(member_reprs) + "})"

    red_one = Thing(["One", "red"])
    blue_two = Thing(["Two", "blue"])
    frame = DataFrame({red_one: [0, 1], blue_two: [2, 3]})
    expected = DataFrame({red_one: [0, 1]}, index=Index([2, 3], name=blue_two))

    # use custom label directly
    tm.assert_frame_equal(frame.set_index(blue_two), expected)

    # custom label wrapped in list
    tm.assert_frame_equal(frame.set_index([blue_two]), expected)

    # missing key
    pink_three = Thing(["Three", "pink"])
    missing_msg = r"frozenset\(\{'Three', 'pink'\}\)"
    with pytest.raises(KeyError, match=missing_msg):
        # missing label directly
        frame.set_index(pink_three)

    with pytest.raises(KeyError, match=missing_msg):
        # missing label in list
        frame.set_index([pink_three])
|
||||
|
||||
def test_set_index_custom_label_type_raises(self):
    """An unhashable custom label is rejected with ``TypeError`` (GH#24969)."""

    # purposefully inherit from something unhashable
    class Thing(set):
        def __init__(self, name, color) -> None:
            self.name = name
            self.color = color

        def __str__(self) -> str:
            return f"<Thing {self.name!r}>"

    red_one = Thing("One", "red")
    blue_two = Thing("Two", "blue")
    frame = DataFrame([[0, 2], [1, 3]], columns=[red_one, blue_two])

    type_msg = 'The parameter "keys" may be a column key, .*'

    with pytest.raises(TypeError, match=type_msg):
        # use custom label directly
        frame.set_index(blue_two)

    with pytest.raises(TypeError, match=type_msg):
        # custom label wrapped in list
        frame.set_index([blue_two])
|
||||
|
||||
def test_set_index_periodindex(self):
|
||||
# GH#6631
|
||||
df = DataFrame(np.random.default_rng(2).random(6))
|
||||
idx1 = period_range("2011/01/01", periods=6, freq="M")
|
||||
idx2 = period_range("2013", periods=6, freq="Y")
|
||||
|
||||
df = df.set_index(idx1)
|
||||
tm.assert_index_equal(df.index, idx1)
|
||||
df = df.set_index(idx2)
|
||||
tm.assert_index_equal(df.index, idx2)
|
@ -0,0 +1,764 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
NaT,
|
||||
Series,
|
||||
date_range,
|
||||
offsets,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameShift:
|
||||
def test_shift_axis1_with_valid_fill_value_one_array(self):
|
||||
# Case with axis=1 that does not go through the "len(arrays)>1" path
|
||||
# in DataFrame.shift
|
||||
data = np.random.default_rng(2).standard_normal((5, 3))
|
||||
df = DataFrame(data)
|
||||
res = df.shift(axis=1, periods=1, fill_value=12345)
|
||||
expected = df.T.shift(periods=1, fill_value=12345).T
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
# same but with an 1D ExtensionArray backing it
|
||||
df2 = df[[0]].astype("Float64")
|
||||
res2 = df2.shift(axis=1, periods=1, fill_value=12345)
|
||||
expected2 = DataFrame([12345] * 5, dtype="Float64")
|
||||
tm.assert_frame_equal(res2, expected2)
|
||||
|
||||
def test_shift_deprecate_freq_and_fill_value(self, frame_or_series):
|
||||
# Can't pass both!
|
||||
obj = frame_or_series(
|
||||
np.random.default_rng(2).standard_normal(5),
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
|
||||
msg = (
|
||||
"Passing a 'freq' together with a 'fill_value' silently ignores the "
|
||||
"fill_value"
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
obj.shift(1, fill_value=1, freq="h")
|
||||
|
||||
if frame_or_series is DataFrame:
|
||||
obj.columns = date_range("1/1/2000", periods=1, freq="h")
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
obj.shift(1, axis=1, fill_value=1, freq="h")
|
||||
|
||||
@pytest.mark.parametrize(
    "input_data, output_data",
    [(np.empty(shape=(0,)), []), (np.ones(shape=(2,)), [np.nan, 1.0])],
)
def test_shift_non_writable_array(self, input_data, output_data, frame_or_series):
    """Objects backed by a read-only numpy array can be shifted (GH21049)."""
    input_data.setflags(write=False)

    result = frame_or_series(input_data).shift(1)

    if frame_or_series is Series:
        expected = frame_or_series(output_data, dtype="float64")
    else:
        # need to explicitly specify columns in the empty case
        expected = frame_or_series(
            output_data,
            index=range(len(output_data)),
            columns=range(1),
            dtype="float64",
        )

    tm.assert_equal(result, expected)
|
||||
|
||||
def test_shift_mismatched_freq(self, frame_or_series):
|
||||
ts = frame_or_series(
|
||||
np.random.default_rng(2).standard_normal(5),
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
|
||||
result = ts.shift(1, freq="5min")
|
||||
exp_index = ts.index.shift(1, freq="5min")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
# GH#1063, multiple of same base
|
||||
result = ts.shift(1, freq="4h")
|
||||
exp_index = ts.index + offsets.Hour(4)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
@pytest.mark.parametrize(
    "obj",
    [
        Series([np.arange(5)]),
        date_range("1/1/2011", periods=24, freq="h"),
        Series(range(5), index=date_range("2017", periods=5)),
    ],
)
@pytest.mark.parametrize("shift_size", [0, 1, 2])
def test_shift_always_copy(self, obj, shift_size, frame_or_series):
    """shift returns a new object, even for a zero shift (GH#22397)."""
    target = obj if frame_or_series is Series else obj.to_frame()

    shifted = target.shift(shift_size)

    assert shifted is not target
|
||||
|
||||
def test_shift_object_non_scalar_fill(self):
    """A non-scalar ``fill_value`` is rejected, except for object dtype."""
    # shift requires scalar fill_value except for object dtype
    scalar_msg = "fill_value must be a scalar"
    int_ser = Series(range(3))
    with pytest.raises(ValueError, match=scalar_msg):
        int_ser.shift(1, fill_value=[])

    int_df = int_ser.to_frame()
    with pytest.raises(ValueError, match=scalar_msg):
        int_df.shift(1, fill_value=np.arange(3))

    # object dtype: the container itself ends up in the vacated slot
    obj_ser = int_ser.astype(object)
    shifted_ser = obj_ser.shift(1, fill_value={})
    assert shifted_ser[0] == {}

    obj_df = obj_ser.to_frame()
    shifted_df = obj_df.shift(1, fill_value={})
    assert shifted_df.iloc[0, 0] == {}
|
||||
|
||||
def test_shift_int(self, datetime_frame, frame_or_series):
|
||||
ts = tm.get_obj(datetime_frame, frame_or_series).astype(int)
|
||||
shifted = ts.shift(1)
|
||||
expected = ts.astype(float).shift(1)
|
||||
tm.assert_equal(shifted, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int32", "int64"])
def test_shift_32bit_take(self, frame_or_series, dtype):
    """``periods`` may be a numpy integer of either width (GH#8129)."""
    # 32-bit taking
    dates = date_range("2000-01-01", periods=5)
    values = np.arange(5, dtype=dtype)
    obj = frame_or_series(values, index=dates)

    # a numpy scalar (value 1) of the parametrized dtype, not a Python int
    numpy_periods = values[1]
    result = obj.shift(periods=numpy_periods)

    expected = frame_or_series([np.nan, 0, 1, 2, 3], index=dates)
    tm.assert_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("periods", [1, 2, 3, 4])
def test_shift_preserve_freqstr(self, periods, frame_or_series):
    """Shifting by "2h" keeps the hourly frequency of the index (GH#21275)."""
    original_index = date_range("2016-1-1 00:00:00", periods=periods, freq="h")
    obj = frame_or_series(range(periods), index=original_index)

    result = obj.shift(1, "2h")

    # same data, index starting two hours later and still hourly
    shifted_index = date_range("2016-1-1 02:00:00", periods=periods, freq="h")
    expected = frame_or_series(range(periods), index=shifted_index)
    tm.assert_equal(result, expected)
|
||||
|
||||
def test_shift_dst(self, frame_or_series):
|
||||
# GH#13926
|
||||
dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern")
|
||||
obj = frame_or_series(dates)
|
||||
|
||||
res = obj.shift(0)
|
||||
tm.assert_equal(res, obj)
|
||||
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
|
||||
|
||||
res = obj.shift(1)
|
||||
exp_vals = [NaT] + dates.astype(object).values.tolist()[:9]
|
||||
exp = frame_or_series(exp_vals)
|
||||
tm.assert_equal(res, exp)
|
||||
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
|
||||
|
||||
res = obj.shift(-2)
|
||||
exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT]
|
||||
exp = frame_or_series(exp_vals)
|
||||
tm.assert_equal(res, exp)
|
||||
assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]"
|
||||
|
||||
@pytest.mark.parametrize("ex", [10, -10, 20, -20])
def test_shift_dst_beyond(self, frame_or_series, ex):
    """Shifting by at least the full length leaves only NaT (GH#13926)."""
    tz_dtype = "datetime64[ns, US/Eastern]"
    dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern")
    obj = frame_or_series(dates)

    result = obj.shift(ex)

    expected = frame_or_series([NaT] * 10, dtype=tz_dtype)
    tm.assert_equal(result, expected)
    assert tm.get_dtype(result) == tz_dtype
|
||||
|
||||
def test_shift_by_zero(self, datetime_frame, frame_or_series):
|
||||
# shift by 0
|
||||
obj = tm.get_obj(datetime_frame, frame_or_series)
|
||||
unshifted = obj.shift(0)
|
||||
tm.assert_equal(unshifted, obj)
|
||||
|
||||
def test_shift(self, datetime_frame):
|
||||
# naive shift
|
||||
ser = datetime_frame["A"]
|
||||
|
||||
shifted = datetime_frame.shift(5)
|
||||
tm.assert_index_equal(shifted.index, datetime_frame.index)
|
||||
|
||||
shifted_ser = ser.shift(5)
|
||||
tm.assert_series_equal(shifted["A"], shifted_ser)
|
||||
|
||||
shifted = datetime_frame.shift(-5)
|
||||
tm.assert_index_equal(shifted.index, datetime_frame.index)
|
||||
|
||||
shifted_ser = ser.shift(-5)
|
||||
tm.assert_series_equal(shifted["A"], shifted_ser)
|
||||
|
||||
unshifted = datetime_frame.shift(5).shift(-5)
|
||||
tm.assert_numpy_array_equal(
|
||||
unshifted.dropna().values, datetime_frame.values[:-5]
|
||||
)
|
||||
|
||||
unshifted_ser = ser.shift(5).shift(-5)
|
||||
tm.assert_numpy_array_equal(unshifted_ser.dropna().values, ser.values[:-5])
|
||||
|
||||
def test_shift_by_offset(self, datetime_frame, frame_or_series):
|
||||
# shift by DateOffset
|
||||
obj = tm.get_obj(datetime_frame, frame_or_series)
|
||||
offset = offsets.BDay()
|
||||
|
||||
shifted = obj.shift(5, freq=offset)
|
||||
assert len(shifted) == len(obj)
|
||||
unshifted = shifted.shift(-5, freq=offset)
|
||||
tm.assert_equal(unshifted, obj)
|
||||
|
||||
shifted2 = obj.shift(5, freq="B")
|
||||
tm.assert_equal(shifted, shifted2)
|
||||
|
||||
unshifted = obj.shift(0, freq=offset)
|
||||
tm.assert_equal(unshifted, obj)
|
||||
|
||||
d = obj.index[0]
|
||||
shifted_d = d + offset * 5
|
||||
if frame_or_series is DataFrame:
|
||||
tm.assert_series_equal(obj.xs(d), shifted.xs(shifted_d), check_names=False)
|
||||
else:
|
||||
tm.assert_almost_equal(obj.at[d], shifted.at[shifted_d])
|
||||
|
||||
def test_shift_with_periodindex(self, frame_or_series):
    """Exercise positional and freq-based shifts on a PeriodIndex-ed object."""
    # Shifting with PeriodIndex
    frame = DataFrame(
        np.arange(4, dtype=float), index=pd.period_range("2020-01-01", periods=4)
    )
    ps = tm.get_obj(frame, frame_or_series)

    forward = ps.shift(1)
    restored = forward.shift(-1)
    tm.assert_index_equal(forward.index, ps.index)
    tm.assert_index_equal(restored.index, ps.index)

    # pick the surviving values / reference values for the container type
    if frame_or_series is DataFrame:
        surviving = restored.iloc[:, 0].dropna().values
        reference = ps.iloc[:-1, 0].values
    else:
        surviving = restored.dropna().values
        reference = ps.values[:-1]
    tm.assert_numpy_array_equal(surviving, reference)

    by_alias = ps.shift(1, "D")
    by_offset = ps.shift(1, offsets.Day())
    tm.assert_equal(by_alias, by_offset)
    tm.assert_equal(ps, by_alias.shift(-1, "D"))

    msg = "does not match PeriodIndex freq"
    with pytest.raises(ValueError, match=msg):
        ps.shift(freq="W")

    # legacy support
    by_alias_kw = ps.shift(1, freq="D")
    tm.assert_equal(by_alias, by_alias_kw)

    by_offset_kw = ps.shift(1, freq=offsets.Day())
    tm.assert_equal(by_offset_kw, by_alias_kw)
|
||||
|
||||
def test_shift_other_axis(self):
|
||||
# shift other axis
|
||||
# GH#6371
|
||||
df = DataFrame(np.random.default_rng(2).random((10, 5)))
|
||||
expected = pd.concat(
|
||||
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
|
||||
ignore_index=True,
|
||||
axis=1,
|
||||
)
|
||||
result = df.shift(1, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_named_axis(self):
|
||||
# shift named axis
|
||||
df = DataFrame(np.random.default_rng(2).random((10, 5)))
|
||||
expected = pd.concat(
|
||||
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
|
||||
ignore_index=True,
|
||||
axis=1,
|
||||
)
|
||||
result = df.shift(1, axis="columns")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_other_axis_with_freq(self, datetime_frame):
|
||||
obj = datetime_frame.T
|
||||
offset = offsets.BDay()
|
||||
|
||||
# GH#47039
|
||||
shifted = obj.shift(5, freq=offset, axis=1)
|
||||
assert len(shifted) == len(obj)
|
||||
unshifted = shifted.shift(-5, freq=offset, axis=1)
|
||||
tm.assert_equal(unshifted, obj)
|
||||
|
||||
def test_shift_bool(self):
|
||||
df = DataFrame({"high": [True, False], "low": [False, False]})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame(
|
||||
np.array([[np.nan, np.nan], [True, False]], dtype=object),
|
||||
columns=["high", "low"],
|
||||
)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_categorical1(self, frame_or_series):
|
||||
# GH#9416
|
||||
obj = frame_or_series(["a", "b", "c", "d"], dtype="category")
|
||||
|
||||
rt = obj.shift(1).shift(-1)
|
||||
tm.assert_equal(obj.iloc[:-1], rt.dropna())
|
||||
|
||||
def get_cat_values(ndframe):
|
||||
# For Series we could just do ._values; for DataFrame
|
||||
# we may be able to do this if we ever have 2D Categoricals
|
||||
return ndframe._mgr.arrays[0]
|
||||
|
||||
cat = get_cat_values(obj)
|
||||
|
||||
sp1 = obj.shift(1)
|
||||
tm.assert_index_equal(obj.index, sp1.index)
|
||||
assert np.all(get_cat_values(sp1).codes[:1] == -1)
|
||||
assert np.all(cat.codes[:-1] == get_cat_values(sp1).codes[1:])
|
||||
|
||||
sn2 = obj.shift(-2)
|
||||
tm.assert_index_equal(obj.index, sn2.index)
|
||||
assert np.all(get_cat_values(sn2).codes[-2:] == -1)
|
||||
assert np.all(cat.codes[2:] == get_cat_values(sn2).codes[:-2])
|
||||
|
||||
tm.assert_index_equal(cat.categories, get_cat_values(sp1).categories)
|
||||
tm.assert_index_equal(cat.categories, get_cat_values(sn2).categories)
|
||||
|
||||
def test_shift_categorical(self):
|
||||
# GH#9416
|
||||
s1 = Series(["a", "b", "c"], dtype="category")
|
||||
s2 = Series(["A", "B", "C"], dtype="category")
|
||||
df = DataFrame({"one": s1, "two": s2})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_categorical_fill_value(self, frame_or_series):
    """fill_value must be an existing category when shifting categoricals."""
    ts = frame_or_series(["a", "b", "c", "d"], dtype="category")

    filled = ts.shift(1, fill_value="a")
    expected_cat = pd.Categorical(
        ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False
    )
    tm.assert_equal(filled, frame_or_series(expected_cat))

    # check for incorrect fill_value
    msg = r"Cannot setitem on a Categorical with a new category \(f\)"
    with pytest.raises(TypeError, match=msg):
        ts.shift(1, fill_value="f")
|
||||
|
||||
def test_shift_fill_value(self, frame_or_series):
|
||||
# GH#24128
|
||||
dti = date_range("1/1/2000", periods=5, freq="h")
|
||||
|
||||
ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti)
|
||||
exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti)
|
||||
# check that fill value works
|
||||
result = ts.shift(1, fill_value=0.0)
|
||||
tm.assert_equal(result, exp)
|
||||
|
||||
exp = frame_or_series([0.0, 0.0, 1.0, 2.0, 3.0], index=dti)
|
||||
result = ts.shift(2, fill_value=0.0)
|
||||
tm.assert_equal(result, exp)
|
||||
|
||||
ts = frame_or_series([1, 2, 3])
|
||||
res = ts.shift(2, fill_value=0)
|
||||
assert tm.get_dtype(res) == tm.get_dtype(ts)
|
||||
|
||||
# retain integer dtype
|
||||
obj = frame_or_series([1, 2, 3, 4, 5], index=dti)
|
||||
exp = frame_or_series([0, 1, 2, 3, 4], index=dti)
|
||||
result = obj.shift(1, fill_value=0)
|
||||
tm.assert_equal(result, exp)
|
||||
|
||||
exp = frame_or_series([0, 0, 1, 2, 3], index=dti)
|
||||
result = obj.shift(2, fill_value=0)
|
||||
tm.assert_equal(result, exp)
|
||||
|
||||
def test_shift_empty(self):
|
||||
# Regression test for GH#8019
|
||||
df = DataFrame({"foo": []})
|
||||
rs = df.shift(-1)
|
||||
|
||||
tm.assert_frame_equal(df, rs)
|
||||
|
||||
def test_shift_duplicate_columns(self):
|
||||
# GH#9092; verify that position-based shifting works
|
||||
# in the presence of duplicate columns
|
||||
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
|
||||
data = np.random.default_rng(2).standard_normal((20, 5))
|
||||
|
||||
shifted = []
|
||||
for columns in column_lists:
|
||||
df = DataFrame(data.copy(), columns=columns)
|
||||
for s in range(5):
|
||||
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
|
||||
df.columns = range(5)
|
||||
shifted.append(df)
|
||||
|
||||
# sanity check the base case
|
||||
nulls = shifted[0].isna().sum()
|
||||
tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64"))
|
||||
|
||||
# check all answers are the same
|
||||
tm.assert_frame_equal(shifted[0], shifted[1])
|
||||
tm.assert_frame_equal(shifted[0], shifted[2])
|
||||
|
||||
def test_shift_axis1_multiple_blocks(self, using_array_manager):
|
||||
# GH#35488
|
||||
df1 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 3)))
|
||||
df2 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 2)))
|
||||
df3 = pd.concat([df1, df2], axis=1)
|
||||
if not using_array_manager:
|
||||
assert len(df3._mgr.blocks) == 2
|
||||
|
||||
result = df3.shift(2, axis=1)
|
||||
|
||||
expected = df3.take([-1, -1, 0, 1, 2], axis=1)
|
||||
# Explicit cast to float to avoid implicit cast when setting nan.
|
||||
# Column names aren't unique, so directly calling `expected.astype` won't work.
|
||||
expected = expected.pipe(
|
||||
lambda df: df.set_axis(range(df.shape[1]), axis=1)
|
||||
.astype({0: "float", 1: "float"})
|
||||
.set_axis(df.columns, axis=1)
|
||||
)
|
||||
expected.iloc[:, :2] = np.nan
|
||||
expected.columns = df3.columns
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Case with periods < 0
|
||||
# rebuild df3 because `take` call above consolidated
|
||||
df3 = pd.concat([df1, df2], axis=1)
|
||||
if not using_array_manager:
|
||||
assert len(df3._mgr.blocks) == 2
|
||||
result = df3.shift(-2, axis=1)
|
||||
|
||||
expected = df3.take([2, 3, 4, -1, -1], axis=1)
|
||||
# Explicit cast to float to avoid implicit cast when setting nan.
|
||||
# Column names aren't unique, so directly calling `expected.astype` won't work.
|
||||
expected = expected.pipe(
|
||||
lambda df: df.set_axis(range(df.shape[1]), axis=1)
|
||||
.astype({3: "float", 4: "float"})
|
||||
.set_axis(df.columns, axis=1)
|
||||
)
|
||||
expected.iloc[:, -2:] = np.nan
|
||||
expected.columns = df3.columns
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) axis=1 support
def test_shift_axis1_multiple_blocks_with_int_fill(self):
    """axis=1 shifts with an integer fill_value across two blocks."""
    # GH#42719
    rng = np.random.default_rng(2)
    left = DataFrame(rng.integers(1000, size=(5, 3), dtype=int))
    right = DataFrame(rng.integers(1000, size=(5, 2), dtype=int))
    zero = np.int_(0)

    # (periods, column order for the expected frame, columns that get filled)
    cases = (
        (2, [-1, -1, 0, 1], slice(None, 2)),
        # Case with periods < 0
        (-2, [2, 3, -1, -1], slice(-2, None)),
    )
    for periods, take_order, filled in cases:
        combined = pd.concat([left.iloc[:4, 1:3], right.iloc[:4, :]], axis=1)
        result = combined.shift(periods, axis=1, fill_value=zero)
        assert len(combined._mgr.blocks) == 2

        expected = combined.take(take_order, axis=1)
        expected.iloc[:, filled] = zero
        expected.columns = combined.columns

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_period_index_frame_shift_with_freq(self, frame_or_series):
|
||||
ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4))
|
||||
ps = tm.get_obj(ps, frame_or_series)
|
||||
|
||||
shifted = ps.shift(1, freq="infer")
|
||||
unshifted = shifted.shift(-1, freq="infer")
|
||||
tm.assert_equal(unshifted, ps)
|
||||
|
||||
shifted2 = ps.shift(freq="D")
|
||||
tm.assert_equal(shifted, shifted2)
|
||||
|
||||
shifted3 = ps.shift(freq=offsets.Day())
|
||||
tm.assert_equal(shifted, shifted3)
|
||||
|
||||
def test_datetime_frame_shift_with_freq(self, datetime_frame, frame_or_series):
|
||||
dtobj = tm.get_obj(datetime_frame, frame_or_series)
|
||||
shifted = dtobj.shift(1, freq="infer")
|
||||
unshifted = shifted.shift(-1, freq="infer")
|
||||
tm.assert_equal(dtobj, unshifted)
|
||||
|
||||
shifted2 = dtobj.shift(freq=dtobj.index.freq)
|
||||
tm.assert_equal(shifted, shifted2)
|
||||
|
||||
inferred_ts = DataFrame(
|
||||
datetime_frame.values,
|
||||
Index(np.asarray(datetime_frame.index)),
|
||||
columns=datetime_frame.columns,
|
||||
)
|
||||
inferred_ts = tm.get_obj(inferred_ts, frame_or_series)
|
||||
shifted = inferred_ts.shift(1, freq="infer")
|
||||
expected = dtobj.shift(1, freq="infer")
|
||||
expected.index = expected.index._with_freq(None)
|
||||
tm.assert_equal(shifted, expected)
|
||||
|
||||
unshifted = shifted.shift(-1, freq="infer")
|
||||
tm.assert_equal(unshifted, inferred_ts)
|
||||
|
||||
def test_period_index_frame_shift_with_freq_error(self, frame_or_series):
    """A freq that differs from the PeriodIndex freq raises ValueError."""
    frame = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4))
    ps = tm.get_obj(frame, frame_or_series)

    with pytest.raises(
        ValueError, match="Given freq M does not match PeriodIndex freq D"
    ):
        ps.shift(freq="M")
|
||||
|
||||
def test_datetime_frame_shift_with_freq_error(self, datetime_frame, frame_or_series):
    """freq="infer" raises ValueError for an irregular selection of rows."""
    dtobj = tm.get_obj(datetime_frame, frame_or_series)

    # rows 0, 5 and 7 taken out of the fixture
    irregular = dtobj.iloc[[0, 5, 7]]

    with pytest.raises(
        ValueError, match="Freq was not set in the index hence cannot be inferred"
    ):
        irregular.shift(freq="infer")
|
||||
|
||||
def test_shift_dt64values_int_fill_deprecated(self):
    """Integer fill_value on datetime64 data: TypeError on axis 0, kept on axis 1."""
    # GH#31971
    ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])

    # shifting along the rows rejects the integer fill for Series and DataFrame
    for obj in (ser, ser.to_frame()):
        with pytest.raises(TypeError, match="value should be a"):
            obj.shift(1, fill_value=0)

    # axis = 1
    consolidated = DataFrame({"A": ser, "B": ser})
    consolidated._consolidate_inplace()

    expected = DataFrame({"A": [0, 0], "B": consolidated["A"]})
    tm.assert_frame_equal(consolidated.shift(1, axis=1, fill_value=0), expected)

    # same thing but not consolidated; pre-2.0 we got different behavior
    split = DataFrame({"A": ser})
    split["B"] = ser
    assert len(split._mgr.arrays) == 2
    tm.assert_frame_equal(split.shift(1, axis=1, fill_value=0), expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "as_cat",
    [
        pytest.param(
            True,
            marks=pytest.mark.xfail(
                reason="_can_hold_element incorrectly always returns True"
            ),
        ),
        False,
    ],
)
@pytest.mark.parametrize(
    "vals",
    [
        date_range("2020-01-01", periods=2),
        date_range("2020-01-01", periods=2, tz="US/Pacific"),
        pd.period_range("2020-01-01", periods=2, freq="D"),
        pd.timedelta_range("2020 Days", periods=2, freq="D"),
        pd.interval_range(0, 3, periods=2),
        pytest.param(
            pd.array([1, 2], dtype="Int64"),
            marks=pytest.mark.xfail(
                reason="_can_hold_element incorrectly always returns True"
            ),
        ),
        pytest.param(
            pd.array([1, 2], dtype="Float32"),
            marks=pytest.mark.xfail(
                reason="_can_hold_element incorrectly always returns True"
            ),
        ),
    ],
    ids=lambda x: str(x.dtype),
)
def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat):
    """axis=1 shift with a string fill_value on several extension dtypes."""
    # GH#44564
    ser = Series(vals).astype("category") if as_cat else Series(vals)
    filler = ["foo", "foo"]

    # single column: everything is shifted out and replaced by the fill value
    single = DataFrame({"A": ser})
    tm.assert_frame_equal(
        single.shift(-1, axis=1, fill_value="foo"), DataFrame({"A": filler})
    )

    # same thing but multiple blocks
    consolidated = DataFrame({"A": ser, "B": ser})
    consolidated._consolidate_inplace()

    result = consolidated.shift(-1, axis=1, fill_value="foo")
    expected = DataFrame({"A": consolidated["B"], "B": filler})
    tm.assert_frame_equal(result, expected)

    # same thing but not consolidated
    split = DataFrame({"A": ser})
    split["B"] = ser
    assert len(split._mgr.arrays) == 2
    tm.assert_frame_equal(split.shift(-1, axis=1, fill_value="foo"), expected)
|
||||
|
||||
def test_shift_axis1_categorical_columns(self):
|
||||
# GH#38434
|
||||
ci = CategoricalIndex(["a", "b", "c"])
|
||||
df = DataFrame(
|
||||
{"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci
|
||||
)
|
||||
result = df.shift(axis=1)
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# periods != 1
|
||||
result = df.shift(2, axis=1)
|
||||
expected = DataFrame(
|
||||
{"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]},
|
||||
index=ci[:-1],
|
||||
columns=ci,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_axis1_many_periods(self):
|
||||
# GH#44978 periods > len(columns)
|
||||
df = DataFrame(np.random.default_rng(2).random((5, 3)))
|
||||
shifted = df.shift(6, axis=1, fill_value=None)
|
||||
|
||||
expected = df * np.nan
|
||||
tm.assert_frame_equal(shifted, expected)
|
||||
|
||||
shifted2 = df.shift(-6, axis=1, fill_value=None)
|
||||
tm.assert_frame_equal(shifted2, expected)
|
||||
|
||||
def test_shift_with_offsets_freq(self):
|
||||
df = DataFrame({"x": [1, 2, 3]}, index=date_range("2000", periods=3))
|
||||
shifted = df.shift(freq="1MS")
|
||||
expected = DataFrame(
|
||||
{"x": [1, 2, 3]},
|
||||
index=date_range(start="02/01/2000", end="02/01/2000", periods=3),
|
||||
)
|
||||
tm.assert_frame_equal(shifted, expected)
|
||||
|
||||
def test_shift_with_iterable_basic_functionality(self):
|
||||
# GH#44424
|
||||
data = {"a": [1, 2, 3], "b": [4, 5, 6]}
|
||||
shifts = [0, 1, 2]
|
||||
|
||||
df = DataFrame(data)
|
||||
shifted = df.shift(shifts)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a_0": [1, 2, 3],
|
||||
"b_0": [4, 5, 6],
|
||||
"a_1": [np.nan, 1.0, 2.0],
|
||||
"b_1": [np.nan, 4.0, 5.0],
|
||||
"a_2": [np.nan, np.nan, 1.0],
|
||||
"b_2": [np.nan, np.nan, 4.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(expected, shifted)
|
||||
|
||||
def test_shift_with_iterable_series(self):
|
||||
# GH#44424
|
||||
data = {"a": [1, 2, 3]}
|
||||
shifts = [0, 1, 2]
|
||||
|
||||
df = DataFrame(data)
|
||||
s = df["a"]
|
||||
tm.assert_frame_equal(s.shift(shifts), df.shift(shifts))
|
||||
|
||||
def test_shift_with_iterable_freq_and_fill_value(self):
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal(5),
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(
|
||||
# rename because shift with an iterable leads to str column names
|
||||
df.shift([1], fill_value=1).rename(columns=lambda x: int(x[0])),
|
||||
df.shift(1, fill_value=1),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(
|
||||
df.shift([1], freq="h").rename(columns=lambda x: int(x[0])),
|
||||
df.shift(1, freq="h"),
|
||||
)
|
||||
|
||||
msg = (
|
||||
"Passing a 'freq' together with a 'fill_value' silently ignores the "
|
||||
"fill_value"
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.shift([1, 2], fill_value=1, freq="h")
|
||||
|
||||
def test_shift_with_iterable_check_other_arguments(self):
    """Check `suffix` handling and the argument validation for list shifts."""
    # GH#44424
    shifts = [0, 1]
    df = DataFrame({"a": [1, 2], "b": [4, 5]})

    # test suffix
    with_suffix = df[["a"]].shift(shifts, suffix="_suffix")
    expected = DataFrame({"a_suffix_0": [1, 2], "a_suffix_1": [np.nan, 1.0]})
    tm.assert_frame_equal(with_suffix, expected)

    # check bad inputs when doing multiple shifts
    bad_calls = (
        (
            ValueError,
            "If `periods` contains multiple shifts, `axis` cannot be 1.",
            lambda: df.shift(shifts, axis=1),
        ),
        (
            TypeError,
            "Periods must be integer, but s is <class 'str'>.",
            lambda: df.shift(["s"]),
        ),
        (
            ValueError,
            "If `periods` is an iterable, it cannot be empty.",
            lambda: df.shift([]),
        ),
        (
            ValueError,
            "Cannot specify `suffix` if `periods` is an int.",
            lambda: df.shift(1, suffix="fails"),
        ),
    )
    for exc_type, msg, call in bad_calls:
        with pytest.raises(exc_type, match=msg):
            call()
|
||||
|
||||
def test_shift_axis_one_empty(self):
|
||||
# GH#57301
|
||||
df = DataFrame()
|
||||
result = df.shift(1, axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
@ -0,0 +1,21 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, index, expected",
    [
        ({"col1": [1], "col2": [3]}, None, 2),
        ({}, None, 0),
        ({"col1": [1, np.nan], "col2": [3, 4]}, None, 4),
        ({"col1": [1, 2], "col2": [3, 4]}, [["a", "b"], [1, 2]], 4),
        ({"col1": [1, 2, 3, 4], "col2": [3, 4, 5, 6]}, ["x", "y", "a", "b"], 8),
    ],
)
def test_size(data, index, expected):
    """DataFrame.size equals the expected count and is a built-in int."""
    # GH#52897
    size = DataFrame(data, index=index).size
    assert size == expected
    assert isinstance(size, int)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,940 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
NaT,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.util.version import Version
|
||||
|
||||
|
||||
class TestDataFrameSortValues:
|
||||
@pytest.mark.parametrize("dtype", [np.uint8, bool])
def test_sort_values_sparse_no_warning(self, dtype):
    """Sorting sparse dummy columns must not emit any warning."""
    # GH#45618
    categories = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
    dummies = pd.get_dummies(pd.Series(categories), dtype=dtype, sparse=True)

    with tm.assert_produces_warning(None):
        # No warnings about constructing Index from SparseArray
        sort_keys = dummies.columns.tolist()
        dummies.sort_values(by=sort_keys)
|
||||
|
||||
def test_sort_values(self):
    """Sort a 3x3 frame by columns and by rows, in both directions."""
    frame = DataFrame(
        [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")
    )

    # by column (axis=0)
    result = frame.sort_values(by="A")
    order = frame["A"].argsort().values
    tm.assert_frame_equal(result, frame.loc[frame.index[order]])

    result = frame.sort_values(by="A", ascending=False)
    descending_a = frame.loc[frame.index[order[::-1]]]
    tm.assert_frame_equal(result, descending_a)

    result = frame.sort_values(by="A", ascending=False)
    tm.assert_frame_equal(result, descending_a)

    # GH4839
    result = frame.sort_values(by=["A"], ascending=[False])
    tm.assert_frame_equal(result, descending_a)

    # multiple bys
    by_b_then_c = frame.loc[[2, 1, 3]]
    tm.assert_frame_equal(frame.sort_values(by=["B", "C"]), by_b_then_c)

    result = frame.sort_values(by=["B", "C"], ascending=False)
    tm.assert_frame_equal(result, by_b_then_c[::-1])

    result = frame.sort_values(by=["B", "A"], ascending=[True, False])
    tm.assert_frame_equal(result, by_b_then_c)

    with pytest.raises(
        ValueError, match="No axis named 2 for object type DataFrame"
    ):
        frame.sort_values(by=["A", "B"], axis=2, inplace=True)

    # by row (axis=1): GH#10806
    tm.assert_frame_equal(frame.sort_values(by=3, axis=1), frame)

    result = frame.sort_values(by=3, axis=1, ascending=False)
    tm.assert_frame_equal(result, frame.reindex(columns=["C", "B", "A"]))

    b_a_c = frame.reindex(columns=["B", "A", "C"])
    tm.assert_frame_equal(frame.sort_values(by=[1, 2], axis="columns"), b_a_c)

    result = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
    tm.assert_frame_equal(result, b_a_c)

    result = frame.sort_values(by=[1, 3], axis=1, ascending=False)
    tm.assert_frame_equal(result, frame.reindex(columns=["C", "B", "A"]))

    with pytest.raises(
        ValueError, match=r"Length of ascending \(5\) != length of by \(2\)"
    ):
        frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
|
||||
|
||||
def test_sort_values_by_empty_list(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/40258
|
||||
expected = DataFrame({"a": [1, 4, 2, 5, 3, 6]})
|
||||
result = expected.sort_values(by=[])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert result is not expected
|
||||
|
||||
def test_sort_values_inplace(self):
|
||||
frame = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 4)),
|
||||
index=[1, 2, 3, 4],
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(by="A", inplace=True)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by="A")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(by=1, axis=1, inplace=True)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by=1, axis=1)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by="A", ascending=False)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(
|
||||
by=["A", "B"], ascending=False, inplace=True
|
||||
)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by=["A", "B"], ascending=False)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_multicolumn(self):
|
||||
A = np.arange(5).repeat(20)
|
||||
B = np.tile(np.arange(5), 20)
|
||||
np.random.default_rng(2).shuffle(A)
|
||||
np.random.default_rng(2).shuffle(B)
|
||||
frame = DataFrame(
|
||||
{"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)}
|
||||
)
|
||||
|
||||
result = frame.sort_values(by=["A", "B"])
|
||||
indexer = np.lexsort((frame["B"], frame["A"]))
|
||||
expected = frame.take(indexer)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = frame.sort_values(by=["A", "B"], ascending=False)
|
||||
indexer = np.lexsort(
|
||||
(frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))
|
||||
)
|
||||
expected = frame.take(indexer)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = frame.sort_values(by=["B", "A"])
|
||||
indexer = np.lexsort((frame["A"], frame["B"]))
|
||||
expected = frame.take(indexer)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_multicolumn_uint64(self):
|
||||
# GH#9918
|
||||
# uint64 multicolumn sort
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": pd.Series([18446637057563306014, 1162265347240853609]),
|
||||
"b": pd.Series([1, 2]),
|
||||
}
|
||||
)
|
||||
df["a"] = df["a"].astype(np.uint64)
|
||||
result = df.sort_values(["a", "b"])
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.Series([18446637057563306014, 1162265347240853609]),
|
||||
"b": pd.Series([1, 2]),
|
||||
},
|
||||
index=pd.Index([1, 0]),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_nan(self):
|
||||
# GH#3917
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
|
||||
)
|
||||
|
||||
# sort one column only
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5],
|
||||
)
|
||||
sorted_df = df.sort_values(["A"], na_position="first")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
|
||||
index=[2, 5, 4, 6, 1, 0, 3],
|
||||
)
|
||||
sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = df.reindex(columns=["B", "A"])
|
||||
sorted_df = df.sort_values(by=1, axis=1, na_position="first")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', order
|
||||
expected = DataFrame(
|
||||
{"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]},
|
||||
index=[3, 0, 1, 6, 4, 5, 2],
|
||||
)
|
||||
sorted_df = df.sort_values(["A", "B"])
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', order
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]},
|
||||
index=[2, 3, 0, 1, 6, 4, 5],
|
||||
)
|
||||
sorted_df = df.sort_values(["A", "B"], na_position="first")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', not order
|
||||
expected = DataFrame(
|
||||
{"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5],
|
||||
)
|
||||
sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', not order
|
||||
expected = DataFrame(
|
||||
{"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]},
|
||||
index=[5, 4, 6, 1, 3, 0, 2],
|
||||
)
|
||||
sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_stable_descending_sort(self):
|
||||
# GH#6399
|
||||
df = DataFrame(
|
||||
[[2, "first"], [2, "second"], [1, "a"], [1, "b"]],
|
||||
columns=["sort_col", "order"],
|
||||
)
|
||||
sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
|
||||
tm.assert_frame_equal(df, sorted_df)
|
||||
|
||||
@pytest.mark.parametrize(
    "expected_idx_non_na, ascending",
    [
        [
            [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14],
            [True, True],
        ],
        [
            [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9],
            [True, False],
        ],
        [
            [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0],
            [False, True],
        ],
        [
            [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5],
            [False, False],
        ],
    ],
)
@pytest.mark.parametrize("na_position", ["first", "last"])
def test_sort_values_stable_multicolumn_sort(
    self, expected_idx_non_na, ascending, na_position
):
    """Two-key sorts are stable for every ascending / na_position combination."""
    # GH#38426 Clarify sort_values with mult. columns / labels is stable
    nan = np.nan
    df = DataFrame(
        {
            "A": [1, 2, nan, 1, 1, 1, 6, 8, 4, 8, 8, nan, nan, 8, 8],
            "B": [9, nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, nan, nan, 4, 4],
        }
    )

    # All rows with NaN in col "B" only have unique values in "A", therefore,
    # only the rows with NaNs in "A" have to be treated individually:
    if na_position == "first":
        expected_idx = [11, 12, 2] + expected_idx_non_na
    else:
        expected_idx = expected_idx_non_na + [2, 11, 12]
    expected = df.take(expected_idx)

    result = df.sort_values(
        ["A", "B"], ascending=ascending, na_position=na_position
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_stable_categorial(self):
|
||||
# GH#16793
|
||||
df = DataFrame({"x": Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
|
||||
expected = df.copy()
|
||||
sorted_df = df.sort_values("x", kind="mergesort")
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_datetimes(self):
|
||||
# GH#3461, argsort / lexsort differences for a datetime column
|
||||
df = DataFrame(
|
||||
["a", "a", "a", "b", "c", "d", "e", "f", "g"],
|
||||
columns=["A"],
|
||||
index=date_range("20130101", periods=9),
|
||||
)
|
||||
dts = [
|
||||
Timestamp(x)
|
||||
for x in [
|
||||
"2004-02-11",
|
||||
"2004-01-21",
|
||||
"2004-01-26",
|
||||
"2005-09-20",
|
||||
"2010-10-04",
|
||||
"2009-05-12",
|
||||
"2008-11-12",
|
||||
"2010-09-28",
|
||||
"2010-09-28",
|
||||
]
|
||||
]
|
||||
df["B"] = dts[::2] + dts[1::2]
|
||||
df["C"] = 2.0
|
||||
df["A1"] = 3.0
|
||||
|
||||
df1 = df.sort_values(by="A")
|
||||
df2 = df.sort_values(by=["A"])
|
||||
tm.assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by="B")
|
||||
df2 = df.sort_values(by=["B"])
|
||||
tm.assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by="B")
|
||||
|
||||
df2 = df.sort_values(by=["C", "B"])
|
||||
tm.assert_frame_equal(df1, df2)
|
||||
|
||||
def test_sort_values_frame_column_inplace_sort_exception(
    self, float_frame, using_copy_on_write
):
    """In-place sort of a column taken from a frame, with and without CoW."""
    column = float_frame["A"]
    snapshot = float_frame.copy()

    if not using_copy_on_write:
        with pytest.raises(ValueError, match="This Series is a view"):
            column.sort_values(inplace=True)
    else:
        # INFO(CoW) Series is a new object, so can be changed inplace
        # without modifying original dataframe
        column.sort_values(inplace=True)
        tm.assert_series_equal(column, snapshot["A"].sort_values())
        # column in dataframe is not changed
        tm.assert_frame_equal(float_frame, snapshot)

    # sorting an explicit copy must not raise
    duplicate = column.copy()
    duplicate.sort_values()  # it works!
|
||||
|
||||
def test_sort_values_nat_values_in_int_column(self):
|
||||
# GH#14922: "sorting with large float and multiple columns incorrect"
|
||||
|
||||
# cause was that the int64 value NaT was considered as "na". Which is
|
||||
# only correct for datetime64 columns.
|
||||
|
||||
int_values = (2, int(NaT._value))
|
||||
float_values = (2.0, -1.797693e308)
|
||||
|
||||
df = DataFrame(
|
||||
{"int": int_values, "float": float_values}, columns=["int", "float"]
|
||||
)
|
||||
|
||||
df_reversed = DataFrame(
|
||||
{"int": int_values[::-1], "float": float_values[::-1]},
|
||||
columns=["int", "float"],
|
||||
index=[1, 0],
|
||||
)
|
||||
|
||||
# NaT is not a "na" for int64 columns, so na_position must not
|
||||
# influence the result:
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="last")
|
||||
tm.assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="first")
|
||||
tm.assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
# reverse sorting order
|
||||
df_sorted = df.sort_values(["int", "float"], ascending=False)
|
||||
tm.assert_frame_equal(df_sorted, df)
|
||||
|
||||
# and now check if NaT is still considered as "na" for datetime64
|
||||
# columns:
|
||||
df = DataFrame(
|
||||
{"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values},
|
||||
columns=["datetime", "float"],
|
||||
)
|
||||
|
||||
df_reversed = DataFrame(
|
||||
{"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]},
|
||||
columns=["datetime", "float"],
|
||||
index=[1, 0],
|
||||
)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
|
||||
tm.assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
|
||||
tm.assert_frame_equal(df_sorted, df)
|
||||
|
||||
# Ascending should not affect the results.
|
||||
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
|
||||
tm.assert_frame_equal(df_sorted, df)
|
||||
|
||||
def test_sort_nat(self):
|
||||
# GH 16836
|
||||
|
||||
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
|
||||
d2 = [
|
||||
Timestamp(x)
|
||||
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
|
||||
]
|
||||
df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
|
||||
|
||||
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
|
||||
d4 = [
|
||||
Timestamp(x)
|
||||
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
|
||||
]
|
||||
expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(by=["a", "b"])
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_na_position_with_categories(self):
|
||||
# GH#22556
|
||||
# Positioning missing value properly when column is Categorical.
|
||||
categories = ["A", "B", "C"]
|
||||
category_indices = [0, 2, 4]
|
||||
list_of_nans = [np.nan, np.nan]
|
||||
na_indices = [1, 3]
|
||||
na_position_first = "first"
|
||||
na_position_last = "last"
|
||||
column_name = "c"
|
||||
|
||||
reversed_categories = sorted(categories, reverse=True)
|
||||
reversed_category_indices = sorted(category_indices, reverse=True)
|
||||
reversed_na_indices = sorted(na_indices)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
column_name: Categorical(
|
||||
["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True
|
||||
)
|
||||
}
|
||||
)
|
||||
# sort ascending with na first
|
||||
result = df.sort_values(
|
||||
by=column_name, ascending=True, na_position=na_position_first
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
column_name: Categorical(
|
||||
list_of_nans + categories, categories=categories, ordered=True
|
||||
)
|
||||
},
|
||||
index=na_indices + category_indices,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# sort ascending with na last
|
||||
result = df.sort_values(
|
||||
by=column_name, ascending=True, na_position=na_position_last
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
column_name: Categorical(
|
||||
categories + list_of_nans, categories=categories, ordered=True
|
||||
)
|
||||
},
|
||||
index=category_indices + na_indices,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# sort descending with na first
|
||||
result = df.sort_values(
|
||||
by=column_name, ascending=False, na_position=na_position_first
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
column_name: Categorical(
|
||||
list_of_nans + reversed_categories,
|
||||
categories=categories,
|
||||
ordered=True,
|
||||
)
|
||||
},
|
||||
index=reversed_na_indices + reversed_category_indices,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# sort descending with na last
|
||||
result = df.sort_values(
|
||||
by=column_name, ascending=False, na_position=na_position_last
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
column_name: Categorical(
|
||||
reversed_categories + list_of_nans,
|
||||
categories=categories,
|
||||
ordered=True,
|
||||
)
|
||||
},
|
||||
index=reversed_category_indices + reversed_na_indices,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_nat(self):
|
||||
# GH#16836
|
||||
|
||||
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
|
||||
d2 = [
|
||||
Timestamp(x)
|
||||
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
|
||||
]
|
||||
df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
|
||||
|
||||
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
|
||||
d4 = [
|
||||
Timestamp(x)
|
||||
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
|
||||
]
|
||||
expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(by=["a", "b"])
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_na_position_with_categories_raises(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"c": Categorical(
|
||||
["A", np.nan, "B", np.nan, "C"],
|
||||
categories=["A", "B", "C"],
|
||||
ordered=True,
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="invalid na_position: bad_position"):
|
||||
df.sort_values(by="c", ascending=False, na_position="bad_position")
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"original_dict, sorted_dict, ignore_index, output_index",
|
||||
[
|
||||
({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]),
|
||||
({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]),
|
||||
(
|
||||
{"A": [1, 2, 3], "B": [2, 3, 4]},
|
||||
{"A": [3, 2, 1], "B": [4, 3, 2]},
|
||||
True,
|
||||
[0, 1, 2],
|
||||
),
|
||||
(
|
||||
{"A": [1, 2, 3], "B": [2, 3, 4]},
|
||||
{"A": [3, 2, 1], "B": [4, 3, 2]},
|
||||
False,
|
||||
[2, 1, 0],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_sort_values_ignore_index(
|
||||
self, inplace, original_dict, sorted_dict, ignore_index, output_index
|
||||
):
|
||||
# GH 30114
|
||||
df = DataFrame(original_dict)
|
||||
expected = DataFrame(sorted_dict, index=output_index)
|
||||
kwargs = {"ignore_index": ignore_index, "inplace": inplace}
|
||||
|
||||
if inplace:
|
||||
result_df = df.copy()
|
||||
result_df.sort_values("A", ascending=False, **kwargs)
|
||||
else:
|
||||
result_df = df.sort_values("A", ascending=False, **kwargs)
|
||||
|
||||
tm.assert_frame_equal(result_df, expected)
|
||||
tm.assert_frame_equal(df, DataFrame(original_dict))
|
||||
|
||||
def test_sort_values_nat_na_position_default(self):
|
||||
# GH 13230
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4, 4],
|
||||
"date": pd.DatetimeIndex(
|
||||
[
|
||||
"2010-01-01 09:00:00",
|
||||
"2010-01-01 09:00:01",
|
||||
"2010-01-01 09:00:02",
|
||||
"2010-01-01 09:00:03",
|
||||
"NaT",
|
||||
]
|
||||
),
|
||||
}
|
||||
)
|
||||
result = expected.sort_values(["A", "date"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write):
    # previous behavior incorrect retained an invalid _item_cache entry
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
    )
    frame["D"] = frame["A"] * 2
    # take the column before sorting; it is written to afterwards
    col_a = frame["A"]
    if not using_array_manager:
        assert len(frame._mgr.blocks) == 2

    # result deliberately discarded
    frame.sort_values(by="A")

    if not using_copy_on_write:
        # write through the raw values; the frame is expected to see it
        col_a.values[0] = 99
        assert frame.iloc[0, 0] == frame["A"][0]
        assert frame.iloc[0, 0] == 99
    else:
        # under CoW the write must not reach the parent frame
        col_a.iloc[0] = 99
        assert frame.iloc[0, 0] == frame["A"][0]
        assert frame.iloc[0, 0] != 99
|
||||
|
||||
def test_sort_values_reshaping(self):
|
||||
# GH 39426
|
||||
values = list(range(21))
|
||||
expected = DataFrame([values], columns=values)
|
||||
df = expected.sort_values(expected.index[0], axis=1, ignore_index=True)
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_sort_values_no_by_inplace(self):
|
||||
# GH#50643
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
expected = df.copy()
|
||||
result = df.sort_values(by=[], inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
assert result is None
|
||||
|
||||
def test_sort_values_no_op_reset_index(self):
|
||||
# GH#52553
|
||||
df = DataFrame({"A": [10, 20], "B": [1, 5]}, index=[2, 3])
|
||||
result = df.sort_values(by="A", ignore_index=True)
|
||||
expected = DataFrame({"A": [10, 20], "B": [1, 5]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameSortKey: # test key sorting (issue 27237)
|
||||
def test_sort_values_inplace_key(self, sort_by_key):
|
||||
frame = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 4)),
|
||||
index=[1, 2, 3, 4],
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by="A", key=sort_by_key)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(
|
||||
by=1, axis=1, inplace=True, key=sort_by_key
|
||||
)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by=1, axis=1, key=sort_by_key)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
return_value = sorted_df.sort_values(
|
||||
by="A", ascending=False, inplace=True, key=sort_by_key
|
||||
)
|
||||
assert return_value is None
|
||||
expected = frame.sort_values(by="A", ascending=False, key=sort_by_key)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(
|
||||
by=["A", "B"], ascending=False, inplace=True, key=sort_by_key
|
||||
)
|
||||
expected = frame.sort_values(by=["A", "B"], ascending=False, key=sort_by_key)
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_values_key(self):
|
||||
df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan]))
|
||||
|
||||
result = df.sort_values(0)
|
||||
expected = df.iloc[[0, 4, 3, 1, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(0, key=lambda x: x + 5)
|
||||
expected = df.iloc[[0, 4, 3, 1, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(0, key=lambda x: -x, ascending=False)
|
||||
expected = df.iloc[[0, 4, 3, 1, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_by_key(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.array([0, 3, np.nan, 3, 2, np.nan]),
|
||||
"b": np.array([0, 2, np.nan, 5, 2, np.nan]),
|
||||
}
|
||||
)
|
||||
|
||||
result = df.sort_values("a", key=lambda x: -x)
|
||||
expected = df.iloc[[1, 3, 4, 0, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(by=["a", "b"], key=lambda x: -x)
|
||||
expected = df.iloc[[3, 1, 4, 0, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False)
|
||||
expected = df.iloc[[0, 4, 1, 3, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_by_key_by_name(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.array([0, 3, np.nan, 3, 2, np.nan]),
|
||||
"b": np.array([0, 2, np.nan, 5, 2, np.nan]),
|
||||
}
|
||||
)
|
||||
|
||||
def key(col):
|
||||
if col.name == "a":
|
||||
return -col
|
||||
else:
|
||||
return col
|
||||
|
||||
result = df.sort_values(by="a", key=key)
|
||||
expected = df.iloc[[1, 3, 4, 0, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(by=["a"], key=key)
|
||||
expected = df.iloc[[1, 3, 4, 0, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(by="b", key=key)
|
||||
expected = df.iloc[[0, 1, 4, 3, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(by=["a", "b"], key=key)
|
||||
expected = df.iloc[[1, 3, 4, 0, 2, 5]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_key_string(self):
|
||||
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
|
||||
|
||||
result = df.sort_values(1)
|
||||
expected = df[::-1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values([0, 1], key=lambda col: col.str.lower())
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = df.sort_values(
|
||||
[0, 1], key=lambda col: col.str.lower(), ascending=False
|
||||
)
|
||||
expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_key_empty(self, sort_by_key):
|
||||
df = DataFrame(np.array([]))
|
||||
|
||||
df.sort_values(0, key=sort_by_key)
|
||||
df.sort_index(key=sort_by_key)
|
||||
|
||||
def test_changes_length_raises(self):
|
||||
df = DataFrame({"A": [1, 2, 3]})
|
||||
with pytest.raises(ValueError, match="change the shape"):
|
||||
df.sort_values("A", key=lambda x: x[:1])
|
||||
|
||||
def test_sort_values_key_axes(self):
|
||||
df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]})
|
||||
|
||||
result = df.sort_values(0, key=lambda col: col.str.lower())
|
||||
expected = df[::-1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(1, key=lambda col: -col)
|
||||
expected = df[::-1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_key_dict_axis(self):
|
||||
df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]})
|
||||
|
||||
result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1)
|
||||
expected = df.loc[:, ::-1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_values(1, key=lambda col: -col, axis=1)
|
||||
expected = df.loc[:, ::-1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("ordered", [True, False])
|
||||
def test_sort_values_key_casts_to_categorical(self, ordered):
|
||||
# https://github.com/pandas-dev/pandas/issues/36383
|
||||
categories = ["c", "b", "a"]
|
||||
df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})
|
||||
|
||||
def sorter(key):
|
||||
if key.name == "y":
|
||||
return pd.Series(
|
||||
Categorical(key, categories=categories, ordered=ordered)
|
||||
)
|
||||
return key
|
||||
|
||||
result = df.sort_values(by=["x", "y"], key=sorter)
|
||||
expected = DataFrame(
|
||||
{"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0])
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.fixture
def df_none():
    # Six-row frame built without an explicit index; it has string, integer
    # and tuple column labels.
    columns = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 2, 2, 1, 1],
        "A": np.arange(6, 0, -1),
        ("B", 5): ["one", "one", "two", "two", "one", "one"],
    }
    return DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture(params=[["outer"], ["outer", "inner"]])
def df_idx(request, df_none):
    # Same data as ``df_none`` with the parametrized columns moved into
    # the index.
    index_columns = request.param
    indexed = df_none.set_index(index_columns)
    return indexed
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        "inner",  # index level
        ["outer"],  # list of index level
        "A",  # column
        [("B", 5)],  # list of column
        ["inner", "outer"],  # two index levels
        [("B", 5), "outer"],  # index level and column
        ["A", ("B", 5)],  # Two columns
        # This entry used to repeat ["inner", "outer"], so the combination
        # named in its comment was never exercised; the column is now included.
        ["inner", "outer", "A"],  # two index levels and column
    ]
)
def sort_names(request):
    # The ``by`` argument handed to ``sort_values``: a single label or a
    # list of labels. Whether "inner"/"outer" act as index levels or as
    # plain columns depends on how the frame under test was indexed.
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def ascending(request):
    # Sort direction passed through to ``sort_values(ascending=...)``.
    direction = request.param
    return direction
|
||||
|
||||
|
||||
class TestSortValuesLevelAsStr:
|
||||
def test_sort_index_level_and_column_label(
|
||||
self, df_none, df_idx, sort_names, ascending, request
|
||||
):
|
||||
# GH#14353
|
||||
if (
|
||||
Version(np.__version__) >= Version("1.25")
|
||||
and request.node.callspec.id == "df_idx0-inner-True"
|
||||
):
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason=(
|
||||
"pandas default unstable sorting of duplicates"
|
||||
"issue with numpy>=1.25 with AVX instructions"
|
||||
),
|
||||
strict=False,
|
||||
)
|
||||
)
|
||||
|
||||
# Get index levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on columns and the setting index
|
||||
expected = df_none.sort_values(
|
||||
by=sort_names, ascending=ascending, axis=0
|
||||
).set_index(levels)
|
||||
|
||||
# Compute result sorting on mix on columns and index levels
|
||||
result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_column_level_and_index_label(
|
||||
self, df_none, df_idx, sort_names, ascending, request
|
||||
):
|
||||
# GH#14353
|
||||
|
||||
# Get levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on axis=0, setting index levels, and then
|
||||
# transposing. For some cases this will result in a frame with
|
||||
# multiple column levels
|
||||
expected = (
|
||||
df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
|
||||
.set_index(levels)
|
||||
.T
|
||||
)
|
||||
|
||||
# Compute result by transposing and sorting on axis=1.
|
||||
result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)
|
||||
|
||||
if Version(np.__version__) >= Version("1.25"):
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason=(
|
||||
"pandas default unstable sorting of duplicates"
|
||||
"issue with numpy>=1.25 with AVX instructions"
|
||||
),
|
||||
strict=False,
|
||||
)
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_values_validate_ascending_for_value_error(self):
|
||||
# GH41634
|
||||
df = DataFrame({"D": [23, 7, 21]})
|
||||
|
||||
msg = 'For argument "ascending" expected type bool, received type str.'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.sort_values(by="D", ascending="False")
|
||||
|
||||
@pytest.mark.parametrize("ascending", [False, 0, 1, True])
|
||||
def test_sort_values_validate_ascending_functional(self, ascending):
|
||||
df = DataFrame({"D": [23, 7, 21]})
|
||||
indexer = df["D"].argsort().values
|
||||
|
||||
if not ascending:
|
||||
indexer = indexer[::-1]
|
||||
|
||||
expected = df.loc[df.index[indexer]]
|
||||
result = df.sort_values(by="D", ascending=ascending)
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,37 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestSwapAxes:
|
||||
def test_swapaxes(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
|
||||
msg = "'DataFrame.swapaxes' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
tm.assert_frame_equal(df.T, df.swapaxes(0, 1))
|
||||
tm.assert_frame_equal(df.T, df.swapaxes(1, 0))
|
||||
|
||||
def test_swapaxes_noop(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
|
||||
msg = "'DataFrame.swapaxes' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
tm.assert_frame_equal(df, df.swapaxes(0, 0))
|
||||
|
||||
def test_swapaxes_invalid_axis(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
|
||||
msg = "'DataFrame.swapaxes' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
msg = "No axis named 2 for object type DataFrame"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.swapaxes(2, 5)
|
||||
|
||||
def test_round_empty_not_input(self):
|
||||
# GH#51032
|
||||
df = DataFrame({"a": [1, 2]})
|
||||
msg = "'DataFrame.swapaxes' is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.swapaxes("index", "index")
|
||||
tm.assert_frame_equal(df, result)
|
||||
assert df is not result
|
@ -0,0 +1,36 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestSwaplevel:
|
||||
def test_swaplevel(self, multiindex_dataframe_random_data):
|
||||
frame = multiindex_dataframe_random_data
|
||||
|
||||
swapped = frame["A"].swaplevel()
|
||||
swapped2 = frame["A"].swaplevel(0)
|
||||
swapped3 = frame["A"].swaplevel(0, 1)
|
||||
swapped4 = frame["A"].swaplevel("first", "second")
|
||||
assert not swapped.index.equals(frame.index)
|
||||
tm.assert_series_equal(swapped, swapped2)
|
||||
tm.assert_series_equal(swapped, swapped3)
|
||||
tm.assert_series_equal(swapped, swapped4)
|
||||
|
||||
back = swapped.swaplevel()
|
||||
back2 = swapped.swaplevel(0)
|
||||
back3 = swapped.swaplevel(0, 1)
|
||||
back4 = swapped.swaplevel("second", "first")
|
||||
assert back.index.equals(frame.index)
|
||||
tm.assert_series_equal(back, back2)
|
||||
tm.assert_series_equal(back, back3)
|
||||
tm.assert_series_equal(back, back4)
|
||||
|
||||
ft = frame.T
|
||||
swapped = ft.swaplevel("first", "second", axis=1)
|
||||
exp = frame.swaplevel("first", "second").T
|
||||
tm.assert_frame_equal(swapped, exp)
|
||||
|
||||
msg = "Can only swap levels on a hierarchical axis."
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
DataFrame(range(3)).swaplevel()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,535 @@
|
||||
from collections import (
|
||||
OrderedDict,
|
||||
defaultdict,
|
||||
)
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
Index,
|
||||
Interval,
|
||||
MultiIndex,
|
||||
Period,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameToDict:
|
||||
def test_to_dict_timestamp(self):
|
||||
# GH#11247
|
||||
# split/records producing np.datetime64 rather than Timestamps
|
||||
# on datetime64[ns] dtypes only
|
||||
|
||||
tsmp = Timestamp("20130101")
|
||||
test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
|
||||
test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
|
||||
|
||||
expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
|
||||
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
|
||||
|
||||
assert test_data.to_dict(orient="records") == expected_records
|
||||
assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
|
||||
|
||||
expected_series = {
|
||||
"A": Series([tsmp, tsmp], name="A"),
|
||||
"B": Series([tsmp, tsmp], name="B"),
|
||||
}
|
||||
expected_series_mixed = {
|
||||
"A": Series([tsmp, tsmp], name="A"),
|
||||
"B": Series([1, 2], name="B"),
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
|
||||
tm.assert_dict_equal(
|
||||
test_data_mixed.to_dict(orient="series"), expected_series_mixed
|
||||
)
|
||||
|
||||
expected_split = {
|
||||
"index": [0, 1],
|
||||
"data": [[tsmp, tsmp], [tsmp, tsmp]],
|
||||
"columns": ["A", "B"],
|
||||
}
|
||||
expected_split_mixed = {
|
||||
"index": [0, 1],
|
||||
"data": [[tsmp, 1], [tsmp, 2]],
|
||||
"columns": ["A", "B"],
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
|
||||
tm.assert_dict_equal(
|
||||
test_data_mixed.to_dict(orient="split"), expected_split_mixed
|
||||
)
|
||||
|
||||
def test_to_dict_index_not_unique_with_index_orient(self):
|
||||
# GH#22801
|
||||
# Data loss when indexes are not unique. Raise ValueError.
|
||||
df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
|
||||
msg = "DataFrame index must be unique for orient='index'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_dict(orient="index")
|
||||
|
||||
def test_to_dict_invalid_orient(self):
|
||||
df = DataFrame({"A": [0, 1]})
|
||||
msg = "orient 'xinvalid' not understood"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_dict(orient="xinvalid")
|
||||
|
||||
@pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
|
||||
def test_to_dict_short_orient_raises(self, orient):
|
||||
# GH#32515
|
||||
df = DataFrame({"A": [0, 1]})
|
||||
with pytest.raises(ValueError, match="not understood"):
|
||||
df.to_dict(orient=orient)
|
||||
|
||||
@pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
|
||||
def test_to_dict(self, mapping):
|
||||
# orient= should only take the listed options
|
||||
# see GH#32515
|
||||
test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
|
||||
|
||||
# GH#16122
|
||||
recons_data = DataFrame(test_data).to_dict(into=mapping)
|
||||
|
||||
for k, v in test_data.items():
|
||||
for k2, v2 in v.items():
|
||||
assert v2 == recons_data[k][k2]
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("list", into=mapping)
|
||||
|
||||
for k, v in test_data.items():
|
||||
for k2, v2 in v.items():
|
||||
assert v2 == recons_data[k][int(k2) - 1]
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("series", into=mapping)
|
||||
|
||||
for k, v in test_data.items():
|
||||
for k2, v2 in v.items():
|
||||
assert v2 == recons_data[k][k2]
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("split", into=mapping)
|
||||
expected_split = {
|
||||
"columns": ["A", "B"],
|
||||
"index": ["1", "2", "3"],
|
||||
"data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
|
||||
}
|
||||
tm.assert_dict_equal(recons_data, expected_split)
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("records", into=mapping)
|
||||
expected_records = [
|
||||
{"A": 1.0, "B": "1"},
|
||||
{"A": 2.0, "B": "2"},
|
||||
{"A": np.nan, "B": "3"},
|
||||
]
|
||||
assert isinstance(recons_data, list)
|
||||
assert len(recons_data) == 3
|
||||
for left, right in zip(recons_data, expected_records):
|
||||
tm.assert_dict_equal(left, right)
|
||||
|
||||
# GH#10844
|
||||
recons_data = DataFrame(test_data).to_dict("index")
|
||||
|
||||
for k, v in test_data.items():
|
||||
for k2, v2 in v.items():
|
||||
assert v2 == recons_data[k2][k]
|
||||
|
||||
df = DataFrame(test_data)
|
||||
df["duped"] = df[df.columns[0]]
|
||||
recons_data = df.to_dict("index")
|
||||
comp_data = test_data.copy()
|
||||
comp_data["duped"] = comp_data[df.columns[0]]
|
||||
for k, v in comp_data.items():
|
||||
for k2, v2 in v.items():
|
||||
assert v2 == recons_data[k2][k]
|
||||
|
||||
@pytest.mark.parametrize("mapping", [list, defaultdict, []])
|
||||
def test_to_dict_errors(self, mapping):
|
||||
# GH#16122
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
|
||||
msg = "|".join(
|
||||
[
|
||||
"unsupported type: <class 'list'>",
|
||||
r"to_dict\(\) only accepts initialized defaultdicts",
|
||||
]
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.to_dict(into=mapping)
|
||||
|
||||
def test_to_dict_not_unique_warning(self):
|
||||
# GH#16927: When converting to a dict, if a column has a non-unique name
|
||||
# it will be dropped, throwing a warning.
|
||||
df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
|
||||
with tm.assert_produces_warning(UserWarning):
|
||||
df.to_dict()
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"orient,expected",
|
||||
[
|
||||
("list", {"A": [2, 5], "B": [3, 6]}),
|
||||
("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}),
|
||||
],
|
||||
)
|
||||
def test_to_dict_not_unique(self, orient, expected):
|
||||
# GH#54824: This is to make sure that dataframes with non-unique column
|
||||
# would have uniform behavior throughout different orients
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"])
|
||||
result = df.to_dict(orient)
|
||||
assert result == expected
|
||||
|
||||
# orient - orient argument to to_dict function
|
||||
# item_getter - function for extracting value from
|
||||
# the resulting dict using column name and index
|
||||
@pytest.mark.parametrize(
|
||||
"orient,item_getter",
|
||||
[
|
||||
("dict", lambda d, col, idx: d[col][idx]),
|
||||
("records", lambda d, col, idx: d[idx][col]),
|
||||
("list", lambda d, col, idx: d[col][idx]),
|
||||
("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
|
||||
("index", lambda d, col, idx: d[idx][col]),
|
||||
],
|
||||
)
|
||||
def test_to_dict_box_scalars(self, orient, item_getter):
|
||||
# GH#14216, GH#23753
|
||||
# make sure that we are boxing properly
|
||||
df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
|
||||
result = df.to_dict(orient=orient)
|
||||
assert isinstance(item_getter(result, "a", 0), int)
|
||||
assert isinstance(item_getter(result, "b", 0), float)
|
||||
|
||||
def test_to_dict_tz(self):
    # GH#18372 When converting to dict with orient='records' columns of
    # datetime that are tz-aware were not converted to required arrays
    utc = pytz.utc
    rows = [
        (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=utc),),
        (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=utc),),
    ]
    frame = DataFrame(list(rows), columns=["d"])

    records = frame.to_dict(orient="records")
    wanted = [
        {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=utc)},
        {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=utc)},
    ]

    # compared record by record, as tm.assert_dict_equal takes two dicts
    tm.assert_dict_equal(records[0], wanted[0])
    tm.assert_dict_equal(records[1], wanted[1])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"into, expected",
|
||||
[
|
||||
(
|
||||
dict,
|
||||
{
|
||||
0: {"int_col": 1, "float_col": 1.0},
|
||||
1: {"int_col": 2, "float_col": 2.0},
|
||||
2: {"int_col": 3, "float_col": 3.0},
|
||||
},
|
||||
),
|
||||
(
|
||||
OrderedDict,
|
||||
OrderedDict(
|
||||
[
|
||||
(0, {"int_col": 1, "float_col": 1.0}),
|
||||
(1, {"int_col": 2, "float_col": 2.0}),
|
||||
(2, {"int_col": 3, "float_col": 3.0}),
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
defaultdict(dict),
|
||||
defaultdict(
|
||||
dict,
|
||||
{
|
||||
0: {"int_col": 1, "float_col": 1.0},
|
||||
1: {"int_col": 2, "float_col": 2.0},
|
||||
2: {"int_col": 3, "float_col": 3.0},
|
||||
},
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_dict_index_dtypes(self, into, expected):
|
||||
# GH#18580
|
||||
# When using to_dict(orient='index') on a dataframe with int
|
||||
# and float columns only the int columns were cast to float
|
||||
|
||||
df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})
|
||||
|
||||
result = df.to_dict(orient="index", into=into)
|
||||
cols = ["int_col", "float_col"]
|
||||
result = DataFrame.from_dict(result, orient="index")[cols]
|
||||
expected = DataFrame.from_dict(expected, orient="index")[cols]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_to_dict_numeric_names(self):
|
||||
# GH#24940
|
||||
df = DataFrame({str(i): [i] for i in range(5)})
|
||||
result = set(df.to_dict("records")[0].keys())
|
||||
expected = set(df.columns)
|
||||
assert result == expected
|
||||
|
||||
def test_to_dict_wide(self):
|
||||
# GH#24939
|
||||
df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
|
||||
result = df.to_dict("records")[0]
|
||||
expected = {f"A_{i:d}": i for i in range(256)}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,dtype",
|
||||
(
|
||||
([True, True, False], bool),
|
||||
[
|
||||
[
|
||||
datetime(2018, 1, 1),
|
||||
datetime(2019, 2, 2),
|
||||
datetime(2020, 3, 3),
|
||||
],
|
||||
Timestamp,
|
||||
],
|
||||
[[1.0, 2.0, 3.0], float],
|
||||
[[1, 2, 3], int],
|
||||
[["X", "Y", "Z"], str],
|
||||
),
|
||||
)
|
||||
def test_to_dict_orient_dtype(self, data, dtype):
|
||||
# GH22620 & GH21256
|
||||
|
||||
df = DataFrame({"a": data})
|
||||
d = df.to_dict(orient="records")
|
||||
assert all(type(record["a"]) is dtype for record in d)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected_dtype",
|
||||
(
|
||||
[np.uint64(2), int],
|
||||
[np.int64(-9), int],
|
||||
[np.float64(1.1), float],
|
||||
[np.bool_(True), bool],
|
||||
[np.datetime64("2005-02-25"), Timestamp],
|
||||
),
|
||||
)
|
||||
def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
|
||||
# GH22620 & GH21256
|
||||
|
||||
df = DataFrame({"a": data}, index=[0])
|
||||
d = df.to_dict(orient="records")
|
||||
result = type(d[0]["a"])
|
||||
assert result is expected_dtype
|
||||
|
||||
def test_to_dict_mixed_numeric_frame(self):
|
||||
# GH 12859
|
||||
df = DataFrame({"a": [1.0], "b": [9.0]})
|
||||
result = df.reset_index().to_dict("records")
|
||||
expected = [{"index": 0, "a": 1.0, "b": 9.0}]
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index",
|
||||
[
|
||||
None,
|
||||
Index(["aa", "bb"]),
|
||||
Index(["aa", "bb"], name="cc"),
|
||||
MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
|
||||
MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"columns",
|
||||
[
|
||||
["x", "y"],
|
||||
Index(["x", "y"]),
|
||||
Index(["x", "y"], name="z"),
|
||||
MultiIndex.from_tuples([("x", 1), ("y", 2)]),
|
||||
MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
|
||||
],
|
||||
)
|
||||
def test_to_dict_orient_tight(self, index, columns):
|
||||
df = DataFrame.from_records(
|
||||
[[1, 3], [2, 4]],
|
||||
columns=columns,
|
||||
index=index,
|
||||
)
|
||||
roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")
|
||||
|
||||
tm.assert_frame_equal(df, roundtrip)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"orient",
|
||||
["dict", "list", "split", "records", "index", "tight"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected_types",
|
||||
(
|
||||
(
|
||||
{
|
||||
"a": [np.int64(1), 1, np.int64(3)],
|
||||
"b": [np.float64(1.0), 2.0, np.float64(3.0)],
|
||||
"c": [np.float64(1.0), 2, np.int64(3)],
|
||||
"d": [np.float64(1.0), "a", np.int64(3)],
|
||||
"e": [np.float64(1.0), ["a"], np.int64(3)],
|
||||
"f": [np.float64(1.0), ("a",), np.int64(3)],
|
||||
},
|
||||
{
|
||||
"a": [int, int, int],
|
||||
"b": [float, float, float],
|
||||
"c": [float, float, float],
|
||||
"d": [float, str, int],
|
||||
"e": [float, list, int],
|
||||
"f": [float, tuple, int],
|
||||
},
|
||||
),
|
||||
(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"b": [1.1, 2.2, 3.3],
|
||||
},
|
||||
{
|
||||
"a": [int, int, int],
|
||||
"b": [float, float, float],
|
||||
},
|
||||
),
|
||||
( # Make sure we have one df which is all object type cols
|
||||
{
|
||||
"a": [1, "hello", 3],
|
||||
"b": [1.1, "world", 3.3],
|
||||
},
|
||||
{
|
||||
"a": [int, str, int],
|
||||
"b": [float, str, float],
|
||||
},
|
||||
),
|
||||
),
|
||||
)
|
||||
def test_to_dict_returns_native_types(self, orient, data, expected_types):
|
||||
# GH 46751
|
||||
# Tests we get back native types for all orient types
|
||||
df = DataFrame(data)
|
||||
result = df.to_dict(orient)
|
||||
if orient == "dict":
|
||||
assertion_iterator = (
|
||||
(i, key, value)
|
||||
for key, index_value_map in result.items()
|
||||
for i, value in index_value_map.items()
|
||||
)
|
||||
elif orient == "list":
|
||||
assertion_iterator = (
|
||||
(i, key, value)
|
||||
for key, values in result.items()
|
||||
for i, value in enumerate(values)
|
||||
)
|
||||
elif orient in {"split", "tight"}:
|
||||
assertion_iterator = (
|
||||
(i, key, result["data"][i][j])
|
||||
for i in result["index"]
|
||||
for j, key in enumerate(result["columns"])
|
||||
)
|
||||
elif orient == "records":
|
||||
assertion_iterator = (
|
||||
(i, key, value)
|
||||
for i, record in enumerate(result)
|
||||
for key, value in record.items()
|
||||
)
|
||||
elif orient == "index":
|
||||
assertion_iterator = (
|
||||
(i, key, value)
|
||||
for i, record in result.items()
|
||||
for key, value in record.items()
|
||||
)
|
||||
|
||||
for i, key, value in assertion_iterator:
|
||||
assert value == data[key][i]
|
||||
assert type(value) is expected_types[key][i]
|
||||
|
||||
@pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
|
||||
def test_to_dict_index_false_error(self, orient):
|
||||
# GH#46398
|
||||
df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
|
||||
msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_dict(orient=orient, index=False)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"orient, expected",
|
||||
[
|
||||
("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
|
||||
(
|
||||
"tight",
|
||||
{
|
||||
"columns": ["col1", "col2"],
|
||||
"data": [[1, 3], [2, 4]],
|
||||
"column_names": [None],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_dict_index_false(self, orient, expected):
|
||||
# GH#46398
|
||||
df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
|
||||
result = df.to_dict(orient=orient, index=False)
|
||||
tm.assert_dict_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"orient, expected",
|
||||
[
|
||||
("dict", {"a": {0: 1, 1: None}}),
|
||||
("list", {"a": [1, None]}),
|
||||
("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
|
||||
(
|
||||
"tight",
|
||||
{
|
||||
"index": [0, 1],
|
||||
"columns": ["a"],
|
||||
"data": [[1], [None]],
|
||||
"index_names": [None],
|
||||
"column_names": [None],
|
||||
},
|
||||
),
|
||||
("records", [{"a": 1}, {"a": None}]),
|
||||
("index", {0: {"a": 1}, 1: {"a": None}}),
|
||||
],
|
||||
)
|
||||
def test_to_dict_na_to_none(self, orient, expected):
|
||||
# GH#50795
|
||||
df = DataFrame({"a": [1, NA]}, dtype="Int64")
|
||||
result = df.to_dict(orient=orient)
|
||||
assert result == expected
|
||||
|
||||
def test_to_dict_masked_native_python(self):
|
||||
# GH#34665
|
||||
df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
|
||||
result = df.to_dict(orient="records")
|
||||
assert isinstance(result[0]["a"], int)
|
||||
|
||||
df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
|
||||
result = df.to_dict(orient="records")
|
||||
assert isinstance(result[0]["a"], int)
|
||||
|
||||
def test_to_dict_pos_args_deprecation(self):
|
||||
# GH-54229
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
msg = (
|
||||
r"Starting with pandas version 3.0 all arguments of to_dict except for the "
|
||||
r"argument 'orient' will be keyword-only."
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.to_dict("records", {})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)]
)
def test_to_dict_list_pd_scalars(val):
    # GH 54824
    # pandas scalar objects come back unchanged with orient="list".
    as_lists = DataFrame({"a": [val]}).to_dict(orient="list")
    assert as_lists == {"a": [val]}
|
@ -0,0 +1,76 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import NumpyExtensionArray
|
||||
|
||||
pytestmark = td.skip_array_manager_invalid_test
|
||||
|
||||
|
||||
class TestToDictOfBlocks:
    @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
    def test_no_copy_blocks(self, float_frame, using_copy_on_write):
        # GH#9607
        frame = DataFrame(float_frame, copy=True)
        first_col = frame.columns[0]

        # Write into the per-dtype frames returned by _to_dict_of_blocks.
        last_block = None
        for block_frame in frame._to_dict_of_blocks().values():
            last_block = block_frame
            if first_col in block_frame:
                block_frame.loc[:, first_col] = block_frame[first_col] + 1

        assert last_block is not None
        still_matches_parent = last_block[first_col].equals(frame[first_col])
        # Without copy-on-write the write is expected to show up in the
        # original frame; with copy-on-write it must not.
        assert still_matches_parent == (not using_copy_on_write)
|
||||
|
||||
|
||||
def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write):
    # Calling _to_dict_of_blocks should not poison item_cache
    frame = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    frame["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
    assert len(frame._mgr.blocks) == 3  # i.e. not consolidated

    cached = frame["b"]  # populates item_cache["b"]

    frame._to_dict_of_blocks()

    if using_copy_on_write:
        # Under copy-on-write the values of the column are read-only.
        with pytest.raises(ValueError, match="read-only"):
            cached.values[0] = "foo"
        return

    # The link between the Series and its parent frame must be intact.
    cached.values[0] = "foo"
    assert frame.loc[0, "b"] == "foo"

    if warn_copy_on_write:
        # with warning mode, the item cache is disabled
        assert frame["b"] is not cached
    else:
        assert frame["b"] is cached
|
||||
|
||||
|
||||
def test_set_change_dtype_slice():
    # GH#8850
    columns = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")])
    frame = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=columns)
    # Multiplying by a float turns the int "2nd" column into float64.
    frame["2nd"] = frame["2nd"] * 2.0

    by_dtype = frame._to_dict_of_blocks()
    assert sorted(by_dtype.keys()) == ["float64", "int64"]

    float_expected = DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=columns[:2])
    tm.assert_frame_equal(by_dtype["float64"], float_expected)

    int_expected = DataFrame([[3], [6]], columns=columns[2:])
    tm.assert_frame_equal(by_dtype["int64"], int_expected)
|
@ -0,0 +1,49 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestToNumpy:
|
||||
def test_to_numpy(self):
|
||||
df = DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4.5]])
|
||||
result = df.to_numpy()
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_numpy_dtype(self):
|
||||
df = DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4]], dtype="int64")
|
||||
result = df.to_numpy(dtype="int64")
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_to_numpy_copy(self, using_copy_on_write):
|
||||
arr = np.random.default_rng(2).standard_normal((4, 3))
|
||||
df = DataFrame(arr)
|
||||
if using_copy_on_write:
|
||||
assert df.values.base is not arr
|
||||
assert df.to_numpy(copy=False).base is df.values.base
|
||||
else:
|
||||
assert df.values.base is arr
|
||||
assert df.to_numpy(copy=False).base is arr
|
||||
assert df.to_numpy(copy=True).base is not arr
|
||||
|
||||
# we still don't want a copy when na_value=np.nan is passed,
|
||||
# and that can be respected because we are already numpy-float
|
||||
if using_copy_on_write:
|
||||
assert df.to_numpy(copy=False).base is df.values.base
|
||||
else:
|
||||
assert df.to_numpy(copy=False, na_value=np.nan).base is arr
|
||||
|
||||
def test_to_numpy_mixed_dtype_to_str(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/35455
|
||||
df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])
|
||||
result = df.to_numpy(dtype=str)
|
||||
expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
@ -0,0 +1,89 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestToPeriod:
|
||||
def test_to_period(self, frame_or_series):
|
||||
K = 5
|
||||
|
||||
dr = date_range("1/1/2000", "1/1/2001", freq="D")
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(dr), K)),
|
||||
index=dr,
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
)
|
||||
obj["mix"] = "a"
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
pts = obj.to_period()
|
||||
exp = obj.copy()
|
||||
exp.index = period_range("1/1/2000", "1/1/2001")
|
||||
tm.assert_equal(pts, exp)
|
||||
|
||||
pts = obj.to_period("M")
|
||||
exp.index = exp.index.asfreq("M")
|
||||
tm.assert_equal(pts, exp)
|
||||
|
||||
def test_to_period_without_freq(self, frame_or_series):
|
||||
# GH#7606 without freq
|
||||
idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"])
|
||||
exp_idx = PeriodIndex(
|
||||
["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D"
|
||||
)
|
||||
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 4)), index=idx, columns=idx
|
||||
)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
expected = obj.copy()
|
||||
expected.index = exp_idx
|
||||
tm.assert_equal(obj.to_period(), expected)
|
||||
|
||||
if frame_or_series is DataFrame:
|
||||
expected = obj.copy()
|
||||
expected.columns = exp_idx
|
||||
tm.assert_frame_equal(obj.to_period(axis=1), expected)
|
||||
|
||||
def test_to_period_columns(self):
|
||||
dr = date_range("1/1/2000", "1/1/2001")
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((len(dr), 5)), index=dr)
|
||||
df["mix"] = "a"
|
||||
|
||||
df = df.T
|
||||
pts = df.to_period(axis=1)
|
||||
exp = df.copy()
|
||||
exp.columns = period_range("1/1/2000", "1/1/2001")
|
||||
tm.assert_frame_equal(pts, exp)
|
||||
|
||||
pts = df.to_period("M", axis=1)
|
||||
tm.assert_index_equal(pts.columns, exp.columns.asfreq("M"))
|
||||
|
||||
def test_to_period_invalid_axis(self):
|
||||
dr = date_range("1/1/2000", "1/1/2001")
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((len(dr), 5)), index=dr)
|
||||
df["mix"] = "a"
|
||||
|
||||
msg = "No axis named 2 for object type DataFrame"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_period(axis=2)
|
||||
|
||||
def test_to_period_raises(self, index, frame_or_series):
|
||||
# https://github.com/pandas-dev/pandas/issues/33327
|
||||
obj = Series(index=index, dtype=object)
|
||||
if frame_or_series is DataFrame:
|
||||
obj = obj.to_frame()
|
||||
|
||||
if not isinstance(index, DatetimeIndex):
|
||||
msg = f"unsupported Type {type(index).__name__}"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
obj.to_period()
|
@ -0,0 +1,523 @@
|
||||
from collections import abc
|
||||
import email
|
||||
from email.parser import Parser
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameToRecords:
|
||||
def test_to_records_timeseries(self):
|
||||
index = date_range("1/1/2000", periods=10)
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 3)),
|
||||
index=index,
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
|
||||
result = df.to_records()
|
||||
assert result["index"].dtype == "M8[ns]"
|
||||
|
||||
result = df.to_records(index=False)
|
||||
|
||||
def test_to_records_dt64(self):
|
||||
df = DataFrame(
|
||||
[["one", "two", "three"], ["four", "five", "six"]],
|
||||
index=date_range("2012-01-01", "2012-01-02"),
|
||||
)
|
||||
|
||||
expected = df.index.values[0]
|
||||
result = df.to_records()["index"][0]
|
||||
assert expected == result
|
||||
|
||||
def test_to_records_dt64tz_column(self):
|
||||
# GH#32535 dont less tz in to_records
|
||||
df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})
|
||||
|
||||
result = df.to_records()
|
||||
|
||||
assert result.dtype["A"] == object
|
||||
val = result[0][1]
|
||||
assert isinstance(val, Timestamp)
|
||||
assert val == df.loc[0, "A"]
|
||||
|
||||
def test_to_records_with_multindex(self):
|
||||
# GH#3189
|
||||
index = [
|
||||
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
|
||||
["one", "two", "one", "two", "one", "two", "one", "two"],
|
||||
]
|
||||
data = np.zeros((8, 4))
|
||||
df = DataFrame(data, index=index)
|
||||
r = df.to_records(index=True)["level_0"]
|
||||
assert "bar" in r
|
||||
assert "one" not in r
|
||||
|
||||
def test_to_records_with_Mapping_type(self):
|
||||
abc.Mapping.register(email.message.Message)
|
||||
|
||||
headers = Parser().parsestr(
|
||||
"From: <user@example.com>\n"
|
||||
"To: <someone_else@example.com>\n"
|
||||
"Subject: Test message\n"
|
||||
"\n"
|
||||
"Body would go here\n"
|
||||
)
|
||||
|
||||
frame = DataFrame.from_records([headers])
|
||||
all(x in frame for x in ["Type", "Subject", "From"])
|
||||
|
||||
def test_to_records_floats(self):
|
||||
df = DataFrame(np.random.default_rng(2).random((10, 10)))
|
||||
df.to_records()
|
||||
|
||||
def test_to_records_index_name(self):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
|
||||
df.index.name = "X"
|
||||
rs = df.to_records()
|
||||
assert "X" in rs.dtype.fields
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)))
|
||||
rs = df.to_records()
|
||||
assert "index" in rs.dtype.fields
|
||||
|
||||
df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
|
||||
df.index.names = ["A", None]
|
||||
result = df.to_records()
|
||||
expected = np.rec.fromarrays(
|
||||
[np.array(["a", "a", "b"]), np.array(["x", "y", "z"])]
|
||||
+ [np.asarray(df.iloc[:, i]) for i in range(3)],
|
||||
dtype={
|
||||
"names": ["A", "level_1", "0", "1", "2"],
|
||||
"formats": [
|
||||
"O",
|
||||
"O",
|
||||
f"{tm.ENDIAN}f8",
|
||||
f"{tm.ENDIAN}f8",
|
||||
f"{tm.ENDIAN}f8",
|
||||
],
|
||||
},
|
||||
)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_records_with_unicode_index(self):
|
||||
# GH#13172
|
||||
# unicode_literals conflict with to_records
|
||||
result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
|
||||
expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_index_dtype(self):
|
||||
# GH 47263: consistent data types for Index and MultiIndex
|
||||
df = DataFrame(
|
||||
{
|
||||
1: date_range("2022-01-01", periods=2),
|
||||
2: date_range("2022-01-01", periods=2),
|
||||
3: date_range("2022-01-01", periods=2),
|
||||
}
|
||||
)
|
||||
|
||||
expected = np.rec.array(
|
||||
[
|
||||
("2022-01-01", "2022-01-01", "2022-01-01"),
|
||||
("2022-01-02", "2022-01-02", "2022-01-02"),
|
||||
],
|
||||
dtype=[
|
||||
("1", f"{tm.ENDIAN}M8[ns]"),
|
||||
("2", f"{tm.ENDIAN}M8[ns]"),
|
||||
("3", f"{tm.ENDIAN}M8[ns]"),
|
||||
],
|
||||
)
|
||||
|
||||
result = df.to_records(index=False)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
result = df.set_index(1).to_records(index=True)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
result = df.set_index([1, 2]).to_records(index=True)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_unicode_column_names(self):
|
||||
# xref issue: https://github.com/numpy/numpy/issues/2407
|
||||
# Issue GH#11879. to_records used to raise an exception when used
|
||||
# with column names containing non-ascii characters in Python 2
|
||||
result = DataFrame(data={"accented_name_é": [1.0]}).to_records()
|
||||
|
||||
# Note that numpy allows for unicode field names but dtypes need
|
||||
# to be specified using dictionary instead of list of tuples.
|
||||
expected = np.rec.array(
|
||||
[(0, 1.0)],
|
||||
dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_categorical(self):
|
||||
# GH#8626
|
||||
|
||||
# dict creation
|
||||
df = DataFrame({"A": list("abc")}, dtype="category")
|
||||
expected = Series(list("abc"), dtype="category", name="A")
|
||||
tm.assert_series_equal(df["A"], expected)
|
||||
|
||||
# list-like creation
|
||||
df = DataFrame(list("abc"), dtype="category")
|
||||
expected = Series(list("abc"), dtype="category", name=0)
|
||||
tm.assert_series_equal(df[0], expected)
|
||||
|
||||
# to record array
|
||||
# this coerces
|
||||
result = df.to_records()
|
||||
expected = np.rec.array(
|
||||
[(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,expected",
|
||||
[
|
||||
# No dtypes --> default to array dtypes.
|
||||
(
|
||||
{},
|
||||
np.rec.array(
|
||||
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Should have no effect in this case.
|
||||
(
|
||||
{"index": True},
|
||||
np.rec.array(
|
||||
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Column dtype applied across the board. Index unaffected.
|
||||
(
|
||||
{"column_dtypes": f"{tm.ENDIAN}U4"},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", f"{tm.ENDIAN}U4"),
|
||||
("B", f"{tm.ENDIAN}U4"),
|
||||
("C", f"{tm.ENDIAN}U4"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Index dtype applied across the board. Columns unaffected.
|
||||
(
|
||||
{"index_dtypes": f"{tm.ENDIAN}U1"},
|
||||
np.rec.array(
|
||||
[("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}U1"),
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Pass in a type instance.
|
||||
(
|
||||
{"column_dtypes": str},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", f"{tm.ENDIAN}U"),
|
||||
("B", f"{tm.ENDIAN}U"),
|
||||
("C", f"{tm.ENDIAN}U"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Pass in a dtype instance.
|
||||
(
|
||||
{"column_dtypes": np.dtype(np.str_)},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", f"{tm.ENDIAN}U"),
|
||||
("B", f"{tm.ENDIAN}U"),
|
||||
("C", f"{tm.ENDIAN}U"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Pass in a dictionary (name-only).
|
||||
(
|
||||
{
|
||||
"column_dtypes": {
|
||||
"A": np.int8,
|
||||
"B": np.float32,
|
||||
"C": f"{tm.ENDIAN}U2",
|
||||
}
|
||||
},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", "i1"),
|
||||
("B", f"{tm.ENDIAN}f4"),
|
||||
("C", f"{tm.ENDIAN}U2"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Pass in a dictionary (indices-only).
|
||||
(
|
||||
{"index_dtypes": {0: "int16"}},
|
||||
np.rec.array(
|
||||
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("index", "i2"),
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Ignore index mappings if index is not True.
|
||||
(
|
||||
{"index": False, "index_dtypes": f"{tm.ENDIAN}U2"},
|
||||
np.rec.array(
|
||||
[(1, 0.2, "a"), (2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Non-existent names / indices in mapping should not error.
|
||||
(
|
||||
{"index_dtypes": {0: "int16", "not-there": "float32"}},
|
||||
np.rec.array(
|
||||
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[
|
||||
("index", "i2"),
|
||||
("A", f"{tm.ENDIAN}i8"),
|
||||
("B", f"{tm.ENDIAN}f8"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Names / indices not in mapping default to array dtype.
|
||||
(
|
||||
{"column_dtypes": {"A": np.int8, "B": np.float32}},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", "i1"),
|
||||
("B", f"{tm.ENDIAN}f4"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Names / indices not in dtype mapping default to array dtype.
|
||||
(
|
||||
{"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}i8"),
|
||||
("A", "i1"),
|
||||
("B", f"{tm.ENDIAN}f4"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Mixture of everything.
|
||||
(
|
||||
{
|
||||
"column_dtypes": {"A": np.int8, "B": np.float32},
|
||||
"index_dtypes": f"{tm.ENDIAN}U2",
|
||||
},
|
||||
np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}U2"),
|
||||
("A", "i1"),
|
||||
("B", f"{tm.ENDIAN}f4"),
|
||||
("C", "O"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# Invalid dype values.
|
||||
(
|
||||
{"index": False, "column_dtypes": []},
|
||||
(ValueError, "Invalid dtype \\[\\] specified for column A"),
|
||||
),
|
||||
(
|
||||
{"index": False, "column_dtypes": {"A": "int32", "B": 5}},
|
||||
(ValueError, "Invalid dtype 5 specified for column B"),
|
||||
),
|
||||
# Numpy can't handle EA types, so check error is raised
|
||||
(
|
||||
{
|
||||
"index": False,
|
||||
"column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])},
|
||||
},
|
||||
(ValueError, "Invalid dtype category specified for column B"),
|
||||
),
|
||||
# Check that bad types raise
|
||||
(
|
||||
{"index": False, "column_dtypes": {"A": "int32", "B": "foo"}},
|
||||
(TypeError, "data type [\"']foo[\"'] not understood"),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_records_dtype(self, kwargs, expected):
|
||||
# see GH#18146
|
||||
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
|
||||
|
||||
if not isinstance(expected, np.rec.recarray):
|
||||
with pytest.raises(expected[0], match=expected[1]):
|
||||
df.to_records(**kwargs)
|
||||
else:
|
||||
result = df.to_records(**kwargs)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"df,kwargs,expected",
|
||||
[
|
||||
# MultiIndex in the index.
|
||||
(
|
||||
DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
|
||||
).set_index(["a", "b"]),
|
||||
{"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}},
|
||||
np.rec.array(
|
||||
[(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
|
||||
dtype=[
|
||||
("a", f"{tm.ENDIAN}i4"),
|
||||
("b", "i1"),
|
||||
("c", f"{tm.ENDIAN}f8"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# MultiIndex in the columns.
|
||||
(
|
||||
DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("a", "d"), ("b", "e"), ("c", "f")]
|
||||
),
|
||||
),
|
||||
{
|
||||
"column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"},
|
||||
"index_dtypes": "float32",
|
||||
},
|
||||
np.rec.array(
|
||||
[(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}f4"),
|
||||
("('a', 'd')", f"{tm.ENDIAN}U1"),
|
||||
("('b', 'e')", f"{tm.ENDIAN}i8"),
|
||||
("('c', 'f')", f"{tm.ENDIAN}f4"),
|
||||
],
|
||||
),
|
||||
),
|
||||
# MultiIndex in both the columns and index.
|
||||
(
|
||||
DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
|
||||
),
|
||||
index=MultiIndex.from_tuples(
|
||||
[("d", -4), ("d", -5), ("f", -6)], names=list("cd")
|
||||
),
|
||||
),
|
||||
{
|
||||
"column_dtypes": "float64",
|
||||
"index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"},
|
||||
},
|
||||
np.rec.array(
|
||||
[
|
||||
("d", -4, 1.0, 2.0, 3.0),
|
||||
("d", -5, 4.0, 5.0, 6.0),
|
||||
("f", -6, 7, 8, 9.0),
|
||||
],
|
||||
dtype=[
|
||||
("c", f"{tm.ENDIAN}U2"),
|
||||
("d", "i1"),
|
||||
("('a', 'd')", f"{tm.ENDIAN}f8"),
|
||||
("('b', 'e')", f"{tm.ENDIAN}f8"),
|
||||
("('c', 'f')", f"{tm.ENDIAN}f8"),
|
||||
],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_records_dtype_mi(self, df, kwargs, expected):
|
||||
# see GH#18146
|
||||
result = df.to_records(**kwargs)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_dict_like(self):
|
||||
# see GH#18146
|
||||
class DictLike:
|
||||
def __init__(self, **kwargs) -> None:
|
||||
self.d = kwargs.copy()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.d.__getitem__(key)
|
||||
|
||||
def __contains__(self, key) -> bool:
|
||||
return key in self.d
|
||||
|
||||
def keys(self):
|
||||
return self.d.keys()
|
||||
|
||||
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
|
||||
|
||||
dtype_mappings = {
|
||||
"column_dtypes": DictLike(A=np.int8, B=np.float32),
|
||||
"index_dtypes": f"{tm.ENDIAN}U2",
|
||||
}
|
||||
|
||||
result = df.to_records(**dtype_mappings)
|
||||
expected = np.rec.array(
|
||||
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[
|
||||
("index", f"{tm.ENDIAN}U2"),
|
||||
("A", "i1"),
|
||||
("B", f"{tm.ENDIAN}f4"),
|
||||
("C", "O"),
|
||||
],
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
|
||||
def test_to_records_datetimeindex_with_tz(self, tz):
|
||||
# GH#13937
|
||||
dr = date_range("2016-01-01", periods=10, freq="s", tz=tz)
|
||||
|
||||
df = DataFrame({"datetime": dr}, index=dr)
|
||||
|
||||
expected = df.to_records()
|
||||
result = df.tz_convert("UTC").to_records()
|
||||
|
||||
# both converted to UTC, so they are equal
|
||||
tm.assert_numpy_array_equal(result, expected)
|
@ -0,0 +1,154 @@
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
Timedelta,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def _get_with_delta(delta, freq="YE-DEC"):
|
||||
return date_range(
|
||||
to_datetime("1/1/2001") + delta,
|
||||
to_datetime("12/31/2009") + delta,
|
||||
freq=freq,
|
||||
)
|
||||
|
||||
|
||||
class TestToTimestamp:
|
||||
def test_to_timestamp(self, frame_or_series):
|
||||
K = 5
|
||||
index = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), K)),
|
||||
index=index,
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
)
|
||||
obj["mix"] = "a"
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC")
|
||||
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
|
||||
result = obj.to_timestamp("D", "end")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, obj.values)
|
||||
if frame_or_series is Series:
|
||||
assert result.name == "A"
|
||||
|
||||
exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN")
|
||||
result = obj.to_timestamp("D", "start")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = obj.to_timestamp(how="start")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = obj.to_timestamp("H", "end")
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = obj.to_timestamp("T", "end")
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = obj.to_timestamp("S", "end")
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
def test_to_timestamp_columns(self):
|
||||
K = 5
|
||||
index = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), K)),
|
||||
index=index,
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
)
|
||||
df["mix"] = "a"
|
||||
|
||||
# columns
|
||||
df = df.T
|
||||
|
||||
exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC")
|
||||
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
|
||||
result = df.to_timestamp("D", "end", axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, df.values)
|
||||
|
||||
exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN")
|
||||
result = df.to_timestamp("D", "start", axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = df.to_timestamp("H", "end", axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = df.to_timestamp("min", "end", axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
result = df.to_timestamp("S", "end", axis=1)
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
result1 = df.to_timestamp("5min", axis=1)
|
||||
result2 = df.to_timestamp("min", axis=1)
|
||||
expected = date_range("2001-01-01", "2009-01-01", freq="YS")
|
||||
assert isinstance(result1.columns, DatetimeIndex)
|
||||
assert isinstance(result2.columns, DatetimeIndex)
|
||||
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
|
||||
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
|
||||
# PeriodIndex.to_timestamp always use 'infer'
|
||||
assert result1.columns.freqstr == "YS-JAN"
|
||||
assert result2.columns.freqstr == "YS-JAN"
|
||||
|
||||
def test_to_timestamp_invalid_axis(self):
|
||||
index = period_range(freq="Y", start="1/1/2001", end="12/1/2009")
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
|
||||
# invalid axis
|
||||
with pytest.raises(ValueError, match="axis"):
|
||||
obj.to_timestamp(axis=2)
|
||||
|
||||
def test_to_timestamp_hourly(self, frame_or_series):
|
||||
index = period_range(freq="h", start="1/1/2001", end="1/2/2001")
|
||||
obj = Series(1, index=index, name="foo")
|
||||
if frame_or_series is not Series:
|
||||
obj = obj.to_frame()
|
||||
|
||||
exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h")
|
||||
result = obj.to_timestamp(how="end")
|
||||
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
if frame_or_series is Series:
|
||||
assert result.name == "foo"
|
||||
|
||||
def test_to_timestamp_raises(self, index, frame_or_series):
|
||||
# GH#33327
|
||||
obj = frame_or_series(index=index, dtype=object)
|
||||
|
||||
if not isinstance(index, PeriodIndex):
|
||||
msg = f"unsupported Type {type(index).__name__}"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
obj.to_timestamp()
|
@ -0,0 +1,209 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
IntervalIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
bdate_range,
|
||||
date_range,
|
||||
timedelta_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestTranspose:
|
||||
def test_transpose_td64_intervals(self):
|
||||
# GH#44917
|
||||
tdi = timedelta_range("0 Days", "3 Days")
|
||||
ii = IntervalIndex.from_breaks(tdi)
|
||||
ii = ii.insert(-1, np.nan)
|
||||
df = DataFrame(ii)
|
||||
|
||||
result = df.T
|
||||
expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_transpose_empty_preserves_datetimeindex(self):
|
||||
# GH#41382
|
||||
dti = DatetimeIndex([], dtype="M8[ns]")
|
||||
df = DataFrame(index=dti)
|
||||
|
||||
expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None)
|
||||
|
||||
result1 = df.T.sum().index
|
||||
result2 = df.sum(axis=1).index
|
||||
|
||||
tm.assert_index_equal(result1, expected)
|
||||
tm.assert_index_equal(result2, expected)
|
||||
|
||||
def test_transpose_tzaware_1col_single_tz(self):
|
||||
# GH#26825
|
||||
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
|
||||
|
||||
df = DataFrame(dti)
|
||||
assert (df.dtypes == dti.dtype).all()
|
||||
res = df.T
|
||||
assert (res.dtypes == dti.dtype).all()
|
||||
|
||||
def test_transpose_tzaware_2col_single_tz(self):
|
||||
# GH#26825
|
||||
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
|
||||
|
||||
df3 = DataFrame({"A": dti, "B": dti})
|
||||
assert (df3.dtypes == dti.dtype).all()
|
||||
res3 = df3.T
|
||||
assert (res3.dtypes == dti.dtype).all()
|
||||
|
||||
def test_transpose_tzaware_2col_mixed_tz(self):
|
||||
# GH#26825
|
||||
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
|
||||
dti2 = dti.tz_convert("US/Pacific")
|
||||
|
||||
df4 = DataFrame({"A": dti, "B": dti2})
|
||||
assert (df4.dtypes == [dti.dtype, dti2.dtype]).all()
|
||||
assert (df4.T.dtypes == object).all()
|
||||
tm.assert_frame_equal(df4.T.T, df4.astype(object))
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "America/New_York"])
|
||||
def test_transpose_preserves_dtindex_equality_with_dst(self, tz):
|
||||
# GH#19970
|
||||
idx = date_range("20161101", "20161130", freq="4h", tz=tz)
|
||||
df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx)
|
||||
result = df.T == df.T
|
||||
expected = DataFrame(True, index=list("ab"), columns=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_transpose_object_to_tzaware_mixed_tz(self):
|
||||
# GH#26825
|
||||
dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
|
||||
dti2 = dti.tz_convert("US/Pacific")
|
||||
|
||||
# mixed all-tzaware dtypes
|
||||
df2 = DataFrame([dti, dti2])
|
||||
assert (df2.dtypes == object).all()
|
||||
res2 = df2.T
|
||||
assert (res2.dtypes == object).all()
|
||||
|
||||
def test_transpose_uint64(self):
|
||||
df = DataFrame(
|
||||
{"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]},
|
||||
dtype=np.uint64,
|
||||
)
|
||||
result = df.T
|
||||
expected = DataFrame(df.values.T)
|
||||
expected.index = ["A", "B"]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_transpose_float(self, float_frame):
|
||||
frame = float_frame
|
||||
dft = frame.T
|
||||
for idx, series in dft.items():
|
||||
for col, value in series.items():
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][idx])
|
||||
else:
|
||||
assert value == frame[col][idx]
|
||||
|
||||
def test_transpose_mixed(self):
|
||||
# mixed type
|
||||
mixed = DataFrame(
|
||||
{
|
||||
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
|
||||
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
|
||||
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
|
||||
"D": bdate_range("1/1/2009", periods=5),
|
||||
},
|
||||
index=Index(["a", "b", "c", "d", "e"], dtype=object),
|
||||
)
|
||||
|
||||
mixed_T = mixed.T
|
||||
for col, s in mixed_T.items():
|
||||
assert s.dtype == np.object_
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_transpose_get_view(self, float_frame, using_copy_on_write):
|
||||
dft = float_frame.T
|
||||
dft.iloc[:, 5:10] = 5
|
||||
|
||||
if using_copy_on_write:
|
||||
assert (float_frame.values[5:10] != 5).all()
|
||||
else:
|
||||
assert (float_frame.values[5:10] == 5).all()
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
|
||||
dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
|
||||
arr = dti._data.reshape(3, 2)
|
||||
df = DataFrame(arr)
|
||||
assert df._mgr.nblocks == 1
|
||||
|
||||
result = df.T
|
||||
assert result._mgr.nblocks == 1
|
||||
|
||||
rtrip = result._mgr.blocks[0].values
|
||||
if using_copy_on_write:
|
||||
assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
|
||||
else:
|
||||
assert np.shares_memory(arr._ndarray, rtrip._ndarray)
|
||||
|
||||
def test_transpose_not_inferring_dt(self):
|
||||
# GH#51546
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
|
||||
},
|
||||
dtype=object,
|
||||
)
|
||||
result = df.T
|
||||
expected = DataFrame(
|
||||
[[Timestamp("2019-12-31"), Timestamp("2019-12-31")]],
|
||||
columns=[0, 1],
|
||||
index=["a"],
|
||||
dtype=object,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_transpose_not_inferring_dt_mixed_blocks(self):
|
||||
# GH#51546
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": Series(
|
||||
[Timestamp("2019-12-31"), Timestamp("2019-12-31")], dtype=object
|
||||
),
|
||||
"b": [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
|
||||
}
|
||||
)
|
||||
result = df.T
|
||||
expected = DataFrame(
|
||||
[
|
||||
[Timestamp("2019-12-31"), Timestamp("2019-12-31")],
|
||||
[Timestamp("2019-12-31"), Timestamp("2019-12-31")],
|
||||
],
|
||||
columns=[0, 1],
|
||||
index=["a", "b"],
|
||||
dtype=object,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype1", ["Int64", "Float64"])
|
||||
@pytest.mark.parametrize("dtype2", ["Int64", "Float64"])
|
||||
def test_transpose(self, dtype1, dtype2):
|
||||
# GH#57315 - transpose should have F contiguous blocks
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": pd.array([1, 1, 2], dtype=dtype1),
|
||||
"b": pd.array([3, 4, 5], dtype=dtype2),
|
||||
}
|
||||
)
|
||||
result = df.T
|
||||
for blk in result._mgr.blocks:
|
||||
# When dtypes are unequal, we get NumPy object array
|
||||
data = blk.values._data if dtype1 == dtype2 else blk.values
|
||||
assert data.flags["F_CONTIGUOUS"]
|
@ -0,0 +1,154 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameTruncate:
|
||||
def test_truncate(self, datetime_frame, frame_or_series):
|
||||
ts = datetime_frame[::3]
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
start, end = datetime_frame.index[3], datetime_frame.index[6]
|
||||
|
||||
start_missing = datetime_frame.index[2]
|
||||
end_missing = datetime_frame.index[7]
|
||||
|
||||
# neither specified
|
||||
truncated = ts.truncate()
|
||||
tm.assert_equal(truncated, ts)
|
||||
|
||||
# both specified
|
||||
expected = ts[1:3]
|
||||
|
||||
truncated = ts.truncate(start, end)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(start_missing, end_missing)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
# start specified
|
||||
expected = ts[1:]
|
||||
|
||||
truncated = ts.truncate(before=start)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(before=start_missing)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
# end specified
|
||||
expected = ts[:3]
|
||||
|
||||
truncated = ts.truncate(after=end)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(after=end_missing)
|
||||
tm.assert_equal(truncated, expected)
|
||||
|
||||
# corner case, empty series/frame returned
|
||||
truncated = ts.truncate(after=ts.index[0] - ts.index.freq)
|
||||
assert len(truncated) == 0
|
||||
|
||||
truncated = ts.truncate(before=ts.index[-1] + ts.index.freq)
|
||||
assert len(truncated) == 0
|
||||
|
||||
msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.truncate(
|
||||
before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq
|
||||
)
|
||||
|
||||
def test_truncate_nonsortedindex(self, frame_or_series):
|
||||
# GH#17935
|
||||
|
||||
obj = DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.truncate(before=3, after=9)
|
||||
|
||||
def test_sort_values_nonsortedindex(self):
|
||||
rng = date_range("2011-01-01", "2012-01-01", freq="W")
|
||||
ts = DataFrame(
|
||||
{
|
||||
"A": np.random.default_rng(2).standard_normal(len(rng)),
|
||||
"B": np.random.default_rng(2).standard_normal(len(rng)),
|
||||
},
|
||||
index=rng,
|
||||
)
|
||||
|
||||
decreasing = ts.sort_values("A", ascending=False)
|
||||
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
decreasing.truncate(before="2011-11", after="2011-12")
|
||||
|
||||
def test_truncate_nonsortedindex_axis1(self):
|
||||
# GH#17935
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
3: np.random.default_rng(2).standard_normal(5),
|
||||
20: np.random.default_rng(2).standard_normal(5),
|
||||
2: np.random.default_rng(2).standard_normal(5),
|
||||
0: np.random.default_rng(2).standard_normal(5),
|
||||
},
|
||||
columns=[3, 20, 2, 0],
|
||||
)
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.truncate(before=2, after=20, axis=1)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"before, after, indices",
|
||||
[(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])],
|
||||
)
|
||||
@pytest.mark.parametrize("dtyp", [*tm.ALL_REAL_NUMPY_DTYPES, "datetime64[ns]"])
|
||||
def test_truncate_decreasing_index(
|
||||
self, before, after, indices, dtyp, frame_or_series
|
||||
):
|
||||
# https://github.com/pandas-dev/pandas/issues/33756
|
||||
idx = Index([3, 2, 1, 0], dtype=dtyp)
|
||||
if isinstance(idx, DatetimeIndex):
|
||||
before = pd.Timestamp(before) if before is not None else None
|
||||
after = pd.Timestamp(after) if after is not None else None
|
||||
indices = [pd.Timestamp(i) for i in indices]
|
||||
values = frame_or_series(range(len(idx)), index=idx)
|
||||
result = values.truncate(before=before, after=after)
|
||||
expected = values.loc[indices]
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_truncate_multiindex(self, frame_or_series):
|
||||
# GH 34564
|
||||
mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"])
|
||||
s1 = DataFrame(range(mi.shape[0]), index=mi, columns=["col"])
|
||||
s1 = tm.get_obj(s1, frame_or_series)
|
||||
|
||||
result = s1.truncate(before=2, after=3)
|
||||
|
||||
df = DataFrame.from_dict(
|
||||
{"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
|
||||
)
|
||||
expected = df.set_index(["L1", "L2"])
|
||||
expected = tm.get_obj(expected, frame_or_series)
|
||||
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_truncate_index_only_one_unique_value(self, frame_or_series):
|
||||
# GH 42365
|
||||
obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5)
|
||||
if frame_or_series is DataFrame:
|
||||
obj = obj.to_frame(name="a")
|
||||
|
||||
truncated = obj.truncate("2021-06-28", "2021-07-01")
|
||||
|
||||
tm.assert_equal(truncated, obj)
|
@ -0,0 +1,131 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestTZConvert:
|
||||
def test_tz_convert(self, frame_or_series):
|
||||
rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
|
||||
|
||||
obj = DataFrame({"a": 1}, index=rng)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
result = obj.tz_convert("Europe/Berlin")
|
||||
expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
|
||||
expected = tm.get_obj(expected, frame_or_series)
|
||||
|
||||
assert result.index.tz.zone == "Europe/Berlin"
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_tz_convert_axis1(self):
|
||||
rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
|
||||
|
||||
obj = DataFrame({"a": 1}, index=rng)
|
||||
|
||||
obj = obj.T
|
||||
result = obj.tz_convert("Europe/Berlin", axis=1)
|
||||
assert result.columns.tz.zone == "Europe/Berlin"
|
||||
|
||||
expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
|
||||
|
||||
tm.assert_equal(result, expected.T)
|
||||
|
||||
def test_tz_convert_naive(self, frame_or_series):
|
||||
# can't convert tz-naive
|
||||
rng = date_range("1/1/2011", periods=200, freq="D")
|
||||
ts = Series(1, index=rng)
|
||||
ts = frame_or_series(ts)
|
||||
|
||||
with pytest.raises(TypeError, match="Cannot convert tz-naive"):
|
||||
ts.tz_convert("US/Eastern")
|
||||
|
||||
@pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"])
|
||||
def test_tz_convert_and_localize(self, fn):
|
||||
l0 = date_range("20140701", periods=5, freq="D")
|
||||
l1 = date_range("20140701", periods=5, freq="D")
|
||||
|
||||
int_idx = Index(range(5))
|
||||
|
||||
if fn == "tz_convert":
|
||||
l0 = l0.tz_localize("UTC")
|
||||
l1 = l1.tz_localize("UTC")
|
||||
|
||||
for idx in [l0, l1]:
|
||||
l0_expected = getattr(idx, fn)("US/Pacific")
|
||||
l1_expected = getattr(idx, fn)("US/Pacific")
|
||||
|
||||
df1 = DataFrame(np.ones(5), index=l0)
|
||||
df1 = getattr(df1, fn)("US/Pacific")
|
||||
tm.assert_index_equal(df1.index, l0_expected)
|
||||
|
||||
# MultiIndex
|
||||
# GH7846
|
||||
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
|
||||
|
||||
# freq is not preserved in MultiIndex construction
|
||||
l1_expected = l1_expected._with_freq(None)
|
||||
l0_expected = l0_expected._with_freq(None)
|
||||
l1 = l1._with_freq(None)
|
||||
l0 = l0._with_freq(None)
|
||||
|
||||
df3 = getattr(df2, fn)("US/Pacific", level=0)
|
||||
assert not df3.index.levels[0].equals(l0)
|
||||
tm.assert_index_equal(df3.index.levels[0], l0_expected)
|
||||
tm.assert_index_equal(df3.index.levels[1], l1)
|
||||
assert not df3.index.levels[1].equals(l1_expected)
|
||||
|
||||
df3 = getattr(df2, fn)("US/Pacific", level=1)
|
||||
tm.assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
tm.assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
|
||||
|
||||
# TODO: untested
|
||||
getattr(df4, fn)("US/Pacific", level=1)
|
||||
|
||||
tm.assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
tm.assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
# Bad Inputs
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with pytest.raises(TypeError, match="DatetimeIndex"):
|
||||
df = DataFrame(index=int_idx)
|
||||
getattr(df, fn)("US/Pacific")
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with pytest.raises(TypeError, match="DatetimeIndex"):
|
||||
df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
|
||||
getattr(df, fn)("US/Pacific", level=0)
|
||||
|
||||
# Invalid level
|
||||
with pytest.raises(ValueError, match="not valid"):
|
||||
df = DataFrame(index=l0)
|
||||
getattr(df, fn)("US/Pacific", level=1)
|
||||
|
||||
@pytest.mark.parametrize("copy", [True, False])
|
||||
def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series):
|
||||
# GH#6326
|
||||
obj = frame_or_series(
|
||||
np.arange(0, 5),
|
||||
index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"),
|
||||
)
|
||||
orig = obj.copy()
|
||||
result = obj.tz_convert("UTC", copy=copy)
|
||||
expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC"))
|
||||
tm.assert_equal(result, expected)
|
||||
tm.assert_equal(obj, orig)
|
||||
assert result.index is not obj.index
|
||||
assert result is not obj
|
@ -0,0 +1,68 @@
|
||||
from datetime import timezone
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestTZLocalize:
|
||||
# See also:
|
||||
# test_tz_convert_and_localize in test_tz_convert
|
||||
|
||||
def test_tz_localize(self, frame_or_series):
|
||||
rng = date_range("1/1/2011", periods=100, freq="h")
|
||||
|
||||
obj = DataFrame({"a": 1}, index=rng)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
result = obj.tz_localize("utc")
|
||||
expected = DataFrame({"a": 1}, rng.tz_localize("UTC"))
|
||||
expected = tm.get_obj(expected, frame_or_series)
|
||||
|
||||
assert result.index.tz is timezone.utc
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_tz_localize_axis1(self):
|
||||
rng = date_range("1/1/2011", periods=100, freq="h")
|
||||
|
||||
df = DataFrame({"a": 1}, index=rng)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_localize("utc", axis=1)
|
||||
assert result.columns.tz is timezone.utc
|
||||
|
||||
expected = DataFrame({"a": 1}, rng.tz_localize("UTC"))
|
||||
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_tz_localize_naive(self, frame_or_series):
|
||||
# Can't localize if already tz-aware
|
||||
rng = date_range("1/1/2011", periods=100, freq="h", tz="utc")
|
||||
ts = Series(1, index=rng)
|
||||
ts = frame_or_series(ts)
|
||||
|
||||
with pytest.raises(TypeError, match="Already tz-aware"):
|
||||
ts.tz_localize("US/Eastern")
|
||||
|
||||
@pytest.mark.parametrize("copy", [True, False])
|
||||
def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series):
|
||||
# GH#6326
|
||||
obj = frame_or_series(
|
||||
np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None)
|
||||
)
|
||||
orig = obj.copy()
|
||||
result = obj.tz_localize("UTC", copy=copy)
|
||||
expected = frame_or_series(
|
||||
np.arange(0, 5),
|
||||
index=date_range("20131027", periods=5, freq="1h", tz="UTC"),
|
||||
)
|
||||
tm.assert_equal(result, expected)
|
||||
tm.assert_equal(obj, orig)
|
||||
assert result.index is not obj.index
|
||||
assert result is not obj
|
@ -0,0 +1,204 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameUpdate:
|
||||
def test_update_nan(self):
|
||||
# #15593 #15617
|
||||
# test 1
|
||||
df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
|
||||
df2 = DataFrame({"A": [None, 2, 3]})
|
||||
expected = df1.copy()
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
|
||||
# test 2
|
||||
df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
|
||||
df2 = DataFrame({"A": [None, 2, 3]})
|
||||
expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
|
||||
def test_update(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_dtypes(self):
|
||||
# gh 3016
|
||||
df = DataFrame(
|
||||
[[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]],
|
||||
columns=["A", "B", "int", "bool1", "bool2"],
|
||||
)
|
||||
|
||||
other = DataFrame(
|
||||
[[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"]
|
||||
)
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame(
|
||||
[[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]],
|
||||
columns=["A", "B", "int", "bool1", "bool2"],
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_nooverwrite(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, overwrite=False)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_filtered(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, filter_func=lambda x: x > 2)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"bad_kwarg, exception, msg",
|
||||
[
|
||||
# errors must be 'ignore' or 'raise'
|
||||
({"errors": "something"}, ValueError, "The parameter errors must.*"),
|
||||
({"join": "inner"}, NotImplementedError, "Only left join is supported"),
|
||||
],
|
||||
)
|
||||
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
|
||||
df = DataFrame([[1.5, 1, 3.0]])
|
||||
with pytest.raises(exception, match=msg):
|
||||
df.update(df, **bad_kwarg)
|
||||
|
||||
def test_update_raise_on_overlap(self):
|
||||
df = DataFrame(
|
||||
[[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])
|
||||
with pytest.raises(ValueError, match="Data overlaps"):
|
||||
df.update(other, errors="raise")
|
||||
|
||||
def test_update_from_non_df(self):
|
||||
d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
|
||||
df = DataFrame(d)
|
||||
|
||||
d["a"] = Series([5, 6, 7, 8])
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
|
||||
df = DataFrame(d)
|
||||
|
||||
d["a"] = [5, 6, 7, 8]
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_datetime_tz(self):
|
||||
# GH 25807
|
||||
result = DataFrame([pd.Timestamp("2019", tz="UTC")])
|
||||
with tm.assert_produces_warning(None):
|
||||
result.update(result)
|
||||
expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write):
|
||||
# https://github.com/pandas-dev/pandas/issues/56227
|
||||
result = DataFrame([pd.Timestamp("2019", tz="UTC")])
|
||||
orig = result.copy()
|
||||
view = result[:]
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning if warn_copy_on_write else None, match="Setting a value"
|
||||
):
|
||||
result.update(result + pd.Timedelta(days=1))
|
||||
expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
if not using_copy_on_write:
|
||||
tm.assert_frame_equal(view, expected)
|
||||
else:
|
||||
tm.assert_frame_equal(view, orig)
|
||||
|
||||
def test_update_with_different_dtype(self, using_copy_on_write):
|
||||
# GH#3217
|
||||
df = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
|
||||
df["c"] = np.nan
|
||||
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
|
||||
df.update({"c": Series(["foo"], index=[0])})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 3],
|
||||
"b": [np.nan, 2],
|
||||
"c": Series(["foo", np.nan], dtype="object"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_update_modify_view(
|
||||
self, using_copy_on_write, warn_copy_on_write, using_infer_string
|
||||
):
|
||||
# GH#47188
|
||||
df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
|
||||
df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]})
|
||||
df2_orig = df2.copy()
|
||||
result_view = df2[:]
|
||||
# TODO(CoW-warn) better warning message
|
||||
with tm.assert_cow_warning(warn_copy_on_write):
|
||||
df2.update(df)
|
||||
expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]})
|
||||
tm.assert_frame_equal(df2, expected)
|
||||
if using_copy_on_write or using_infer_string:
|
||||
tm.assert_frame_equal(result_view, df2_orig)
|
||||
else:
|
||||
tm.assert_frame_equal(result_view, expected)
|
||||
|
||||
def test_update_dt_column_with_NaT_create_column(self):
|
||||
# GH#16713
|
||||
df = DataFrame({"A": [1, None], "B": [pd.NaT, pd.to_datetime("2016-01-01")]})
|
||||
df2 = DataFrame({"A": [2, 3]})
|
||||
df.update(df2, overwrite=False)
|
||||
expected = DataFrame(
|
||||
{"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
@ -0,0 +1,205 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_data_frame_value_counts_unsorted():
    """``sort=False`` lists the distinct rows without ordering them by count."""
    animals = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = animals.value_counts(sort=False)

    expected_index = pd.MultiIndex.from_arrays(
        [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 2, 1], index=expected_index, name="count")

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_ascending():
    """Check ``DataFrame.value_counts(ascending=True)`` on a two-column frame.

    The expected counts run from smallest to largest: the two singleton
    rows come first and the duplicated (4, 0) row comes last.
    """
    df = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = df.value_counts(ascending=True)
    expected = pd.Series(
        data=[1, 1, 2],
        index=pd.MultiIndex.from_arrays(
            [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
        ),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_default():
    """Check ``DataFrame.value_counts()`` with default arguments.

    The expected result lists the duplicated (4, 0) row first with count 2,
    followed by the two singleton rows, in a Series named ``"count"``.
    """
    df = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = df.value_counts()
    expected = pd.Series(
        data=[2, 1, 1],
        index=pd.MultiIndex.from_arrays(
            [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
        ),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_normalize():
    """Check ``DataFrame.value_counts(normalize=True)``.

    The expected result holds relative frequencies (0.5, 0.25, 0.25) rather
    than raw counts, and the Series is named ``"proportion"``.
    """
    df = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )

    result = df.value_counts(normalize=True)
    expected = pd.Series(
        data=[0.5, 0.25, 0.25],
        index=pd.MultiIndex.from_arrays(
            [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
        ),
        name="proportion",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_single_col_default():
    """Check ``DataFrame.value_counts()`` on a frame with a single column.

    The expected index is still a MultiIndex, here with one level named
    after the only column.
    """
    df = pd.DataFrame({"num_legs": [2, 4, 4, 6]})

    result = df.value_counts()
    expected = pd.Series(
        data=[2, 1, 1],
        index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_empty():
    """Check ``DataFrame.value_counts()`` on a frame with no columns.

    The expected result is an empty int64 Series named ``"count"`` whose
    index is built from an empty ``intp`` array.
    """
    df_no_cols = pd.DataFrame()

    result = df_no_cols.value_counts()
    expected = pd.Series(
        [], dtype=np.int64, name="count", index=np.array([], dtype=np.intp)
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_empty_normalize():
    """Check ``DataFrame.value_counts(normalize=True)`` on a frame with no columns.

    The expected result is an empty float64 Series named ``"proportion"``
    whose index is built from an empty ``intp`` array.
    """
    df_no_cols = pd.DataFrame()

    result = df_no_cols.value_counts(normalize=True)
    expected = pd.Series(
        [], dtype=np.float64, name="proportion", index=np.array([], dtype=np.intp)
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_dropna_true(nulls_fixture):
    """Check that ``DataFrame.value_counts()`` drops rows containing a null.

    ``nulls_fixture`` is placed in two ``middle_name`` cells; the expected
    result only contains the two rows without a missing value.
    """
    # GH 41334
    df = pd.DataFrame(
        {
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )
    result = df.value_counts()
    expected = pd.Series(
        data=[1, 1],
        index=pd.MultiIndex.from_arrays(
            [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
        ),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_frame_value_counts_dropna_false(nulls_fixture):
    """Check ``DataFrame.value_counts(dropna=False)`` keeps rows with nulls.

    ``nulls_fixture`` is placed in two ``middle_name`` cells; the expected
    result has one entry per distinct row, with the missing middle names
    represented by ``np.nan`` in the second index level.
    """
    # GH 41334
    df = pd.DataFrame(
        {
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )

    result = df.value_counts(dropna=False)
    expected = pd.Series(
        data=[1, 1, 1, 1],
        index=pd.MultiIndex(
            levels=[
                pd.Index(["Anne", "Beth", "John"]),
                pd.Index(["Louise", "Smith", np.nan]),
            ],
            # Rows: (Anne, NaN), (Beth, Louise), (John, Smith), (John, NaN).
            codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
            names=["first_name", "middle_name"],
        ),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1]))
def test_data_frame_value_counts_subset(nulls_fixture, columns):
    """Check ``DataFrame.value_counts`` restricted to a single column label.

    Counting only ``columns[0]`` must ignore the nulls placed in
    ``columns[1]`` and return a Series with a flat ``Index`` named after
    that column. Parametrized over string and integer column labels.
    """
    # GH 50829
    df = pd.DataFrame(
        {
            columns[0]: ["John", "Anne", "John", "Beth"],
            columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )
    result = df.value_counts(columns[0])
    expected = pd.Series(
        data=[2, 1, 1],
        index=pd.Index(["John", "Anne", "Beth"], name=columns[0]),
        name="count",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_value_counts_categorical_future_warning():
    """Check ``DataFrame.value_counts()`` on a single categorical column.

    Each of the three categories is expected to have a count of 1, indexed
    by a one-level MultiIndex built from a categorical ``Index`` named
    ``"a"``.
    """
    # GH#54775
    # NOTE(review): despite the test name, no warning is asserted here.
    df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category")
    result = df.value_counts()
    expected = pd.Series(
        1,
        index=pd.MultiIndex.from_arrays(
            [pd.Index([1, 2, 3], name="a", dtype="category")]
        ),
        name="count",
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_value_counts_with_missing_category():
    """Check that an unobserved category is reported with a count of 0.

    Category ``3`` is declared but never appears in the data; the expected
    result lists the observed categories first and ``3`` last with count 0.
    """
    # GH-54836
    df = pd.DataFrame({"a": pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])})
    result = df.value_counts()
    expected = pd.Series(
        [1, 1, 1, 0],
        index=pd.MultiIndex.from_arrays(
            [pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")]
        ),
        name="count",
    )
    tm.assert_series_equal(result, expected)
|
@ -0,0 +1,280 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameValues:
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_values(self, float_frame, using_copy_on_write):
|
||||
if using_copy_on_write:
|
||||
with pytest.raises(ValueError, match="read-only"):
|
||||
float_frame.values[:, 0] = 5.0
|
||||
assert (float_frame.values[:, 0] != 5).all()
|
||||
else:
|
||||
float_frame.values[:, 0] = 5.0
|
||||
assert (float_frame.values[:, 0] == 5).all()
|
||||
|
||||
def test_more_values(self, float_string_frame):
|
||||
values = float_string_frame.values
|
||||
assert values.shape[1] == len(float_string_frame.columns)
|
||||
|
||||
def test_values_mixed_dtypes(self, float_frame, float_string_frame):
|
||||
frame = float_frame
|
||||
arr = frame.values
|
||||
|
||||
frame_cols = frame.columns
|
||||
for i, row in enumerate(arr):
|
||||
for j, value in enumerate(row):
|
||||
col = frame_cols[j]
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col].iloc[i])
|
||||
else:
|
||||
assert value == frame[col].iloc[i]
|
||||
|
||||
# mixed type
|
||||
arr = float_string_frame[["foo", "A"]].values
|
||||
assert arr[0, 0] == "bar"
|
||||
|
||||
df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
|
||||
arr = df.values
|
||||
assert arr[0, 0] == 1j
|
||||
|
||||
def test_values_duplicates(self):
|
||||
df = DataFrame(
|
||||
[[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
|
||||
)
|
||||
|
||||
result = df.values
|
||||
expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
|
||||
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_values_with_duplicate_columns(self):
|
||||
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
|
||||
result = df.values
|
||||
expected = np.array([[1, 2.5], [3, 4.5]])
|
||||
assert (result == expected).all().all()
|
||||
|
||||
@pytest.mark.parametrize("constructor", [date_range, period_range])
|
||||
def test_values_casts_datetimelike_to_object(self, constructor):
|
||||
series = Series(constructor("2000-01-01", periods=10, freq="D"))
|
||||
|
||||
expected = series.astype("object")
|
||||
|
||||
df = DataFrame(
|
||||
{"a": series, "b": np.random.default_rng(2).standard_normal(len(series))}
|
||||
)
|
||||
|
||||
result = df.values.squeeze()
|
||||
assert (result[:, 0] == expected.values).all()
|
||||
|
||||
df = DataFrame({"a": series, "b": ["foo"] * len(series)})
|
||||
|
||||
result = df.values.squeeze()
|
||||
assert (result[:, 0] == expected.values).all()
|
||||
|
||||
def test_frame_values_with_tz(self):
|
||||
tz = "US/Central"
|
||||
df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
|
||||
result = df.values
|
||||
expected = np.array(
|
||||
[
|
||||
[Timestamp("2000-01-01", tz=tz)],
|
||||
[Timestamp("2000-01-02", tz=tz)],
|
||||
[Timestamp("2000-01-03", tz=tz)],
|
||||
[Timestamp("2000-01-04", tz=tz)],
|
||||
]
|
||||
)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# two columns, homogeneous
|
||||
|
||||
df["B"] = df["A"]
|
||||
result = df.values
|
||||
expected = np.concatenate([expected, expected], axis=1)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# three columns, heterogeneous
|
||||
est = "US/Eastern"
|
||||
df["C"] = df["A"].dt.tz_convert(est)
|
||||
|
||||
new = np.array(
|
||||
[
|
||||
[Timestamp("2000-01-01T01:00:00", tz=est)],
|
||||
[Timestamp("2000-01-02T01:00:00", tz=est)],
|
||||
[Timestamp("2000-01-03T01:00:00", tz=est)],
|
||||
[Timestamp("2000-01-04T01:00:00", tz=est)],
|
||||
]
|
||||
)
|
||||
expected = np.concatenate([expected, new], axis=1)
|
||||
result = df.values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_interleave_with_tzaware(self, timezone_frame):
|
||||
# interleave with object
|
||||
result = timezone_frame.assign(D="foo").values
|
||||
expected = np.array(
|
||||
[
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00"),
|
||||
Timestamp("2013-01-02 00:00:00"),
|
||||
Timestamp("2013-01-03 00:00:00"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
|
||||
],
|
||||
["foo", "foo", "foo"],
|
||||
],
|
||||
dtype=object,
|
||||
).T
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# interleave with only datetime64[ns]
|
||||
result = timezone_frame.values
|
||||
expected = np.array(
|
||||
[
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00"),
|
||||
Timestamp("2013-01-02 00:00:00"),
|
||||
Timestamp("2013-01-03 00:00:00"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
|
||||
],
|
||||
],
|
||||
dtype=object,
|
||||
).T
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_values_interleave_non_unique_cols(self):
|
||||
df = DataFrame(
|
||||
[[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
|
||||
columns=["x", "x"],
|
||||
index=[1, 2],
|
||||
)
|
||||
|
||||
df_unique = df.copy()
|
||||
df_unique.columns = ["x", "y"]
|
||||
assert df_unique.values.shape == df.values.shape
|
||||
tm.assert_numpy_array_equal(df_unique.values[0], df.values[0])
|
||||
tm.assert_numpy_array_equal(df_unique.values[1], df.values[1])
|
||||
|
||||
def test_values_numeric_cols(self, float_frame):
|
||||
float_frame["foo"] = "bar"
|
||||
|
||||
values = float_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
|
||||
# mixed lcd
|
||||
values = mixed_float_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_float_frame[["A", "B", "C"]].values
|
||||
assert values.dtype == np.float32
|
||||
|
||||
values = mixed_float_frame[["C"]].values
|
||||
assert values.dtype == np.float16
|
||||
|
||||
# GH#10364
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_int_frame[["A", "D"]].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[["A", "B", "C"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
# as B and C are both unsigned, no forcing to float is needed
|
||||
values = mixed_int_frame[["B", "C"]].values
|
||||
assert values.dtype == np.uint64
|
||||
|
||||
values = mixed_int_frame[["A", "C"]].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[["C", "D"]].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
values = mixed_int_frame[["A"]].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[["C"]].values
|
||||
assert values.dtype == np.uint8
|
||||
|
||||
|
||||
class TestPrivateValues:
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_private_values_dt64tz(self, using_copy_on_write):
|
||||
dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)
|
||||
|
||||
df = DataFrame(dta, columns=["A"])
|
||||
tm.assert_equal(df._values, dta)
|
||||
|
||||
if using_copy_on_write:
|
||||
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
|
||||
else:
|
||||
# we have a view
|
||||
assert np.shares_memory(df._values._ndarray, dta._ndarray)
|
||||
|
||||
# TimedeltaArray
|
||||
tda = dta - dta
|
||||
df2 = df - df
|
||||
tm.assert_equal(df2._values, tda)
|
||||
|
||||
@td.skip_array_manager_invalid_test
|
||||
def test_private_values_dt64tz_multicol(self, using_copy_on_write):
|
||||
dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)
|
||||
|
||||
df = DataFrame(dta, columns=["A", "B"])
|
||||
tm.assert_equal(df._values, dta)
|
||||
|
||||
if using_copy_on_write:
|
||||
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
|
||||
else:
|
||||
# we have a view
|
||||
assert np.shares_memory(df._values._ndarray, dta._ndarray)
|
||||
|
||||
# TimedeltaArray
|
||||
tda = dta - dta
|
||||
df2 = df - df
|
||||
tm.assert_equal(df2._values, tda)
|
||||
|
||||
def test_private_values_dt64_multiblock(self):
|
||||
dta = date_range("2000", periods=8)._data
|
||||
|
||||
df = DataFrame({"A": dta[:4]}, copy=False)
|
||||
df["B"] = dta[4:]
|
||||
|
||||
assert len(df._mgr.arrays) == 2
|
||||
|
||||
result = df._values
|
||||
expected = dta.reshape(2, 4).T
|
||||
tm.assert_equal(result, expected)
|
Reference in New Issue
Block a user