Updated script that can be controlled by a Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,69 @@
import pytest
from pandas import (
DatetimeIndex,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
# Module-level pytest mark: filters out FutureWarnings matching
# "Setting a value on a view" for the tests in this module.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize(
    "cons",
    [
        lambda data: DatetimeIndex(data),
        lambda data: DatetimeIndex(DatetimeIndex(data)),
    ],
)
def test_datetimeindex(using_copy_on_write, cons):
    """A DatetimeIndex built from a Series must not see later Series writes.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    dates = date_range("2019-12-31", periods=3, freq="D")
    source = Series(dates)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    # Overwrite the first element of the Series the index was built from.
    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot)
def test_datetimeindex_tz_convert(using_copy_on_write):
    """``tz_convert`` on an index built from a Series vs. later Series writes.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    dates = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin")
    source = Series(dates)
    idx = DatetimeIndex(source).tz_convert("US/Eastern")
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot)
def test_datetimeindex_tz_localize(using_copy_on_write):
    """``tz_localize`` on an index built from a Series vs. later Series writes.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    dates = date_range("2019-12-31", periods=3, freq="D")
    source = Series(dates)
    idx = DatetimeIndex(source).tz_localize("Europe/Berlin")
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot)
def test_datetimeindex_isocalendar(using_copy_on_write):
    """The index of the ``isocalendar()`` frame vs. later writes to the Series.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    dates = date_range("2019-12-31", periods=3, freq="D")
    source = Series(dates)
    df = DatetimeIndex(source).isocalendar()
    snapshot = df.index.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(df.index, snapshot)
def test_index_values(using_copy_on_write):
    """``.values`` of a date_range index is read-only iff copy-on-write is on."""
    idx = date_range("2019-12-31", periods=3, freq="D")
    values = idx.values
    # writeable must be False with copy-on-write and True without it.
    assert values.flags.writeable is (not using_copy_on_write)

View File

@ -0,0 +1,184 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def index_view(index_data=None):
    """Build an index together with a frame view taken before it was set.

    Parameters
    ----------
    index_data : list-like, optional
        Values for column ``"a"``, which becomes the index. Defaults to
        ``[1, 2]``.

    Returns
    -------
    tuple
        ``(idx, view)``: ``idx`` is the index of the frame returned by
        ``set_index("a", drop=True)``; ``view`` is ``df[:]`` taken from the
        frame before ``set_index`` was called.
    """
    if index_data is None:
        # Fresh list per call instead of a shared mutable default argument.
        index_data = [1, 2]
    df = DataFrame({"a": index_data, "b": 1.5})
    view = df[:]
    df = df.set_index("a", drop=True)
    idx = df.index
    return idx, view
def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
    """Write to a column kept by ``set_index(drop=False)`` and check the index.

    With copy-on-write the index must keep its values; otherwise the index is
    expected to show the written value.
    """
    frame = DataFrame({"a": [1, 2], "b": 1})
    frame = frame.set_index("a", drop=False)
    snapshot = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        frame.iloc[0, 0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 2], name="a")
    tm.assert_index_equal(frame.index, wanted)
def test_set_index_drop_update_column(using_copy_on_write):
    """Writing through an earlier ``df[:]`` view must leave the index untouched."""
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    earlier_view = frame[:]
    frame = frame.set_index("a", drop=True)
    snapshot = frame.index.copy(deep=True)

    earlier_view.iloc[0, 0] = 100

    tm.assert_index_equal(frame.index, snapshot)
def test_set_index_series(using_copy_on_write, warn_copy_on_write):
    """``set_index(Series)`` followed by a write to that Series.

    With copy-on-write the index must keep its values; otherwise the index is
    expected to show the written value.
    """
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    labels = Series([10, 11])
    frame = frame.set_index(labels)
    snapshot = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, wanted)
def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
    """Assign a Series to ``df.index`` and then write to that Series.

    With copy-on-write the index must keep its values; otherwise the index is
    expected to show the written value.
    """
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    labels = Series([10, 11])
    frame.index = labels
    snapshot = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, wanted)
def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
    """Assign ``Index(ser)`` to ``df.index`` and then write to ``ser``.

    The local name for the intermediate Index is dropped before the write so
    that only the frame still refers to it.
    """
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    labels = Series([10, 11])
    rhs_index = Index(labels)
    frame.index = rhs_index
    del rhs_index  # drop the local reference to the assigned Index
    snapshot = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, wanted)
def test_index_from_series(using_copy_on_write, warn_copy_on_write):
    """Build ``Index(ser)`` and then write to ``ser``.

    With copy-on-write the index must keep its values; otherwise the index is
    expected to show the written value.
    """
    source = Series([1, 2])
    idx = Index(source)
    snapshot = idx.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        source.iloc[0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 2])
    tm.assert_index_equal(idx, wanted)
def test_index_from_series_copy(using_copy_on_write):
    """After ``Index(ser, copy=True)``, writing to ``ser`` keeps its buffer."""
    source = Series([1, 2])
    # The index is kept alive on purpose although it is never read again.
    idx = Index(source, copy=True)  # noqa: F841
    buffer_before = get_array(source)

    source.iloc[0] = 100

    assert np.shares_memory(get_array(source), buffer_before)
def test_index_from_index(using_copy_on_write, warn_copy_on_write):
    """Build ``Index(Index(ser))`` and then write to ``ser``.

    With copy-on-write the index must keep its values; otherwise the index is
    expected to show the written value.
    """
    source = Series([1, 2])
    idx = Index(source)
    idx = Index(idx)
    snapshot = idx.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        source.iloc[0] = 100

    wanted = snapshot if using_copy_on_write else Index([100, 2])
    tm.assert_index_equal(idx, wanted)
@pytest.mark.parametrize(
    "func",
    [
        lambda ix: ix._shallow_copy(ix._values),
        lambda ix: ix.view(),
        lambda ix: ix.take([0, 1]),
        lambda ix: ix.repeat([1, 1]),
        lambda ix: ix[slice(0, 2)],
        lambda ix: ix[[0, 1]],
        lambda ix: ix._getitem_slice(slice(0, 2)),
        lambda ix: ix.delete([]),
        lambda ix: ix.rename("b"),
        lambda ix: ix.astype("Int64", copy=False),
    ],
    ids=[
        "_shallow_copy",
        "view",
        "take",
        "repeat",
        "getitem_slice",
        "getitem_list",
        "_getitem_slice",
        "delete",
        "rename",
        "astype",
    ],
)
def test_index_ops(using_copy_on_write, func, request):
    """Apply an Index operation, then write through the earlier frame view.

    The comparison (names ignored) is only made when ``using_copy_on_write``
    is true. For the ``astype`` case the expected index is cast to ``Int64``.
    """
    idx, frame_view = index_view()
    snapshot = idx.copy(deep=True)
    if "astype" in request.node.callspec.id:
        snapshot = snapshot.astype("Int64")

    idx = func(idx)
    frame_view.iloc[0, 0] = 100

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot, check_names=False)
def test_infer_objects(using_copy_on_write):
    """``infer_objects(copy=False)`` on an index, then write through the view.

    The comparison (names ignored) is only made when ``using_copy_on_write``
    is true.
    """
    idx, frame_view = index_view(["a", "b"])
    snapshot = idx.copy(deep=True)

    idx = idx.infer_objects(copy=False)
    frame_view.iloc[0, 0] = "aaaa"

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot, check_names=False)
def test_index_to_frame(using_copy_on_write):
    """``Index.to_frame`` memory sharing, and isolation of the index on write."""
    idx = Index([1, 2, 3], name="a")
    snapshot = idx.copy(deep=True)
    df = idx.to_frame()

    shares = np.shares_memory(get_array(df, "a"), idx._values)
    if using_copy_on_write:
        # Shared buffer, and the frame's block is tracked as referenced.
        assert shares
        assert not df._mgr._has_no_reference(0)
    else:
        assert not shares

    df.iloc[0, 0] = 100
    tm.assert_index_equal(idx, snapshot)
def test_index_values(using_copy_on_write):
    """``Index.values`` is read-only iff copy-on-write is on."""
    idx = Index([1, 2, 3])
    values = idx.values
    # writeable must be False with copy-on-write and True without it.
    assert values.flags.writeable is (not using_copy_on_write)

View File

@ -0,0 +1,30 @@
import pytest
from pandas import (
Period,
PeriodIndex,
Series,
period_range,
)
import pandas._testing as tm
# Module-level pytest mark: filters out FutureWarnings matching
# "Setting a value on a view" for the tests in this module.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize(
    "cons",
    [
        lambda data: PeriodIndex(data),
        lambda data: PeriodIndex(PeriodIndex(data)),
    ],
)
def test_periodindex(using_copy_on_write, cons):
    """A PeriodIndex built from a Series must not see later Series writes.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    periods = period_range("2019-12-31", periods=3, freq="D")
    source = Series(periods)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Period("2020-12-31")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot)

View File

@ -0,0 +1,30 @@
import pytest
from pandas import (
Series,
Timedelta,
TimedeltaIndex,
timedelta_range,
)
import pandas._testing as tm
# Module-level pytest mark: filters out FutureWarnings matching
# "Setting a value on a view" for the tests in this module.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize(
    "cons",
    [
        lambda data: TimedeltaIndex(data),
        lambda data: TimedeltaIndex(TimedeltaIndex(data)),
    ],
)
def test_timedeltaindex(using_copy_on_write, cons):
    """A TimedeltaIndex built from a Series must not see later Series writes.

    The comparison is only made when ``using_copy_on_write`` is true.
    """
    deltas = timedelta_range("1 day", periods=3)
    source = Series(deltas)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timedelta("5 days")

    if not using_copy_on_write:
        return
    tm.assert_index_equal(idx, snapshot)

View File

@ -0,0 +1,190 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for accessing underlying array of Series/DataFrame
@pytest.mark.parametrize(
    "method",
    [lambda obj: obj.values, lambda obj: np.asarray(obj)],
    ids=["values", "asarray"],
)
def test_series_values(using_copy_on_write, method):
    """Array obtained from a Series: read-only view under copy-on-write."""
    ser = Series([1, 2, 3], name="name")
    pristine = ser.copy()
    arr = method(ser)

    if not using_copy_on_write:
        # Writeable array; a write through it shows up in the Series.
        assert arr.flags.writeable is True
        arr[0] = 0
        assert ser.iloc[0] == 0
        return

    # The array shares memory with the Series but is flagged read-only.
    assert np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is False

    # Writing through the array is rejected and the Series stays unchanged.
    with pytest.raises(ValueError, match="read-only"):
        arr[0] = 0
    tm.assert_series_equal(ser, pristine)

    # Writing to the Series itself is still possible.
    ser.iloc[0] = 0
    assert ser.values[0] == 0
@pytest.mark.parametrize(
    "method",
    [lambda obj: obj.values, lambda obj: np.asarray(obj)],
    ids=["values", "asarray"],
)
def test_dataframe_values(using_copy_on_write, using_array_manager, method):
    """Array obtained from a DataFrame: read-only view under copy-on-write."""
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    pristine = df.copy()
    arr = method(df)

    if not using_copy_on_write:
        assert arr.flags.writeable is True
        arr[0, 0] = 0
        if using_array_manager:
            # With the array manager the write must not reach the frame.
            tm.assert_frame_equal(df, pristine)
        else:
            assert df.iloc[0, 0] == 0
        return

    # The array shares memory with the frame but is flagged read-only.
    assert np.shares_memory(arr, get_array(df, "a"))
    assert arr.flags.writeable is False

    # Writing through the array is rejected and the frame stays unchanged.
    with pytest.raises(ValueError, match="read-only"):
        arr[0, 0] = 0
    tm.assert_frame_equal(df, pristine)

    # Writing to the frame itself is still possible.
    df.iloc[0, 0] = 0
    assert df.values[0, 0] == 0
def test_series_to_numpy(using_copy_on_write):
    """``Series.to_numpy``: default call, ``copy=True`` and a dtype change."""
    ser = Series([1, 2, 3], name="name")
    pristine = ser.copy()

    # Default call: no copy requested, no dtype given, no missing values.
    arr = ser.to_numpy()
    if not using_copy_on_write:
        # Writeable array; a write through it shows up in the Series.
        assert arr.flags.writeable is True
        arr[0] = 0
        assert ser.iloc[0] == 0
    else:
        # The array shares memory with the Series but is flagged read-only.
        assert np.shares_memory(arr, get_array(ser, "name"))
        assert arr.flags.writeable is False

        # Writing through the array is rejected; the Series stays unchanged.
        with pytest.raises(ValueError, match="read-only"):
            arr[0] = 0
        tm.assert_series_equal(ser, pristine)

        # Writing to the Series itself is still possible.
        ser.iloc[0] = 0
        assert ser.values[0] == 0

    # Passing copy=True gives an independent, writeable array.
    ser = Series([1, 2, 3], name="name")
    arr = ser.to_numpy(copy=True)
    assert not np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is True

    # A dtype that forces a conversion also gives an independent, writeable
    # array.
    ser = Series([1, 2, 3], name="name")
    arr = ser.to_numpy(dtype="float64")
    assert not np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is True
@pytest.mark.parametrize("order", ["F", "C"])
def test_ravel_read_only(using_copy_on_write, order):
    """``Series.ravel`` warns, shares memory, and is read-only under CoW."""
    ser = Series([1, 2, 3])
    with tm.assert_produces_warning(FutureWarning, match="is deprecated"):
        flat = ser.ravel(order=order)

    if using_copy_on_write:
        assert flat.flags.writeable is False
    assert np.shares_memory(get_array(ser), flat)
def test_series_array_ea_dtypes(using_copy_on_write):
    """``np.asarray`` of an ``Int64`` Series, with and without a dtype.

    Both results must share memory with the Series and be read-only exactly
    when copy-on-write is on.
    """
    ser = Series([1, 2, 3], dtype="Int64")

    typed = np.asarray(ser, dtype="int64")
    assert np.shares_memory(typed, get_array(ser))
    assert typed.flags.writeable is (not using_copy_on_write)

    untyped = np.asarray(ser)
    assert np.shares_memory(untyped, get_array(ser))
    assert untyped.flags.writeable is (not using_copy_on_write)
def test_dataframe_array_ea_dtypes(using_copy_on_write):
    """``np.asarray`` of an ``Int64`` DataFrame, with and without a dtype.

    Both results must share memory with column ``"a"`` and be read-only
    exactly when copy-on-write is on.
    """
    df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")

    typed = np.asarray(df, dtype="int64")
    assert np.shares_memory(typed, get_array(df, "a"))
    assert typed.flags.writeable is (not using_copy_on_write)

    untyped = np.asarray(df)
    assert np.shares_memory(untyped, get_array(df, "a"))
    assert untyped.flags.writeable is (not using_copy_on_write)
def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager):
    """``np.asarray`` of a ``string``-dtype DataFrame.

    Memory sharing is only checked without the array manager; the array must
    be read-only exactly when copy-on-write is on.
    """
    df = DataFrame({"a": ["a", "b"]}, dtype="string")
    values = np.asarray(df)

    if not using_array_manager:
        assert np.shares_memory(values, get_array(df, "a"))
    assert values.flags.writeable is (not using_copy_on_write)
def test_dataframe_multiple_numpy_dtypes():
    """``np.asarray`` of a mixed int/float frame is a separate, writeable array."""
    mixed = DataFrame({"a": [1, 2, 3], "b": 1.5})
    values = np.asarray(mixed)

    assert not np.shares_memory(values, get_array(mixed, "a"))
    assert values.flags.writeable is True
def test_values_is_ea(using_copy_on_write):
    """``np.asarray`` of a datetime frame is read-only iff copy-on-write is on."""
    df = DataFrame({"a": date_range("2012-01-01", periods=3)})
    values = np.asarray(df)
    assert values.flags.writeable is (not using_copy_on_write)
def test_empty_dataframe():
    """``np.asarray`` of an empty DataFrame yields a writeable array."""
    empty = DataFrame()
    values = np.asarray(empty)
    assert values.flags.writeable is True

View File

@ -0,0 +1,260 @@
import pickle
import numpy as np
import pytest
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_astype_single_dtype(using_copy_on_write):
    """``astype("float64")`` on a frame that already has a float column."""
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
    pristine = df.copy()
    df2 = df.astype("float64")

    # Column "c" already has the target dtype: shared only under CoW.
    c_shared = np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    if using_copy_on_write:
        assert c_shared
    else:
        assert not c_shared
    # Column "a" changes dtype and is never shared.
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

    # Writing to the result must leave the parent untouched; under CoW the
    # written column stops sharing memory.
    df2.iloc[0, 2] = 5.5
    if using_copy_on_write:
        assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    tm.assert_frame_equal(df, pristine)

    # Writing to the parent must leave a fresh result untouched.
    df2 = df.astype("float64")
    df.iloc[0, 2] = 5.5
    tm.assert_frame_equal(df2, pristine.astype("float64"))
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
    """``astype`` between 64-bit integer dtypes shares memory only under CoW."""
    if new_dtype == "int64[pyarrow]":
        pytest.importorskip("pyarrow")
    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    pristine = df.copy()
    df2 = df.astype(new_dtype)

    shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    # Writing to the result must leave the parent untouched; under CoW the
    # written column stops sharing memory.
    df2.iloc[0, 0] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    tm.assert_frame_equal(df, pristine)

    # Writing to the parent must leave a fresh result untouched.
    df2 = df.astype(new_dtype)
    df.iloc[0, 0] = 100
    tm.assert_frame_equal(df2, pristine.astype(new_dtype))
@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
    """``astype`` to a different dtype yields an unshared, unreferenced result."""
    if dtype == "int32[pyarrow]":
        pytest.importorskip("pyarrow")
    df = DataFrame({"a": [1, 2, 3]})
    pristine = df.copy()
    df2 = df.astype(dtype)

    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert df2._mgr._has_no_reference(0)

    # Writing to the result must leave the parent untouched.
    df2.iloc[0, 0] = 5
    tm.assert_frame_equal(df, pristine)

    # Writing to the parent must leave a fresh result untouched.
    df2 = df.astype(dtype)
    df.iloc[0, 0] = 100
    tm.assert_frame_equal(df2, pristine.astype(dtype))
@td.skip_array_manager_invalid_test
def test_astype_numpy_to_ea():
    """With the CoW option enabled, ``astype("Int64")`` shares the buffer."""
    source = Series([1, 2, 3])
    with pd.option_context("mode.copy_on_write", True):
        converted = source.astype("Int64")
    assert np.shares_memory(get_array(source), get_array(converted))
@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
    """object <-> string ``astype``: write to the result, parent unchanged."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    pristine = df.copy()
    df2 = df.astype(new_dtype)

    shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    df2.iloc[0, 0] = "x"
    tm.assert_frame_equal(df, pristine)
@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object_update_original(
    using_copy_on_write, dtype, new_dtype
):
    """object <-> string ``astype``: write to the parent, result unchanged."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    df2 = df.astype(new_dtype)
    result_before = df2.copy()

    shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    df.iloc[0, 0] = "x"
    tm.assert_frame_equal(df2, result_before)
def test_astype_string_copy_on_pickle_roundrip():
    """``astype(str)`` on an unpickled Series must not alter that Series.

    See https://github.com/pandas-dev/pandas/issues/54654 (per the original
    note, ``ensure_string_array`` may alter an array in place).
    """
    original = Series(np.array([(1, 2), None, 1], dtype="object"))
    restored = pickle.loads(pickle.dumps(original))

    # Only the side effect matters here; the converted result is discarded.
    restored.astype(str)

    tm.assert_series_equal(original, restored)
def test_astype_dict_dtypes(using_copy_on_write):
    """``astype`` with a per-column dict: only converted columns are copied."""
    df = DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
            "c": Series([1.5, 1.5, 1.5], dtype="float64"),
        }
    )
    pristine = df.copy()
    df2 = df.astype({"a": "float64", "c": "float64"})

    c_shared = np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    b_shared = np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
    a_shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        # "c" (same dtype) and "b" (not listed) are shared; "a" is converted.
        assert c_shared
        assert b_shared
        assert not a_shared
    else:
        assert not c_shared
        assert not b_shared
        assert not a_shared

    # Under CoW each written column stops sharing memory with the parent.
    df2.iloc[0, 2] = 5.5
    if using_copy_on_write:
        assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

    df2.iloc[0, 1] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

    tm.assert_frame_equal(df, pristine)
def test_astype_different_datetime_resos(using_copy_on_write):
    """Casting to ``datetime64[ms]`` yields an unshared, unreferenced result."""
    df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
    converted = df.astype("datetime64[ms]")

    assert not np.shares_memory(get_array(df, "a"), get_array(converted, "a"))
    if using_copy_on_write:
        assert converted._mgr._has_no_reference(0)
def test_astype_different_timezones(using_copy_on_write):
    """Casting to another timezone at the same resolution.

    Under copy-on-write the result must share memory with the parent and be
    tracked as referenced; nothing is asserted otherwise.
    """
    dates = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    df = DataFrame({"a": dates})
    converted = df.astype("datetime64[ns, Europe/Berlin]")

    if not using_copy_on_write:
        return
    assert not converted._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(df, "a"), get_array(converted, "a"))
def test_astype_different_timezones_different_reso(using_copy_on_write):
    """Casting to another timezone and another resolution.

    Under copy-on-write the result must be unreferenced and must not share
    memory with the parent; nothing is asserted otherwise.
    """
    dates = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    df = DataFrame({"a": dates})
    converted = df.astype("datetime64[ms, Europe/Berlin]")

    if not using_copy_on_write:
        return
    assert converted._mgr._has_no_reference(0)
    assert not np.shares_memory(get_array(df, "a"), get_array(converted, "a"))
def test_astype_arrow_timestamp(using_copy_on_write):
    """Casting ``M8[ns]`` to ``timestamp[ns][pyarrow]`` (needs pyarrow).

    Under copy-on-write the result must be tracked as referenced; whether the
    buffers are shared is expected to depend on ``pa_version_under12p0``.
    """
    pytest.importorskip("pyarrow")
    stamp = "2020-01-01 01:01:01.000001"
    df = DataFrame(
        {"a": [Timestamp(stamp), Timestamp(stamp)]},
        dtype="M8[ns]",
    )
    converted = df.astype("timestamp[ns][pyarrow]")

    if not using_copy_on_write:
        return
    assert not converted._mgr._has_no_reference(0)
    if pa_version_under12p0:
        assert not np.shares_memory(
            get_array(df, "a"), get_array(converted, "a")._pa_array
        )
    else:
        assert np.shares_memory(
            get_array(df, "a"), get_array(converted, "a")._pa_array
        )
def test_convert_dtypes_infer_objects(using_copy_on_write):
    """``convert_dtypes`` with every conversion switched off."""
    ser = Series(["a", "b", "c"])
    pristine = ser.copy()
    converted = ser.convert_dtypes(
        convert_integer=False,
        convert_boolean=False,
        convert_floating=False,
        convert_string=False,
    )

    shared = np.shares_memory(get_array(ser), get_array(converted))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    # Writing to the result must leave the original untouched.
    converted.iloc[0] = "x"
    tm.assert_series_equal(ser, pristine)
def test_convert_dtypes(using_copy_on_write):
    """``DataFrame.convert_dtypes``: every column is shared only under CoW."""
    df = DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
            "c": [1.5, 2.5],
            "d": [True, False],
        }
    )
    pristine = df.copy()
    df2 = df.convert_dtypes()

    if using_copy_on_write:
        for col in ("a", "d", "b", "c"):
            assert np.shares_memory(get_array(df2, col), get_array(df, col))
    else:
        for col in ("a", "b", "c", "d"):
            assert not np.shares_memory(get_array(df2, col), get_array(df, col))

    # Writing to the result must leave the parent untouched.
    df2.iloc[0, 0] = "x"
    tm.assert_frame_equal(df, pristine)

View File

@ -0,0 +1,174 @@
import numpy as np
import pytest
from pandas.compat import PY311
from pandas.errors import (
ChainedAssignmentError,
SettingWithCopyWarning,
)
from pandas import (
DataFrame,
option_context,
)
import pandas._testing as tm
def test_methods_iloc_warn(using_copy_on_write):
    """In-place methods called on ``df.iloc[:, 0]`` outside copy-on-write mode.

    Each call is wrapped in ``tm.assert_cow_warning(match="A value")``.
    Nothing runs when ``using_copy_on_write`` is true.
    """
    if not using_copy_on_write:
        df = DataFrame({"a": [1, 2, 3], "b": 1})
        with tm.assert_cow_warning(match="A value"):
            df.iloc[:, 0].replace(1, 5, inplace=True)

        with tm.assert_cow_warning(match="A value"):
            df.iloc[:, 0].fillna(1, inplace=True)

        with tm.assert_cow_warning(match="A value"):
            df.iloc[:, 0].interpolate(inplace=True)

        with tm.assert_cow_warning(match="A value"):
            df.iloc[:, 0].ffill(inplace=True)

        with tm.assert_cow_warning(match="A value"):
            df.iloc[:, 0].bfill(inplace=True)
@pytest.mark.parametrize(
    "func, args",
    [
        ("replace", (4, 5)),
        ("fillna", (1,)),
        ("interpolate", ()),
        ("bfill", ()),
        ("ffill", ()),
    ],
)
def test_methods_iloc_getitem_item_cache(
    func, args, using_copy_on_write, warn_copy_on_write
):
    """Call in-place method ``func`` on columns obtained in several ways.

    The first four scenarios bind the column to a name first and are not
    wrapped in any warning check. The last two call the method directly on
    ``df["a"]`` inside a chained-assignment / CoW warning context.
    """
    # ensure we don't incorrectly raise chained assignment warning because
    # of the item cache / iloc not setting the item cache
    df_orig = DataFrame({"a": [1, 2, 3], "b": 1})

    df = df_orig.copy()
    ser = df.iloc[:, 0]
    getattr(ser, func)(*args, inplace=True)

    # parent that holds item_cache is dead, so don't increase ref count
    df = df_orig.copy()
    ser = df.copy()["a"]
    getattr(ser, func)(*args, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    ser = df.iloc[:, 0]  # iloc creates a new object
    getattr(ser, func)(*args, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    ser = df["a"]
    getattr(ser, func)(*args, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+
    if using_copy_on_write:
        with tm.raises_chained_assignment_error(not PY311):
            getattr(df["a"], func)(*args, inplace=True)
    else:
        with tm.assert_cow_warning(not PY311, match="A value"):
            getattr(df["a"], func)(*args, inplace=True)

    df = df_orig.copy()
    ser = df["a"]  # populate the item_cache and keep ref
    if using_copy_on_write:
        with tm.raises_chained_assignment_error(not PY311):
            getattr(df["a"], func)(*args, inplace=True)
    else:
        # ideally also warns on the default mode, but the ser's _cacher
        # messes up the refcount + even in warning mode this doesn't trigger
        # the warning on Py3.11+ (see above)
        with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"):
            getattr(df["a"], func)(*args, inplace=True)
def test_methods_iloc_getitem_item_cache_fillna(
    using_copy_on_write, warn_copy_on_write
):
    """Call ``fillna(1, inplace=True)`` on columns obtained in several ways.

    Same scenarios as the parametrized item-cache test, but the method is
    called directly rather than through ``getattr`` with ``*args``.
    """
    # ensure we don't incorrectly raise chained assignment warning because
    # of the item cache / iloc not setting the item cache
    df_orig = DataFrame({"a": [1, 2, 3], "b": 1})

    df = df_orig.copy()
    ser = df.iloc[:, 0]
    ser.fillna(1, inplace=True)

    # parent that holds item_cache is dead, so don't increase ref count
    df = df_orig.copy()
    ser = df.copy()["a"]
    ser.fillna(1, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    ser = df.iloc[:, 0]  # iloc creates a new object
    ser.fillna(1, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    ser = df["a"]
    ser.fillna(1, inplace=True)

    df = df_orig.copy()
    df["a"]  # populate the item_cache
    if using_copy_on_write:
        with tm.raises_chained_assignment_error():
            df["a"].fillna(1, inplace=True)
    else:
        with tm.assert_cow_warning(match="A value"):
            df["a"].fillna(1, inplace=True)

    df = df_orig.copy()
    ser = df["a"]  # populate the item_cache and keep ref
    if using_copy_on_write:
        with tm.raises_chained_assignment_error():
            df["a"].fillna(1, inplace=True)
    else:
        # TODO(CoW-warn) ideally also warns on the default mode, but the ser's
        # _cacher messes up the refcount
        with tm.assert_cow_warning(warn_copy_on_write, match="A value"):
            df["a"].fillna(1, inplace=True)
# TODO(CoW-warn) expand the cases
@pytest.mark.parametrize(
    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write):
    """Chained ``df["a"][indexer] = 0`` must produce exactly one warning.

    The recorded category must be ``ChainedAssignmentError`` under
    copy-on-write; otherwise a ``FutureWarning`` whose message mentions
    "ChainedAssignmentError".
    """
    # ensure we only get a single warning for those typical cases of chained
    # assignment
    df = DataFrame({"a": [1, 2, 3], "b": 1})

    # using custom check instead of tm.assert_produces_warning because that doesn't
    # fail if multiple warnings are raised
    with pytest.warns() as record:
        df["a"][indexer] = 0
    assert len(record) == 1
    if using_copy_on_write:
        assert record[0].category == ChainedAssignmentError
    else:
        assert record[0].category == FutureWarning
        assert "ChainedAssignmentError" in record[0].message.args[0]
@pytest.mark.filterwarnings("ignore::pandas.errors.SettingWithCopyWarning")
@pytest.mark.parametrize(
    "indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])]
)
def test_frame_setitem(indexer, using_copy_on_write):
    """Chained ``df[0:3][indexer] = 10`` with ``chained_assignment`` = "warn"."""
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})

    # Outside copy-on-write mode SettingWithCopyWarning is passed along as an
    # extra warning category.
    extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,)

    with option_context("chained_assignment", "warn"):
        with tm.raises_chained_assignment_error(extra_warnings=extra_warnings):
            df[0:3][indexer] = 10

View File

@ -0,0 +1,101 @@
import numpy as np
from pandas import (
DataFrame,
option_context,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write):
    """In-place ``clip`` that changes values while a view of the frame exists."""
    df = DataFrame({"a": [1.5, 2, 3]})
    pristine = df.copy()
    buffer_before = get_array(df, "a")
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.clip(lower=2, inplace=True)

    if not using_copy_on_write:
        # Without copy-on-write the frame keeps its original buffer.
        assert np.shares_memory(get_array(df, "a"), buffer_before)
        return

    # Under CoW the frame moves to a new buffer, neither object is tracked as
    # referenced any more, and the view keeps the original values.
    assert not np.shares_memory(get_array(df, "a"), buffer_before)
    assert df._mgr._has_no_reference(0)
    assert view._mgr._has_no_reference(0)
    tm.assert_frame_equal(pristine, view)
def test_clip_inplace_reference_no_op(using_copy_on_write):
    """In-place ``clip`` that changes nothing while a view of the frame exists."""
    df = DataFrame({"a": [1.5, 2, 3]})
    pristine = df.copy()
    buffer_before = get_array(df, "a")
    view = df[:]

    df.clip(lower=0, inplace=True)

    # The frame keeps its original buffer.
    assert np.shares_memory(get_array(df, "a"), buffer_before)

    if not using_copy_on_write:
        return
    # Both objects are still tracked as referenced; the view is unchanged.
    assert not df._mgr._has_no_reference(0)
    assert not view._mgr._has_no_reference(0)
    tm.assert_frame_equal(pristine, view)
def test_clip_inplace(using_copy_on_write):
    """In-place ``clip`` with no other reference keeps the same buffer."""
    df = DataFrame({"a": [1.5, 2, 3]})
    buffer_before = get_array(df, "a")

    df.clip(lower=2, inplace=True)

    assert np.shares_memory(get_array(df, "a"), buffer_before)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
def test_clip(using_copy_on_write):
    """``clip`` that changes values returns an unshared result."""
    df = DataFrame({"a": [1.5, 2, 3]})
    pristine = df.copy()

    clipped = df.clip(lower=2)

    assert not np.shares_memory(get_array(clipped, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    tm.assert_frame_equal(pristine, df)
def test_clip_no_op(using_copy_on_write):
    """``clip`` that changes nothing shares memory only under copy-on-write."""
    df = DataFrame({"a": [1.5, 2, 3]})
    clipped = df.clip(lower=0)

    if not using_copy_on_write:
        assert not np.shares_memory(get_array(clipped, "a"), get_array(df, "a"))
        return

    assert not df._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(clipped, "a"), get_array(df, "a"))
def test_clip_chained_inplace(using_copy_on_write):
    """In-place ``clip`` called directly on the result of an indexing step."""
    df = DataFrame({"a": [1, 4, 2], "b": 1})
    df_orig = df.copy()
    if using_copy_on_write:
        # Both chained calls sit inside raises_chained_assignment_error and
        # must leave the parent frame unchanged.
        with tm.raises_chained_assignment_error():
            df["a"].clip(1, 2, inplace=True)
        tm.assert_frame_equal(df, df_orig)

        with tm.raises_chained_assignment_error():
            df[["a"]].clip(1, 2, inplace=True)
        tm.assert_frame_equal(df, df_orig)
    else:
        # Single-column access: a FutureWarning matching "inplace method".
        with tm.assert_produces_warning(FutureWarning, match="inplace method"):
            df["a"].clip(1, 2, inplace=True)

        # With mode.chained_assignment set to None, no warning is expected
        # for the list and boolean-mask selections.
        with tm.assert_produces_warning(None):
            with option_context("mode.chained_assignment", None):
                df[["a"]].clip(1, 2, inplace=True)

        with tm.assert_produces_warning(None):
            with option_context("mode.chained_assignment", None):
                df[df["a"] > 1].clip(1, 2, inplace=True)

View File

@ -0,0 +1,382 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
Period,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for Series / DataFrame constructors
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write):
    """``Series(ser)`` shares memory; whether writes propagate depends on CoW."""
    ser = Series([1, 2, 3], name="name")

    # copy defaults to False, so the new Series starts out on the same buffer.
    result = Series(ser, dtype=dtype)
    assert np.shares_memory(get_array(ser), get_array(result))

    if using_copy_on_write:
        assert result._mgr.blocks[0].refs.has_reference()

        # A write to the new Series leaves the original alone and ends the
        # memory sharing.
        result.iloc[0] = 0
        assert ser.iloc[0] == 1
        assert not np.shares_memory(get_array(ser), get_array(result))
    else:
        # A write to the new Series shows up in the original, and the two
        # keep sharing memory.
        with tm.assert_cow_warning(warn_copy_on_write):
            result.iloc[0] = 0
        assert ser.iloc[0] == 0
        assert np.shares_memory(get_array(ser), get_array(result))

    # Same check in the other direction: write to the parent.
    result = Series(ser, dtype=dtype)

    if using_copy_on_write:
        ser.iloc[0] = 0
        assert result.iloc[0] == 1
    else:
        with tm.assert_cow_warning(warn_copy_on_write):
            ser.iloc[0] = 0
        assert result.iloc[0] == 0
def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write):
    """``Series(ser, index=...)`` with and without an actual reindex."""
    ser = Series([1, 2, 3], name="name")

    # Indexes that match the existing one: the result shares memory, and a
    # write to it reaches the original only without copy-on-write.
    first_value_after_write = 1 if using_copy_on_write else 0
    for index in [
        ser.index,
        ser.index.copy(),
        list(ser.index),
        ser.index.rename("idx"),
    ]:
        result = Series(ser, index=index)
        assert np.shares_memory(ser.values, result.values)
        with tm.assert_cow_warning(warn_copy_on_write):
            result.iloc[0] = 0
        assert ser.iloc[0] == first_value_after_write

    # An index that forces a reindex: the result has its own buffer and, under
    # copy-on-write, carries no reference.
    result = Series(ser, index=[0, 1, 2, 3])
    assert not np.shares_memory(ser.values, result.values)
    if using_copy_on_write:
        assert not result._mgr.blocks[0].refs.has_reference()
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
    "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
    """``Series(array)``: the input is shared only without copy-on-write."""
    # fastpath stays on only when an index is given and no dtype is requested.
    use_fastpath = fastpath and idx is not None and dtype is None
    msg = "The 'fastpath' keyword in pd.Series is deprecated"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        ser = Series(arr, dtype=dtype, index=idx, fastpath=use_fastpath)
    pristine = ser.copy()

    # Use the array's ``_data`` attribute when it has one, else the array.
    data = getattr(arr, "_data", arr)
    shared = np.shares_memory(get_array(ser), data)
    if using_copy_on_write:
        assert not shared
    else:
        assert shared

    arr[0] = 100
    if using_copy_on_write:
        tm.assert_series_equal(ser, pristine)
    else:
        expected_dtype = dtype if dtype is not None else arr.dtype
        expected = Series([100, 2, 3], dtype=expected_dtype)
        tm.assert_series_equal(ser, expected)
@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
    """A dtype conversion in the constructor must not share memory with the input."""
    values = np.array([1, 2, 3], dtype="int64")
    converted = Series(values, dtype="int32", copy=copy)
    assert not np.shares_memory(get_array(converted), values)
@pytest.mark.parametrize(
    "idx",
    [
        Index([1, 2]),
        DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
        PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
        TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
    ],
)
def test_series_from_index(using_copy_on_write, idx):
    """Writing into a Series built from an Index must leave the Index unchanged."""
    ser = Series(idx)
    idx_snapshot = idx.copy(deep=True)

    shared = np.shares_memory(get_array(ser), get_array(idx))
    if using_copy_on_write:
        # Shared buffer, tracked through a reference on the Series block.
        assert shared
        assert not ser._mgr._has_no_reference(0)
    else:
        assert not shared

    ser.iloc[0] = ser.iloc[1]
    tm.assert_index_equal(idx, idx_snapshot)
def test_series_from_index_different_dtypes(using_copy_on_write):
    """A dtype-converting Series built from an Index owns its data without refs."""
    source = Index([1, 2, 3], dtype="int64")
    converted = Series(source, dtype="int32")
    assert not np.shares_memory(get_array(converted), get_array(source))
    if using_copy_on_write:
        assert converted._mgr._has_no_reference(0)
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath):
    """Check a Series built from another Series' manager shares and tracks memory."""
    parent = Series([1, 2, 3], dtype="int64")
    parent_orig = parent.copy()
    deprecation = "The 'fastpath' keyword in pd.Series is deprecated"
    with tm.assert_produces_warning(DeprecationWarning, match=deprecation):
        child = Series(parent._mgr, dtype=dtype, fastpath=fastpath, index=idx)

    assert np.shares_memory(get_array(parent), get_array(child))
    if using_copy_on_write:
        assert not child._mgr._has_no_reference(0)

    # The write reaches the parent only when CoW is disabled.
    child.iloc[0] = 100
    expected = parent_orig if using_copy_on_write else Series([100, 2, 3])
    tm.assert_series_equal(parent, expected)
def test_series_from_block_manager_different_dtype(using_copy_on_write):
    """A dtype-converting Series built from a manager owns its data without refs."""
    parent = Series([1, 2, 3], dtype="int64")
    deprecation = "Passing a SingleBlockManager to Series"
    with tm.assert_produces_warning(DeprecationWarning, match=deprecation):
        converted = Series(parent._mgr, dtype="int32")
    assert not np.shares_memory(get_array(parent), get_array(converted))
    if using_copy_on_write:
        assert converted._mgr._has_no_reference(0)
@pytest.mark.parametrize("use_mgr", [True, False])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(
    using_copy_on_write, warn_copy_on_write, columns, use_mgr
):
    """Check ``DataFrame(df)`` / ``DataFrame(df._mgr)`` share memory lazily."""
    # NOTE(review): ``columns`` is parametrized but never passed to the
    # constructor below, so both parametrizations run the same code path.
    df = DataFrame({"a": [1, 2, 3]})
    df_orig = df.copy()

    # Passing the manager directly is expected to emit a deprecation warning.
    data, warn = (df._mgr, DeprecationWarning) if use_mgr else (df, None)
    deprecation = "Passing a BlockManager to DataFrame"
    with tm.assert_produces_warning(warn, match=deprecation, check_stacklevel=False):
        new_df = DataFrame(data)

    assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
    with tm.assert_cow_warning(warn_copy_on_write and not use_mgr):
        new_df.iloc[0] = 100

    still_shared = np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
    if using_copy_on_write:
        assert not still_shared
        tm.assert_frame_equal(df, df_orig)
    else:
        assert still_shared
        tm.assert_frame_equal(df, new_df)
@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
def test_dataframe_from_dict_of_series(
    request, using_copy_on_write, warn_copy_on_write, columns, index, dtype
):
    """Check ``DataFrame(dict_of_series, copy=False)`` shares memory lazily."""
    # Case: with copy=False the constructor has to do a lazy copy following
    # the CoW rules (the default for DataFrame(dict) is still to copy to
    # ensure consolidation).
    ctor_kwargs = {"index": index, "columns": columns, "dtype": dtype}
    first = Series([1, 2, 3])
    second = Series([4, 5, 6])
    first_orig = first.copy()

    expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, **ctor_kwargs)
    result = DataFrame({"a": first, "b": second}, copy=False, **ctor_kwargs)

    # The shallow copy still shares memory with the input Series.
    assert np.shares_memory(get_array(result, "a"), get_array(first))

    # Mutating the new DataFrame must not mutate the original under CoW.
    with tm.assert_cow_warning(warn_copy_on_write):
        result.iloc[0, 0] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(first))
        tm.assert_series_equal(first, first_orig)
    else:
        assert first.iloc[0] == 10

    # Same check in the other direction: mutate the parent Series.
    first = Series([1, 2, 3])
    second = Series([4, 5, 6])
    result = DataFrame({"a": first, "b": second}, copy=False, **ctor_kwargs)
    with tm.assert_cow_warning(warn_copy_on_write):
        first.iloc[0] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(first))
        tm.assert_frame_equal(result, expected)
    else:
        assert result.iloc[0, 0] == 10
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_dataframe_from_dict_of_series_with_reindex(dtype):
    """A reindexing dict-of-Series constructor must not leave refs behind."""
    # Case: copy=False combined with an index that requires an actual
    # (non-view) reindex. The result must not have refs set up, otherwise a
    # later write would unnecessarily trigger a copy.
    first = Series([1, 2, 3])
    second = Series([4, 5, 6])
    frame = DataFrame(
        {"a": first, "b": second}, index=[1, 2, 3], dtype=dtype, copy=False
    )

    # The frame should own its memory, so a write happens in place.
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(first))
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize("cons", [Series, Index])
@pytest.mark.parametrize(
    "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
)
def test_dataframe_from_series_or_index(
    using_copy_on_write, warn_copy_on_write, data, dtype, cons
):
    """A DataFrame built from a Series/Index shares memory and protects the input."""
    source = cons(data, dtype=dtype)
    source_orig = source.copy()
    frame = DataFrame(source, dtype=dtype)

    assert np.shares_memory(get_array(source), get_array(frame, 0))
    if using_copy_on_write:
        assert not frame._mgr._has_no_reference(0)

    # Overwrite the first cell with the last input value.
    with tm.assert_cow_warning(warn_copy_on_write):
        frame.iloc[0, 0] = data[-1]
    if using_copy_on_write:
        tm.assert_equal(source, source_orig)
@pytest.mark.parametrize("cons", [Series, Index])
def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, cons):
    """A dtype-converting DataFrame constructor owns its data without refs."""
    source = cons([1, 2], dtype="int64")
    frame = DataFrame(source, dtype="int32")
    assert not np.shares_memory(get_array(source), get_array(frame, 0))
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)
def test_dataframe_from_series_infer_datetime(using_copy_on_write):
    """Dtype inference on an object Series must yield a frame that owns its data."""
    stamps = [Timestamp("2019-12-31"), Timestamp("2020-12-31")]
    source = Series(stamps, dtype=object)
    with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
        frame = DataFrame(source)
    assert not np.shares_memory(get_array(source), get_array(frame, 0))
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
def test_dataframe_from_dict_of_series_with_dtype(index):
    """A dtype-converting dict-of-Series constructor must not leave refs behind."""
    # Variant of the reindex case: here the requested dtype causes the copy.
    # The result must not have refs set up, otherwise a later write would
    # unnecessarily trigger a copy.
    floats = Series([1.0, 2.0, 3.0])
    ints = Series([4, 5, 6])
    frame = DataFrame({"a": floats, "b": ints}, index=index, dtype="int64", copy=False)

    # The frame should own its memory, so a write happens in place.
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(floats))
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize("copy", [False, None, True])
def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
    """Check when ``DataFrame(ndarray, copy=...)`` copies the input array."""
    values = np.array([[1, 2], [3, 4]])
    frame = DataFrame(values, copy=copy)

    # A copy is expected in any of these three situations.
    cow_copies = using_copy_on_write and copy is not False
    explicit_copy = copy is True
    array_manager_default = using_array_manager and copy is None

    shared = np.shares_memory(get_array(frame, 0), values)
    if cow_copies or explicit_copy or array_manager_default:
        assert not shared
    else:
        assert shared
def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write):
    """Check ``DataFrame.from_records(df)`` shares memory and respects CoW."""
    parent = DataFrame({"a": [1, 2, 3]})
    parent_orig = parent.copy()
    with tm.assert_produces_warning(FutureWarning):
        child = DataFrame.from_records(parent)

    if using_copy_on_write:
        assert not parent._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))

    with tm.assert_cow_warning(warn_copy_on_write):
        child.iloc[0, 0] = 100
    # Under CoW the parent is untouched; otherwise it mirrors the child.
    expected = parent_orig if using_copy_on_write else child
    tm.assert_frame_equal(parent, expected)
def test_frame_from_dict_of_index(using_copy_on_write):
    """A DataFrame built from a dict of Index shares memory but protects the Index."""
    source = Index([1, 2, 3])
    source_snapshot = source.copy(deep=True)
    frame = DataFrame({"a": source}, copy=False)
    assert np.shares_memory(get_array(frame, "a"), source._values)
    if using_copy_on_write:
        assert not frame._mgr._has_no_reference(0)

        # The write is only exercised under CoW, where it must not reach
        # the Index.
        frame.iloc[0, 0] = 100
        tm.assert_index_equal(source, source_snapshot)

View File

@ -0,0 +1,106 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_assigning_to_same_variable_removes_references(using_copy_on_write):
    """Rebinding the only name of a frame must drop the reference to it."""
    frame = DataFrame({"a": [1, 2, 3]})
    frame = frame.reset_index()
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(1)

    before = get_array(frame, "a")
    frame.iloc[0, 1] = 100  # column position 1 is "a" after reset_index
    assert np.shares_memory(before, get_array(frame, "a"))
def test_setitem_dont_track_unnecessary_references(using_copy_on_write):
    """A column setitem must not make the remaining columns reference each other."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})

    frame["b"] = 100
    before = get_array(frame, "a")
    # Setitem splits the block; if the new blocks referenced each other, the
    # write below would trigger a copy.
    frame.iloc[0, 0] = 100
    assert np.shares_memory(before, get_array(frame, "a"))
def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write):
    """With a live view, writing after a column setitem must copy under CoW."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
    view = frame[:]
    view_expected = frame.copy()

    frame["b"] = 100
    before = get_array(frame, "a")
    # The reference to ``view`` has to survive the setitem above.
    with tm.assert_cow_warning(warn_copy_on_write):
        frame.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not np.shares_memory(before, get_array(frame, "a"))
        tm.assert_frame_equal(view, view_expected)
def test_setitem_with_view_invalidated_does_not_copy(
    using_copy_on_write, warn_copy_on_write, request
):
    """Once the view is released, writing after a setitem should not copy."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
    view = frame[:]

    frame["b"] = 100
    before = get_array(frame, "a")
    view = None  # noqa: F841
    # TODO(CoW-warn) false positive? -> block gets split because of
    # `frame["b"] = 100` which introduces additional refs, even when those of
    # `view` go out of scope
    with tm.assert_cow_warning(warn_copy_on_write):
        frame.iloc[0, 0] = 100
    if using_copy_on_write:
        # Setitem split the block. Since the old block shared data with view,
        # all the new blocks reference view and each other. When view goes
        # out of scope they no longer share data with any other block, so no
        # copy should be triggered. Currently expected to fail.
        xfail_mark = pytest.mark.xfail(
            reason="blk.delete does not track references correctly"
        )
        request.applymarker(xfail_mark)
        assert np.shares_memory(before, get_array(frame, "a"))
def test_out_of_scope(using_copy_on_write):
    """A subset outlives its parent frame without keeping references."""

    def make_subset():
        parent = DataFrame({"a": [1, 2], "b": 1.5, "c": 1})
        # The parent goes out of scope as soon as this function returns.
        return parent[["a", "b"]]

    result = make_subset()
    if using_copy_on_write:
        for position in (0, 1):
            assert not result._mgr.blocks[position].refs.has_reference()
def test_delete(using_copy_on_write):
    """Deleting a column must not leave references on the remaining blocks."""
    values = np.random.default_rng(2).standard_normal((4, 3))
    frame = DataFrame(values, columns=["a", "b", "c"])

    del frame["b"]
    if using_copy_on_write:
        for position in (0, 1):
            assert not frame._mgr.blocks[position].refs.has_reference()

    # Rebinding to a subset releases the previous frame.
    frame = frame[["a"]]
    if using_copy_on_write:
        assert not frame._mgr.blocks[0].refs.has_reference()
def test_delete_reference(using_copy_on_write):
    """Deleting a column keeps the references to a still-alive view."""
    values = np.random.default_rng(2).standard_normal((4, 3))
    frame = DataFrame(values, columns=["a", "b", "c"])
    view = frame[:]

    del frame["b"]
    if using_copy_on_write:
        for position in (0, 1):
            assert frame._mgr.blocks[position].refs.has_reference()
        assert view._mgr.blocks[0].refs.has_reference()

View File

@ -0,0 +1,396 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
concat,
merge,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_concat_frames(using_copy_on_write):
    """Column-wise concat of frames shares memory lazily under CoW."""
    left = DataFrame({"b": ["a"] * 3})
    right = DataFrame({"a": ["a"] * 3})
    left_orig = left.copy()
    result = concat([left, right], axis=1)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("b", left)
        assert shares("a", right)
    else:
        assert not shares("b", left)
        assert not shares("a", right)

    # Writing into one column only detaches that column.
    result.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares("b", left)
        assert shares("a", right)

    result.iloc[0, 1] = "d"
    if using_copy_on_write:
        assert not shares("a", right)
    tm.assert_frame_equal(left, left_orig)
def test_concat_frames_updating_input(using_copy_on_write):
    """Writing into concat inputs must not change the concatenated result."""
    left = DataFrame({"b": ["a"] * 3})
    right = DataFrame({"a": ["a"] * 3})
    result = concat([left, right], axis=1)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("b", left)
        assert shares("a", right)
    else:
        assert not shares("b", left)
        assert not shares("a", right)

    expected = result.copy()
    left.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares("b", left)
        assert shares("a", right)

    right.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares("a", right)
    tm.assert_frame_equal(result, expected)
def test_concat_series(using_copy_on_write):
    """Column-wise concat of Series shares memory lazily under CoW."""
    first = Series([1, 2], name="a")
    second = Series([3, 4], name="b")
    first_orig = first.copy()
    second_orig = second.copy()
    result = concat([first, second], axis=1)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), parent.values)

    if using_copy_on_write:
        assert shares("a", first)
        assert shares("b", second)
    else:
        assert not shares("a", first)
        assert not shares("b", second)

    # Writing into one column only detaches that column.
    result.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not shares("a", first)
        assert shares("b", second)

    result.iloc[0, 1] = 1000
    if using_copy_on_write:
        assert not shares("b", second)
    tm.assert_series_equal(first, first_orig)
    tm.assert_series_equal(second, second_orig)
def test_concat_frames_chained(using_copy_on_write):
    """Nested concat calls keep tracking the original input frames."""
    df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    df2 = DataFrame({"c": [4, 5, 6]})
    df3 = DataFrame({"d": [4, 5, 6]})
    inner = concat([df1, df2], axis=1)
    result = concat([inner, df3], axis=1)
    expected = result.copy()

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", df1)
        assert shares("c", df2)
        assert shares("d", df3)
    else:
        assert not shares("a", df1)
        assert not shares("c", df2)
        assert not shares("d", df3)

    df1.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not shares("a", df1)
    tm.assert_frame_equal(result, expected)
def test_concat_series_chained(using_copy_on_write):
    """Nested concat calls keep tracking the original input Series."""
    ser1 = Series([1, 2, 3], name="a")
    ser2 = Series([4, 5, 6], name="c")
    ser3 = Series([4, 5, 6], name="d")
    inner = concat([ser1, ser2], axis=1)
    result = concat([inner, ser3], axis=1)
    expected = result.copy()

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", ser1)
        assert shares("c", ser2)
        assert shares("d", ser3)
    else:
        assert not shares("a", ser1)
        assert not shares("c", ser2)
        assert not shares("d", ser3)

    ser1.iloc[0] = 100
    if using_copy_on_write:
        assert not shares("a", ser1)
    tm.assert_frame_equal(result, expected)
def test_concat_series_updating_input(using_copy_on_write):
    """Writing into concat input Series must not change the result."""
    first = Series([1, 2], name="a")
    second = Series([3, 4], name="b")
    expected = DataFrame({"a": [1, 2], "b": [3, 4]})
    result = concat([first, second], axis=1)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", first)
        assert shares("b", second)
    else:
        assert not shares("a", first)
        assert not shares("b", second)

    first.iloc[0] = 100
    if using_copy_on_write:
        assert not shares("a", first)
        assert shares("b", second)
    tm.assert_frame_equal(result, expected)

    second.iloc[0] = 1000
    if using_copy_on_write:
        assert not shares("b", second)
    tm.assert_frame_equal(result, expected)
def test_concat_mixed_series_frame(using_copy_on_write):
    """Concat of a frame and a Series shares memory lazily with both inputs."""
    frame = DataFrame({"a": [1, 2, 3], "c": 1})
    extra = Series([4, 5, 6], name="d")
    result = concat([frame, extra], axis=1)
    expected = result.copy()

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", frame)
        assert shares("c", frame)
        assert shares("d", extra)
    else:
        assert not shares("a", frame)
        assert not shares("c", frame)
        assert not shares("d", extra)

    extra.iloc[0] = 100
    if using_copy_on_write:
        assert not shares("d", extra)

    frame.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not shares("a", frame)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("copy", [True, None, False])
def test_concat_copy_keyword(using_copy_on_write, copy):
    """Check how the ``copy`` keyword of concat interacts with CoW."""
    ints = DataFrame({"a": [1, 2]})
    floats = DataFrame({"b": [1.5, 2.5]})
    result = concat([ints, floats], axis=1, copy=copy)

    shared_a = np.shares_memory(get_array(ints, "a"), get_array(result, "a"))
    shared_b = np.shares_memory(get_array(floats, "b"), get_array(result, "b"))
    # Memory is shared under CoW, or when copy=False is requested explicitly.
    if using_copy_on_write or copy is False:
        assert shared_a
        assert shared_b
    else:
        assert not shared_a
        assert not shared_b
@pytest.mark.parametrize(
    "func",
    [
        lambda df1, df2, **kwargs: df1.merge(df2, **kwargs),
        lambda df1, df2, **kwargs: merge(df1, df2, **kwargs),
    ],
)
def test_merge_on_key(using_copy_on_write, func):
    """Merging on a key column shares memory lazily with both inputs."""
    left = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
    right = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
    left_orig = left.copy()
    right_orig = right.copy()

    result = func(left, right, on="key")

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", left)
        assert shares("b", right)
        # The key column of the result is backed by the left input only.
        assert shares("key", left)
        assert not shares("key", right)
    else:
        assert not shares("a", left)
        assert not shares("b", right)

    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not shares("a", left)
        assert shares("b", right)

    result.iloc[0, 2] = 0
    if using_copy_on_write:
        assert not shares("b", right)
    tm.assert_frame_equal(left, left_orig)
    tm.assert_frame_equal(right, right_orig)
def test_merge_on_index(using_copy_on_write):
    """Merging on the index shares memory lazily with both inputs."""
    left = DataFrame({"a": [1, 2, 3]})
    right = DataFrame({"b": [4, 5, 6]})
    left_orig = left.copy()
    right_orig = right.copy()

    result = merge(left, right, left_index=True, right_index=True)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", left)
        assert shares("b", right)
    else:
        assert not shares("a", left)
        assert not shares("b", right)

    result.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not shares("a", left)
        assert shares("b", right)

    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not shares("b", right)
    tm.assert_frame_equal(left, left_orig)
    tm.assert_frame_equal(right, right_orig)
@pytest.mark.parametrize(
    "func, how",
    [
        (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"),
        (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
    ],
)
def test_merge_on_key_enlarging_one(using_copy_on_write, func, how):
    """Merge where only the shorter input has to be enlarged."""
    longer = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
    shorter = DataFrame({"key": ["a", "b"], "b": [4, 5]})
    longer_orig = longer.copy()
    shorter_orig = shorter.copy()

    result = func(longer, shorter, how=how)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        assert shares("a", longer)
        # The enlarged side is copied and must not keep any references.
        assert not shares("b", shorter)
        assert shorter._mgr._has_no_reference(1)
        assert shorter._mgr._has_no_reference(0)
        assert shares("key", longer) is (how == "left")
        assert not shares("key", shorter)
    else:
        assert not shares("a", longer)
        assert not shares("b", shorter)

    # Column "a" sits at a different position depending on the merge order.
    a_position = 1 if how == "left" else 2
    result.iloc[0, a_position] = 0
    if using_copy_on_write:
        assert not shares("a", longer)
    tm.assert_frame_equal(longer, longer_orig)
    tm.assert_frame_equal(shorter, shorter_orig)
@pytest.mark.parametrize("copy", [True, None, False])
def test_merge_copy_keyword(using_copy_on_write, copy):
    """Check how the ``copy`` keyword of merge interacts with CoW."""
    ints = DataFrame({"a": [1, 2]})
    floats = DataFrame({"b": [3, 4.5]})
    result = ints.merge(floats, copy=copy, left_index=True, right_index=True)

    shared_a = np.shares_memory(get_array(ints, "a"), get_array(result, "a"))
    shared_b = np.shares_memory(get_array(floats, "b"), get_array(result, "b"))
    # Memory is shared under CoW, or when copy=False is requested explicitly.
    if using_copy_on_write or copy is False:
        assert shared_a
        assert shared_b
    else:
        assert not shared_a
        assert not shared_b
def test_join_on_key(using_copy_on_write):
    """Joining on a named index shares memory lazily with both inputs."""
    key_index = Index(["a", "b", "c"], name="key")
    left = DataFrame({"a": [1, 2, 3]}, index=key_index.copy(deep=True))
    right = DataFrame({"b": [4, 5, 6]}, index=key_index.copy(deep=True))
    left_orig = left.copy()
    right_orig = right.copy()

    result = left.join(right, on="key")

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    def shares_index(parent):
        return np.shares_memory(get_array(result.index), get_array(parent.index))

    if using_copy_on_write:
        assert shares("a", left)
        assert shares("b", right)
        # The result index is backed by the left input only.
        assert shares_index(left)
        assert not shares_index(right)
    else:
        assert not shares("a", left)
        assert not shares("b", right)

    result.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not shares("a", left)
        assert shares("b", right)

    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not shares("b", right)
    tm.assert_frame_equal(left, left_orig)
    tm.assert_frame_equal(right, right_orig)
def test_join_multiple_dataframes_on_key(using_copy_on_write):
    """Joining a list of frames shares memory lazily with every input."""
    key_index = Index(["a", "b", "c"], name="key")
    left = DataFrame({"a": [1, 2, 3]}, index=key_index.copy(deep=True))
    others = [
        DataFrame({"b": [4, 5, 6]}, index=key_index.copy(deep=True)),
        DataFrame({"c": [7, 8, 9]}, index=key_index.copy(deep=True)),
    ]
    left_orig = left.copy()
    others_orig = [other.copy() for other in others]

    result = left.join(others)

    def shares(col, parent):
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    def shares_index(parent):
        return np.shares_memory(get_array(result.index), get_array(parent.index))

    if using_copy_on_write:
        assert shares("a", left)
        assert shares("b", others[0])
        assert shares("c", others[1])
        # The result index is backed by the left input only.
        assert shares_index(left)
        assert not shares_index(others[0])
        assert not shares_index(others[1])
    else:
        assert not shares("a", left)
        assert not shares("b", others[0])
        assert not shares("c", others[1])

    # Each write detaches exactly the column that was written to.
    result.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not shares("a", left)
        assert shares("b", others[0])
        assert shares("c", others[1])

    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not shares("b", others[0])
        assert shares("c", others[1])

    result.iloc[0, 2] = 0
    if using_copy_on_write:
        assert not shares("c", others[1])

    tm.assert_frame_equal(left, left_orig)
    for other, other_orig in zip(others, others_orig):
        tm.assert_frame_equal(other, other_orig)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,151 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@td.skip_array_manager_invalid_test
def test_consolidate(using_copy_on_write):
    """Check reference tracking before and after consolidating a viewing subset."""
    # Build an unconsolidated frame: "c" is added as a separate block.
    parent = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    parent["c"] = [4, 5, 6]

    # A viewing subset: each of its blocks references a block of the parent.
    subset = parent[:]
    assert all(blk.refs.has_reference() for blk in subset._mgr.blocks)

    # Consolidate the two int64 blocks of the subset.
    subset._consolidate_inplace()

    # The float64 block is still a view, so it still references the parent.
    assert subset._mgr.blocks[0].refs.has_reference()
    # Equivalent to comparing parent["b"].values with subset["b"].values, but
    # without caching parent["b"].
    assert np.shares_memory(get_array(parent, "b"), get_array(subset, "b"))

    # The new consolidated int64 block does not reference another block.
    assert not subset._mgr.blocks[1].refs.has_reference()

    # The parent is now only linked through the float column.
    parent_blocks = parent._mgr.blocks
    assert not parent_blocks[0].refs.has_reference()
    assert parent_blocks[1].refs.has_reference()
    assert not parent_blocks[2].refs.has_reference()

    # Modifying the subset still must not modify the parent.
    if using_copy_on_write:
        subset.iloc[0, 1] = 0.0
        assert not parent._mgr.blocks[1].refs.has_reference()
        assert parent.loc[0, "b"] == 0.1
@pytest.mark.single_cpu
@td.skip_array_manager_invalid_test
def test_switch_options():
    """The CoW option can be switched within one session.

    This assumes the data is constructed after switching the option.
    """

    def parent_value_after_view_write():
        # Build a fresh frame, write through a view, report what the parent sees.
        parent = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
        view = parent[:]
        view.iloc[0, 0] = 0
        return parent.iloc[0, 0]

    # option_context restores the global option value after the test.
    with pd.option_context("mode.copy_on_write", False):
        # CoW disabled: the parent is updated.
        assert parent_value_after_view_write() == 0

        pd.options.mode.copy_on_write = True
        # CoW enabled: the parent is not updated.
        assert parent_value_after_view_write() == 1

        pd.options.mode.copy_on_write = False
        # CoW disabled again: the parent is updated.
        assert parent_value_after_view_write() == 0
@td.skip_array_manager_invalid_test
@pytest.mark.parametrize("dtype", [np.intp, np.int8])
@pytest.mark.parametrize(
    "locs, arr",
    [
        ([0], np.array([-1, -2, -3])),
        ([1], np.array([-1, -2, -3])),
        ([5], np.array([-1, -2, -3])),
        ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
        ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        # Listed once: a second identical ([1, 3], ...) entry only re-ran
        # the same case.
        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
    ],
)
def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
    """Check ``BlockManager.iset(..., inplace=True)`` only detaches the set columns.

    ``locs`` are the column positions that get overwritten and ``arr`` holds
    the replacement values, cast to ``dtype`` before being set.
    """
    # Nothing currently calls iset with more than 1 loc with inplace=True
    # (that only happens with inplace=False), but ensure that it works.
    df = DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
            "c": [7, 8, 9],
            "d": [10, 11, 12],
            "e": [13, 14, 15],
            "f": ["a", "b", "c"],
        },
    )
    arr = arr.astype(dtype)
    df_orig = df.copy()
    df2 = df.copy(deep=None)  # Trigger a CoW (if enabled, otherwise makes copy)
    df2._mgr.iset(locs, arr, inplace=True)

    # The original frame must never be affected by the in-place set.
    tm.assert_frame_equal(df, df_orig)

    if using_copy_on_write:
        # Columns that were not overwritten keep sharing memory.
        for i, col in enumerate(df.columns):
            if i not in locs:
                assert np.shares_memory(get_array(df, col), get_array(df2, col))
    else:
        # Without CoW ``df2`` is a full copy, so nothing is shared.
        for col in df.columns:
            assert not np.shares_memory(get_array(df, col), get_array(df2, col))
def test_exponential_backoff():
# GH#55518
df = DataFrame({"a": [1, 2, 3]})
for i in range(490):
df.copy(deep=False)
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491
df = DataFrame({"a": [1, 2, 3]})
dfs = [df.copy(deep=False) for i in range(510)]
for i in range(20):
df.copy(deep=False)
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
assert df._mgr.blocks[0].refs.clear_counter == 1000
for i in range(500):
df.copy(deep=False)
# Don't reduce since we still have over 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 1000
dfs = dfs[:300]
for i in range(500):
df.copy(deep=False)
# Reduce since there are less than 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 500

View File

@ -0,0 +1,432 @@
import numpy as np
import pytest
from pandas import (
NA,
ArrowDtype,
DataFrame,
Interval,
NaT,
Series,
Timestamp,
interval_range,
option_context,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
def test_interpolate_no_op(using_copy_on_write, method):
    """Interpolating a frame without missing values shares memory under CoW."""
    frame = DataFrame({"a": [1, 2]})
    frame_orig = frame.copy()

    # Only method="pad" is expected to emit the deprecation warning.
    warn = FutureWarning if method == "pad" else None
    deprecation = "DataFrame.interpolate with method=pad is deprecated"
    with tm.assert_produces_warning(warn, match=deprecation):
        result = frame.interpolate(method=method)

    shared = np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    result.iloc[0, 0] = 100

    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(frame, frame_orig)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_interp_fill_functions(using_copy_on_write, func):
    """ffill/bfill on a frame without missing values share memory under CoW."""
    # Check that these take the same code paths as interpolate
    frame = DataFrame({"a": [1, 2]})
    frame_orig = frame.copy()

    fill = getattr(frame, func)
    result = fill()

    shared = np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    if using_copy_on_write:
        assert shared
    else:
        assert not shared

    result.iloc[0, 0] = 100

    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(frame, frame_orig)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_triggers_copy(using_copy_on_write, vals, func):
    """Filling actual missing values copies and leaves no references behind."""
    frame = DataFrame({"a": vals})
    fill = getattr(frame, func)
    result = fill()

    assert not np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    if using_copy_on_write:
        # Check that we don't have references when triggering a copy
        assert result._mgr._has_no_reference(0)
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals):
    """In-place interpolation of an unreferenced frame reuses the existing buffer."""
    df = DataFrame({"a": vals})
    original_values = get_array(df, "a")

    df.interpolate(method="linear", inplace=True)

    assert np.shares_memory(original_values, get_array(df, "a"))
    if using_copy_on_write:
        # The frame must still be the only owner of its data afterwards
        assert df._mgr._has_no_reference(0)
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write):
    """
    In-place interpolation on a frame that has a live view.

    With CoW the frame must copy before filling, so the view keeps the original
    values and neither object references the other afterwards. Without CoW the
    original buffer is filled in place.
    """
    # Build the frame from the parametrized values; a hard-coded float list
    # here would run the same scenario twice and never cover the datetime case.
    df = DataFrame({"a": vals})
    df_orig = df.copy()
    arr = get_array(df, "a")
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.interpolate(method="linear", inplace=True)

    if using_copy_on_write:
        # Check that copy was triggered in interpolate and that we don't
        # have any references left
        assert not np.shares_memory(arr, get_array(df, "a"))
        tm.assert_frame_equal(df_orig, view)
        assert df._mgr._has_no_reference(0)
        assert view._mgr._has_no_reference(0)
    else:
        assert np.shares_memory(arr, get_array(df, "a"))
@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize("dtype", ["float64", "Float64"])
def test_interp_fill_functions_inplace(
    using_copy_on_write, func, warn_copy_on_write, dtype
):
    """In-place ffill/bfill on a frame with a live view, for numpy and masked floats."""
    # Check that these take the same code paths as interpolate
    df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype)
    df_orig = df.copy()
    arr = get_array(df, "a")
    view = df[:]

    # Only the numpy-backed dtype is expected to warn and to be filled in place.
    is_numpy_float = dtype == "float64"
    with tm.assert_cow_warning(warn_copy_on_write and is_numpy_float):
        getattr(df, func)(inplace=True)

    if using_copy_on_write:
        # Check that copy was triggered in interpolate and that we don't
        # have any references left
        assert not np.shares_memory(arr, get_array(df, "a"))
        tm.assert_frame_equal(df_orig, view)
        assert df._mgr._has_no_reference(0)
        assert view._mgr._has_no_reference(0)
    else:
        assert np.shares_memory(arr, get_array(df, "a")) is is_numpy_float
def test_interpolate_cleaned_fill_method(using_copy_on_write):
    """Linear interpolation on an object column must leave the parent untouched."""
    # Check that "method is set to None" case works correctly
    df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
    df_orig = df.copy()

    msg = "DataFrame.interpolate with object dtype"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.interpolate(method="linear")

    # The result shares its buffer with the parent only under CoW.
    shared = np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    assert shared == using_copy_on_write

    result.iloc[0, 0] = Timestamp("2021-12-31")

    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    tm.assert_frame_equal(df, df_orig)
def test_interpolate_object_convert_no_op(using_copy_on_write):
    """In-place pad on an object column that has nothing to fill or convert."""
    df = DataFrame({"a": ["a", "b", "c"], "b": 1})
    values_before = get_array(df, "a")

    msg = "DataFrame.interpolate with method=pad is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.interpolate(method="pad", inplace=True)

    # Now CoW makes a copy, it should not!
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
        assert np.shares_memory(values_before, get_array(df, "a"))
def test_interpolate_object_convert_copies(using_copy_on_write):
    """In-place pad on an object column holding integers ends up on new memory."""
    df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1})
    values_before = get_array(df, "a")

    msg = "DataFrame.interpolate with method=pad is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.interpolate(method="pad", inplace=True)

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
        assert not np.shares_memory(values_before, get_array(df, "a"))
def test_interpolate_downcast(using_copy_on_write):
    """In-place pad with downcast='infer' keeps the buffer when nothing is downcast."""
    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    values_before = get_array(df, "a")

    msg = "DataFrame.interpolate with method=pad is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.interpolate(method="pad", inplace=True, downcast="infer")

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(df, "a"))
def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write):
    """In-place pad with downcast='infer' on a frame that has a live view."""
    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    df_orig = df.copy()
    values_before = get_array(df, "a")
    view = df[:]

    msg = "DataFrame.interpolate with method=pad is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.interpolate(method="pad", inplace=True, downcast="infer")

    if not using_copy_on_write:
        # Without CoW the view observes the in-place fill.
        tm.assert_frame_equal(df, view)
    else:
        # With CoW the frame copied first, so the view keeps the old values.
        assert df._mgr._has_no_reference(0)
        assert not np.shares_memory(values_before, get_array(df, "a"))
        tm.assert_frame_equal(df_orig, view)
def test_fillna(using_copy_on_write):
    """fillna returns a new frame; writing to it never changes the parent."""
    df = DataFrame({"a": [1.5, np.nan], "b": 1})
    df_orig = df.copy()

    filled = df.fillna(5.5)

    # Column "b" has nothing to fill, so it is shared only under CoW.
    shared_b = np.shares_memory(get_array(df, "b"), get_array(filled, "b"))
    assert shared_b == using_copy_on_write

    filled.iloc[0, 1] = 100
    tm.assert_frame_equal(df_orig, df)
def test_fillna_dict(using_copy_on_write):
    """fillna with a per-column dict only touches the listed column."""
    df = DataFrame({"a": [1.5, np.nan], "b": 1})
    df_orig = df.copy()

    filled = df.fillna({"a": 100.5})

    # The column that is not listed is shared only under CoW.
    shared_b = np.shares_memory(get_array(df, "b"), get_array(filled, "b"))
    assert shared_b == using_copy_on_write
    if using_copy_on_write:
        # The filled column must live on its own memory.
        assert not np.shares_memory(get_array(df, "a"), get_array(filled, "a"))

    filled.iloc[0, 1] = 100
    tm.assert_frame_equal(df_orig, df)
@pytest.mark.parametrize("downcast", [None, False])
def test_fillna_inplace(using_copy_on_write, downcast):
    """In-place fillna on an unreferenced frame keeps both column buffers."""
    df = DataFrame({"a": [1.5, np.nan], "b": 1})
    before = {col: get_array(df, col) for col in ("a", "b")}

    msg = "The 'downcast' keyword in fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.fillna(5.5, inplace=True, downcast=downcast)

    assert np.shares_memory(get_array(df, "a"), before["a"])
    assert np.shares_memory(get_array(df, "b"), before["b"])
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
        assert df._mgr._has_no_reference(1)
def test_fillna_inplace_reference(using_copy_on_write, warn_copy_on_write):
    """In-place fillna on a frame that has a live view."""
    df = DataFrame({"a": [1.5, np.nan], "b": 1})
    df_orig = df.copy()
    arr_a = get_array(df, "a")
    arr_b = get_array(df, "b")
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.fillna(5.5, inplace=True)

    # Column "b" keeps its buffer in every mode.
    assert np.shares_memory(get_array(df, "b"), arr_b)
    if using_copy_on_write:
        # Column "a" was copied before filling, so the view is unaffected.
        assert not np.shares_memory(get_array(df, "a"), arr_a)
        assert view._mgr._has_no_reference(0)
        assert df._mgr._has_no_reference(0)
        tm.assert_frame_equal(view, df_orig)
    else:
        assert np.shares_memory(get_array(df, "a"), arr_a)

    expected = DataFrame({"a": [1.5, 5.5], "b": 1})
    tm.assert_frame_equal(df, expected)
def test_fillna_interval_inplace_reference(using_copy_on_write, warn_copy_on_write):
    """In-place fillna on an interval Series that has a live view."""
    # Set dtype explicitly to avoid implicit cast when setting nan
    ser = Series(
        interval_range(start=0, end=5), name="a", dtype="interval[float64, right]"
    )
    ser.iloc[1] = np.nan

    ser_orig = ser.copy()
    view = ser[:]
    with tm.assert_cow_warning(warn_copy_on_write):
        ser.fillna(value=Interval(left=0, right=5), inplace=True)

    # Compare the left bounds of the series and of its view.
    left_shared = np.shares_memory(
        get_array(ser, "a").left.values, get_array(view, "a").left.values
    )
    if using_copy_on_write:
        assert not left_shared
        tm.assert_series_equal(view, ser_orig)
    else:
        assert left_shared
def test_fillna_series_empty_arg(using_copy_on_write):
    """fillna({}) returns a result that is unaffected by later writes to the parent."""
    ser = Series([1, np.nan, 2])
    ser_orig = ser.copy()

    result = ser.fillna({})

    # Nothing was filled, so the data is shared only under CoW.
    shared = np.shares_memory(get_array(ser), get_array(result))
    assert shared == using_copy_on_write

    ser.iloc[0] = 100.5
    tm.assert_series_equal(ser_orig, result)
def test_fillna_series_empty_arg_inplace(using_copy_on_write):
    """In-place fillna({}) keeps the Series on its original buffer."""
    ser = Series([1, np.nan, 2])
    values_before = get_array(ser)

    ser.fillna({}, inplace=True)

    assert np.shares_memory(get_array(ser), values_before)
    if using_copy_on_write:
        assert ser._mgr._has_no_reference(0)
def test_fillna_ea_noop_shares_memory(
    using_copy_on_write, any_numeric_ea_and_arrow_dtype
):
    """fillna on extension dtypes: the filled column is new, the no-op one may be shared."""
    df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
    df_orig = df.copy()
    df2 = df.fillna(100)

    # Column "a" had a missing value, so it never shares memory.
    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

    # arrow is immutable, so no-ops do not need to copy underlying array
    expect_shared_b = using_copy_on_write or isinstance(df.dtypes.iloc[0], ArrowDtype)
    shared_b = np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
    assert shared_b == expect_shared_b
    if using_copy_on_write:
        assert not df2._mgr._has_no_reference(1)

    tm.assert_frame_equal(df_orig, df)

    df2.iloc[0, 1] = 100
    if using_copy_on_write:
        # The write detached column "b" of the result from the parent.
        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
        assert df2._mgr._has_no_reference(1)
        assert df._mgr._has_no_reference(1)
    tm.assert_frame_equal(df_orig, df)
def test_fillna_inplace_ea_noop_shares_memory(
    using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype
):
    """In-place fillna on extension dtypes while a view of the frame is alive."""
    df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
    df_orig = df.copy()
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.fillna(100, inplace=True)

    if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
    else:
        # MaskedArray can actually respect inplace=True
        assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))

    # Column "b" had nothing to fill and is still shared with the view.
    assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
    if using_copy_on_write:
        assert not df._mgr._has_no_reference(1)
        assert not view._mgr._has_no_reference(1)

    expect_setitem_warning = (
        warn_copy_on_write and "pyarrow" not in any_numeric_ea_and_arrow_dtype
    )
    with tm.assert_cow_warning(expect_setitem_warning):
        df.iloc[0, 1] = 100

    if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write:
        tm.assert_frame_equal(df_orig, view)
    else:
        # we actually have a view
        tm.assert_frame_equal(df, view)
def test_fillna_chained_assignment(using_copy_on_write):
    """In-place fillna on a temporary selection: error under CoW, warnings otherwise."""
    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
    df_orig = df.copy()

    if using_copy_on_write:
        # Neither chained call may modify the parent frame.
        with tm.raises_chained_assignment_error():
            df["a"].fillna(100, inplace=True)
        tm.assert_frame_equal(df, df_orig)

        with tm.raises_chained_assignment_error():
            df[["a"]].fillna(100, inplace=True)
        tm.assert_frame_equal(df, df_orig)
        return

    no_chained_check = ("mode.chained_assignment", None)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        df[["a"]].fillna(100, inplace=True)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        df[df.a > 5].fillna(100, inplace=True)

    with tm.assert_produces_warning(FutureWarning, match="inplace method"):
        df["a"].fillna(100, inplace=True)
@pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"])
def test_interpolate_chained_assignment(using_copy_on_write, func):
    """In-place interpolate/ffill/bfill on a temporary selection."""
    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
    df_orig = df.copy()

    if using_copy_on_write:
        # Neither chained call may modify the parent frame.
        with tm.raises_chained_assignment_error():
            getattr(df["a"], func)(inplace=True)
        tm.assert_frame_equal(df, df_orig)

        with tm.raises_chained_assignment_error():
            getattr(df[["a"]], func)(inplace=True)
        tm.assert_frame_equal(df, df_orig)
        return

    with tm.assert_produces_warning(FutureWarning, match="inplace method"):
        getattr(df["a"], func)(inplace=True)

    no_chained_check = ("mode.chained_assignment", None)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        getattr(df[["a"]], func)(inplace=True)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        getattr(df[df["a"] > 1], func)(inplace=True)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,481 @@
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
option_context,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.mark.parametrize(
    "replace_kwargs",
    [
        {"to_replace": {"a": 1, "b": 4}, "value": -1},
        # Test CoW splits blocks to avoid copying unchanged columns
        {"to_replace": {"a": 1}, "value": -1},
        {"to_replace": {"b": 4}, "value": -1},
        {"to_replace": {"b": {4: 1}}},
        # TODO: Add these in a further optimization
        # We would need to see which columns got replaced in the mask
        # which could be expensive
        # {"to_replace": {"b": 1}},
        # 1
    ],
)
def test_replace(using_copy_on_write, replace_kwargs):
    """Column-wise replace shares unchanged columns under CoW and never alters the parent."""
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
    df_orig = df.copy()

    replaced = df.replace(**replace_kwargs)

    if using_copy_on_write:
        # "b" is shared when it came back unchanged; "c" is shared in every case.
        if (replaced["b"] == df["b"]).all():
            assert np.shares_memory(get_array(replaced, "b"), get_array(df, "b"))
        assert np.shares_memory(get_array(replaced, "c"), get_array(df, "c"))

    # mutating squeezed df triggers a copy-on-write for that column/block
    replaced.loc[0, "c"] = -1
    if using_copy_on_write:
        assert not np.shares_memory(get_array(replaced, "c"), get_array(df, "c"))

    if "a" in replace_kwargs["to_replace"]:
        # Column "a" was already replaced, so a further write reuses its buffer.
        values_a = get_array(replaced, "a")
        replaced.loc[0, "a"] = 100
        assert np.shares_memory(get_array(replaced, "a"), values_a)
    tm.assert_frame_equal(df, df_orig)
def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
    """In-place regex replace on a frame that has a live view."""
    df = DataFrame({"a": ["aaa", "bbb"]})
    df_orig = df.copy()
    view = df[:]
    values_before = get_array(df, "a")

    with tm.assert_cow_warning(warn_copy_on_write):
        df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)

    still_shared = np.shares_memory(values_before, get_array(df, "a"))
    if using_copy_on_write:
        # The frame copied before replacing, so the view keeps the old values.
        assert not still_shared
        assert df._mgr._has_no_reference(0)
        tm.assert_frame_equal(view, df_orig)
    else:
        assert still_shared
def test_replace_regex_inplace(using_copy_on_write):
    """In-place regex replace reuses the buffer; the non-inplace variant copies."""
    df = DataFrame({"a": ["aaa", "bbb"]})
    values_before = get_array(df, "a")

    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(df, "a"))

    # A regular (non-inplace) replace must not touch the parent.
    df_orig = df.copy()
    replaced = df.replace(to_replace=r"^b.*$", value="new", regex=True)
    tm.assert_frame_equal(df_orig, df)
    assert not np.shares_memory(get_array(replaced, "a"), get_array(df, "a"))
def test_replace_regex_inplace_no_op(using_copy_on_write):
    """Regex replace on an integer column, where no pattern can match."""
    df = DataFrame({"a": [1, 2]})
    values_before = get_array(df, "a")

    df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(df, "a"))

    df_orig = df.copy()
    replaced = df.replace(to_replace=r"^x.$", value="new", regex=True)
    tm.assert_frame_equal(df_orig, df)

    # The no-op result shares memory with the parent only under CoW.
    shared = np.shares_memory(get_array(replaced, "a"), get_array(df, "a"))
    assert shared == using_copy_on_write
def test_replace_mask_all_false_second_block(using_copy_on_write):
    """Replace that only matches in the float block; the integer block is untouched."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
    df_orig = df.copy()

    df2 = df.replace(to_replace=1.5, value=55.5)

    # The column that was replaced never shares memory.
    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
    # TODO: Block splitting would allow us to avoid copying b
    shared_c = np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
    assert shared_c == using_copy_on_write

    df2.loc[0, "c"] = 1
    tm.assert_frame_equal(df, df_orig)  # Original is unchanged

    if using_copy_on_write:
        assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
        # TODO: This should split and not copy the whole block
        # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
    """Replace that coerces one column to object while the other stays as it was."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    df_orig = df.copy()

    df2 = df.replace(to_replace=1.5, value="a")

    # No memory assertions when the array manager is used without CoW.
    if using_copy_on_write or not using_array_manager:
        assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

    if using_copy_on_write:
        df2.loc[0, "b"] = 0.5
        tm.assert_frame_equal(df, df_orig)  # Original is unchanged
        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
def test_replace_to_replace_wrong_dtype(using_copy_on_write):
    """Replacing a string in an all-float frame, so no column can match."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    df_orig = df.copy()

    df2 = df.replace(to_replace="xxx", value=1.5)

    # Both columns are shared with the parent only under CoW.
    for col in ("b", "a"):
        shared = np.shares_memory(get_array(df, col), get_array(df2, col))
        assert shared == using_copy_on_write

    df2.loc[0, "b"] = 0.5
    tm.assert_frame_equal(df, df_orig)  # Original is unchanged

    if using_copy_on_write:
        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
def test_replace_list_categorical(using_copy_on_write):
    """List-like replace on a categorical column, in place and as a new frame."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    cat_before = get_array(df, "a")
    msg = (
        r"The behavior of Series\.replace \(and DataFrame.replace\) "
        "with CategoricalDtype"
    )

    # In place: the codes buffer is reused.
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.replace(["c"], value="a", inplace=True)
    assert np.shares_memory(cat_before.codes, get_array(df, "a").codes)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)

    # Not in place: the result gets new codes and the parent is unchanged.
    df_orig = df.copy()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df2 = df.replace(["b"], value="a")
    assert not np.shares_memory(cat_before.codes, get_array(df2, "a").codes)

    tm.assert_frame_equal(df, df_orig)
def test_replace_list_inplace_refs_categorical(using_copy_on_write):
    """In-place list-like replace on a categorical column that has a live view."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    view = df[:]
    df_orig = df.copy()
    msg = (
        r"The behavior of Series\.replace \(and DataFrame.replace\) "
        "with CategoricalDtype"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.replace(["c"], value="a", inplace=True)

    # The codes are not shared in either mode.
    # Without CoW this could be inplace.
    assert not np.shares_memory(
        get_array(view, "a").codes, get_array(df, "a").codes
    )
    if using_copy_on_write:
        tm.assert_frame_equal(df_orig, view)
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(using_copy_on_write, to_replace):
    """
    In-place replace on an unreferenced frame reuses the existing buffer.

    Exercised for a scalar, a one-element list and an empty list as
    ``to_replace``.
    """
    df = DataFrame({"a": [1.5, 2, 3]})
    arr_a = get_array(df, "a")

    # Pass the parametrized value through; a hard-coded 1.5 here would run
    # the scalar scenario three times and skip the list cases.
    df.replace(to_replace=to_replace, value=15.5, inplace=True)

    assert np.shares_memory(get_array(df, "a"), arr_a)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write):
    """In-place replace on a frame that has a live view."""
    df = DataFrame({"a": [1.5, 2, 3]})
    values_before = get_array(df, "a")
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.replace(to_replace=to_replace, value=15.5, inplace=True)

    still_shared = np.shares_memory(get_array(df, "a"), values_before)
    if using_copy_on_write:
        # The frame copied before replacing; neither side references the other.
        assert not still_shared
        assert df._mgr._has_no_reference(0)
        assert view._mgr._has_no_reference(0)
    else:
        assert still_shared
@pytest.mark.parametrize("to_replace", ["a", 100.5])
def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
    """In-place replace that matches nothing keeps the buffer and the references."""
    df = DataFrame({"a": [1.5, 2, 3]})
    values_before = get_array(df, "a")
    view = df[:]

    df.replace(to_replace=to_replace, value=15.5, inplace=True)

    assert np.shares_memory(get_array(df, "a"), values_before)
    if using_copy_on_write:
        # Frame and view still reference each other.
        assert not df._mgr._has_no_reference(0)
        assert not view._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1, [1]])
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
    """In-place replace on a categorical column that has a live view."""
    df = DataFrame({"a": Categorical([1, 2, 3])})
    df_orig = df.copy()
    cat_before = get_array(df, "a")
    view = df[:]

    msg = (
        r"The behavior of Series\.replace \(and DataFrame.replace\) "
        "with CategoricalDtype"
    )
    # A warning is only expected for the replacement value 1.5.
    expected_warning = FutureWarning if val == 1.5 else None
    with tm.assert_produces_warning(expected_warning, match=msg):
        df.replace(to_replace=to_replace, value=val, inplace=True)

    codes_shared = np.shares_memory(get_array(df, "a").codes, cat_before.codes)
    if using_copy_on_write:
        assert not codes_shared
        assert df._mgr._has_no_reference(0)
        assert view._mgr._has_no_reference(0)
        tm.assert_frame_equal(view, df_orig)
    else:
        assert codes_shared
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace(using_copy_on_write, val):
    """In-place replace on an unreferenced categorical column reuses its codes."""
    df = DataFrame({"a": Categorical([1, 2, 3])})
    cat_before = get_array(df, "a")

    msg = (
        r"The behavior of Series\.replace \(and DataFrame.replace\) "
        "with CategoricalDtype"
    )
    # A warning is only expected for the replacement value 1.5.
    expected_warning = FutureWarning if val == 1.5 else None
    with tm.assert_produces_warning(expected_warning, match=msg):
        df.replace(to_replace=1, value=val, inplace=True)

    assert np.shares_memory(get_array(df, "a").codes, cat_before.codes)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)

    expected = DataFrame({"a": Categorical([val, 2, 3])})
    tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical(using_copy_on_write, val):
    """Non-inplace replace on a categorical column produces independent codes."""
    df = DataFrame({"a": Categorical([1, 2, 3])})
    df_orig = df.copy()

    msg = (
        r"The behavior of Series\.replace \(and DataFrame.replace\) "
        "with CategoricalDtype"
    )
    # A warning is only expected for the replacement value 1.5.
    expected_warning = FutureWarning if val == 1.5 else None
    with tm.assert_produces_warning(expected_warning, match=msg):
        df2 = df.replace(to_replace=1, value=val)

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
        assert df2._mgr._has_no_reference(0)
    assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
    tm.assert_frame_equal(df, df_orig)

    # Writing to the result afterwards reuses the result's codes buffer.
    codes_before = get_array(df2, "a").codes
    df2.iloc[0, 0] = 2.0
    assert np.shares_memory(get_array(df2, "a").codes, codes_before)
@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
    """In-place where/mask on a frame that has a live view."""
    df = DataFrame({"a": [1.5, 2, 3]})
    df_orig = df.copy()
    values_before = get_array(df, "a")
    view = df[:]

    method = getattr(df, method)
    # assert_cow_warning(False) yields a no-op context, same as calling bare.
    with tm.assert_cow_warning(warn_copy_on_write):
        method(df["a"] > 1.6, -1, inplace=True)

    still_shared = np.shares_memory(get_array(df, "a"), values_before)
    if using_copy_on_write:
        assert not still_shared
        assert df._mgr._has_no_reference(0)
        assert view._mgr._has_no_reference(0)
        tm.assert_frame_equal(view, df_orig)
    else:
        assert still_shared
def test_replace_empty_list(using_copy_on_write):
    """replace([], []) changes nothing; check memory sharing and references."""
    df = DataFrame({"a": [1, 2]})

    df2 = df.replace([], [])
    shared = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    assert shared == using_copy_on_write
    if using_copy_on_write:
        assert not df._mgr._has_no_reference(0)

    # A second call whose result is discarded must leave the state as it was.
    values_before = get_array(df, "a")
    df.replace([], [])
    if using_copy_on_write:
        assert np.shares_memory(get_array(df, "a"), values_before)
        assert not df._mgr._has_no_reference(0)
        assert not df2._mgr._has_no_reference(0)
@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
    """In-place list-like replace on an unreferenced object column."""
    df = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(df, "a")

    df.replace(["c"], value, inplace=True)

    still_shared = np.shares_memory(values_before, get_array(df, "a"))
    if using_copy_on_write or value is None:
        assert still_shared
    else:
        # This could be inplace
        assert not still_shared

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
def test_replace_list_multiple_elements_inplace(using_copy_on_write):
    """In-place replace of several values at once reuses the existing buffer."""
    df = DataFrame({"a": [1, 2, 3]})
    values_before = get_array(df, "a")

    df.replace([1, 2], 4, inplace=True)

    # The buffer is reused in both modes.
    assert np.shares_memory(values_before, get_array(df, "a"))
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
def test_replace_list_none(using_copy_on_write):
    """Replacing a listed value with None returns independent data."""
    df = DataFrame({"a": ["a", "b", "c"]})
    df_orig = df.copy()

    replaced = df.replace(["b"], value=None)

    tm.assert_frame_equal(df, df_orig)
    assert not np.shares_memory(get_array(df, "a"), get_array(replaced, "a"))
def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write):
    """In-place replace-with-None on a frame that has a live view."""
    df = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(df, "a")
    df_orig = df.copy()
    view = df[:]

    with tm.assert_cow_warning(warn_copy_on_write):
        df.replace(["a"], value=None, inplace=True)

    still_shared = np.shares_memory(values_before, get_array(df, "a"))
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
        assert not still_shared
        tm.assert_frame_equal(df_orig, view)
    else:
        assert still_shared
def test_replace_columnwise_no_op_inplace(using_copy_on_write):
    """In-place column-wise replace that matches nothing, with a live view."""
    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    view = df[:]
    df_orig = df.copy()

    df.replace({"a": 10}, 100, inplace=True)

    if not using_copy_on_write:
        return
    # Nothing was replaced, so the data is still shared with the view ...
    assert np.shares_memory(get_array(view, "a"), get_array(df, "a"))
    # ... and a later write to the frame must not leak into the view.
    df.iloc[0, 0] = 100
    tm.assert_frame_equal(view, df_orig)
def test_replace_columnwise_no_op(using_copy_on_write):
    """Column-wise replace that matches nothing returns a frame safe to mutate."""
    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    df_orig = df.copy()

    replaced = df.replace({"a": 10}, 100)
    if using_copy_on_write:
        assert np.shares_memory(get_array(replaced, "a"), get_array(df, "a"))

    replaced.iloc[0, 0] = 100
    tm.assert_frame_equal(df, df_orig)
def test_replace_chained_assignment(using_copy_on_write):
    """In-place replace on a temporary selection: error under CoW, warnings otherwise."""
    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
    df_orig = df.copy()

    if using_copy_on_write:
        # Neither chained call may modify the parent frame.
        with tm.raises_chained_assignment_error():
            df["a"].replace(1, 100, inplace=True)
        tm.assert_frame_equal(df, df_orig)

        with tm.raises_chained_assignment_error():
            df[["a"]].replace(1, 100, inplace=True)
        tm.assert_frame_equal(df, df_orig)
        return

    no_chained_check = ("mode.chained_assignment", None)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        df[["a"]].replace(1, 100, inplace=True)

    with tm.assert_produces_warning(None), option_context(*no_chained_check):
        df[df.a > 5].replace(1, 100, inplace=True)

    with tm.assert_produces_warning(FutureWarning, match="inplace method"):
        df["a"].replace(1, 100, inplace=True)
def test_replace_listlike(using_copy_on_write):
    """
    List-to-list replace returns data that can be mutated without side effects.

    First a replace whose values do not occur in the frame (shared under CoW,
    copied otherwise), then one that does match (never shared). The parent
    frame must equal its original copy after each step.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    df_orig = df.copy()

    result = df.replace([200, 201], [11, 11])
    if using_copy_on_write:
        assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    else:
        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

    result.iloc[0, 0] = 100
    # Compare against the saved copy; comparing df with itself can never fail.
    tm.assert_frame_equal(df, df_orig)

    result = df.replace([200, 2], [10, 10])
    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
    tm.assert_frame_equal(df, df_orig)
def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write):
    """In-place list-to-list replace, first without and then with a live view."""
    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    values_before = get_array(df, "a")

    # No other reference exists yet, so the buffer is reused.
    df.replace([200, 2], [10, 11], inplace=True)
    assert np.shares_memory(get_array(df, "a"), values_before)

    view = df[:]
    df_orig = df.copy()
    with tm.assert_cow_warning(warn_copy_on_write):
        df.replace([200, 3], [10, 11], inplace=True)

    still_shared = np.shares_memory(get_array(df, "a"), values_before)
    if using_copy_on_write:
        # The view forces a copy, so it keeps the pre-replace values.
        assert not still_shared
        tm.assert_frame_equal(view, df_orig)
    else:
        assert still_shared
        tm.assert_frame_equal(df, view)

View File

@ -0,0 +1,156 @@
import numpy as np
from pandas import (
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for the values that are set in a DataFrame
def test_set_column_with_array():
    """Assigning a numpy array as a new column stores a copy of the array."""
    # Case: setting an array as a new column (df[col] = arr) copies that data
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    new_values = np.array([1, 2, 3], dtype="int64")

    df["c"] = new_values

    # the array data is copied
    assert not np.shares_memory(get_array(df, "c"), new_values)
    # and thus modifying the array does not modify the DataFrame
    new_values[0] = 0
    expected = Series([1, 2, 3], name="c")
    tm.assert_series_equal(df["c"], expected)
def test_set_column_with_series(using_copy_on_write):
    """Assigning a Series as a new column never lets later Series edits leak in."""
    # Case: setting a series as a new column (df[col] = s) copies that data
    # (with delayed copy with CoW)
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    ser = Series([1, 2, 3])

    df["c"] = ser

    # under CoW the data is shared (copy is delayed); otherwise it is copied
    shared = np.shares_memory(get_array(df, "c"), get_array(ser))
    assert shared == using_copy_on_write

    # and modifying the series does not modify the DataFrame
    ser.iloc[0] = 0
    assert ser.iloc[0] == 0
    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
def test_set_column_with_index(using_copy_on_write):
    """Assigning an Index or RangeIndex as a new column stores a copy."""
    # Case: setting an index as a new column (df[col] = idx) copies that data
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    plain_index = Index([1, 2, 3])
    df["c"] = plain_index
    # the index data is copied
    assert not np.shares_memory(get_array(df, "c"), plain_index.values)

    range_index = RangeIndex(1, 4)
    range_values = range_index.values
    df["d"] = range_index
    assert not np.shares_memory(get_array(df, "d"), range_values)
def test_set_columns_with_dataframe(using_copy_on_write):
    """Assigning a DataFrame to several columns never lets later edits leak in."""
    # Case: setting a DataFrame as new columns copies that data
    # (with delayed copy with CoW)
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]})

    df[["c", "d"]] = df2

    # under CoW the data is shared (copy is delayed); otherwise it is copied
    shared = np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
    assert shared == using_copy_on_write

    # and modifying the set DataFrame does not modify the original DataFrame
    df2.iloc[0, 0] = 0
    tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c"))
def test_setitem_series_no_copy(using_copy_on_write):
    """Adding a Series as a new column: writes to the frame leave the Series intact."""
    # Case: setting a Series as column into a DataFrame can delay copying that data
    df = DataFrame({"a": [1, 2, 3]})
    rhs = Series([4, 5, 6])
    rhs_orig = rhs.copy()

    # adding a new column
    df["b"] = rhs
    if using_copy_on_write:
        assert np.shares_memory(get_array(rhs), get_array(df, "b"))

    # writing through the frame must not reach the Series
    df.iloc[0, 1] = 100
    tm.assert_series_equal(rhs, rhs_orig)
def test_setitem_series_no_copy_single_block(using_copy_on_write):
    """Overwriting a column with a Series: frame writes leave the Series intact."""
    # Overwriting an existing column that is a single block
    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    rhs = Series([4, 5, 6])
    rhs_orig = rhs.copy()

    df["a"] = rhs
    if using_copy_on_write:
        assert np.shares_memory(get_array(rhs), get_array(df, "a"))

    # writing through the frame must not reach the Series
    df.iloc[0, 0] = 100
    tm.assert_series_equal(rhs, rhs_orig)
def test_setitem_series_no_copy_split_block(using_copy_on_write):
    """Overwriting one column of a shared block: frame writes leave the Series intact."""
    # Overwriting an existing column that is part of a larger block
    df = DataFrame({"a": [1, 2, 3], "b": 1})
    rhs = Series([4, 5, 6])
    rhs_orig = rhs.copy()

    df["b"] = rhs
    if using_copy_on_write:
        assert np.shares_memory(get_array(rhs), get_array(df, "b"))

    # writing through the frame must not reach the Series
    df.iloc[0, 1] = 100
    tm.assert_series_equal(rhs, rhs_orig)
def test_setitem_series_column_midx_broadcasting(using_copy_on_write):
    """Assigning one Series to a top-level MultiIndex key that spans two columns."""
    # Setting a Series to multiple columns will repeat the data
    # (currently copying the data eagerly)
    columns = MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]])
    df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=columns)
    rhs = Series([10, 11])

    df["a"] = rhs

    assert not np.shares_memory(get_array(rhs), df._get_column_array(0))
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
def test_set_column_with_inplace_operator(using_copy_on_write, warn_copy_on_write):
    """Augmented assignment on a column: silent via df[col], may warn via a Series."""
    data = {"a": [1, 2, 3], "b": [4, 5, 6]}
    df = DataFrame(data)

    # this should not raise any warning
    with tm.assert_produces_warning(None):
        df["a"] += 1

    # when it is not in a chain, then it should produce a warning
    df = DataFrame(data)
    ser = df["a"]
    with tm.assert_cow_warning(warn_copy_on_write):
        ser += 1

View File

@ -0,0 +1,14 @@
import numpy as np
from pandas import DataFrame
from pandas.tests.copy_view.util import get_array
def test_get_array_numpy():
    """Two get_array calls on the same numpy-backed column share memory."""
    df = DataFrame({"a": [1, 2, 3]})
    first = get_array(df, "a")
    second = get_array(df, "a")
    assert np.shares_memory(first, second)
def test_get_array_masked():
    """Two get_array calls on the same Int64 (masked) column share memory."""
    df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
    first = get_array(df, "a")
    second = get_array(df, "a")
    assert np.shares_memory(first, second)

View File

@ -0,0 +1,30 @@
from pandas import (
Categorical,
Index,
Series,
)
from pandas.core.arrays import BaseMaskedArray
def get_array(obj, col=None):
    """
    Return the array backing an Index, a Series or a single DataFrame column.

    Equivalent of df[col].values, but it avoids the regular getitem path,
    because getitem triggers reference tracking / CoW, which may be exactly
    what the calling test wants to observe from some other operation.

    ``col`` is required for a DataFrame. For a Series it may be omitted or
    must equal the Series name. Masked arrays are unwrapped to their ``_data``,
    a Categorical is returned as is, and any other array is replaced by its
    ``_ndarray`` attribute when it has one.
    """
    takes_values_directly = isinstance(obj, Index) or (
        isinstance(obj, Series) and (col is None or obj.name == col)
    )
    if takes_values_directly:
        values = obj._values
    else:
        # Column lookup by position
        assert col is not None
        position = obj.columns.get_loc(col)
        assert isinstance(position, int)
        values = obj._get_column_array(position)

    if isinstance(values, BaseMaskedArray):
        return values._data
    if isinstance(values, Categorical):
        return values
    return getattr(values, "_ndarray", values)