Updated script that can be controlled by a Node.js web app
@@ -0,0 +1,604 @@
from datetime import (
    datetime,
    timezone,
)

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT
from pandas.compat import (
    is_ci_environment,
    is_platform_windows,
)
from pandas.compat.numpy import np_version_lt1p23

import pandas as pd
import pandas._testing as tm
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
    ColumnNullType,
    DtypeKind,
)
from pandas.core.interchange.from_dataframe import from_dataframe
from pandas.core.interchange.utils import ArrowCTypes


@pytest.fixture
def data_categorical():
    return {
        "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
        "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
    }


@pytest.fixture
def string_data():
    return {
        "separator data": [
            "abC|DeF,Hik",
            "234,3245.67",
            "gSaf,qWer|Gre",
            "asd3,4sad|",
            np.nan,
        ]
    }


@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
def test_categorical_dtype(data, data_categorical):
    df = pd.DataFrame({"A": (data_categorical[data[0]])})

    col = df.__dataframe__().get_column_by_name("A")
    assert col.dtype[0] == DtypeKind.CATEGORICAL
    assert col.null_count == 0
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
    assert col.num_chunks() == 1
    desc_cat = col.describe_categorical
    assert desc_cat["is_ordered"] == data[1]
    assert desc_cat["is_dictionary"] is True
    assert isinstance(desc_cat["categories"], PandasColumn)
    tm.assert_series_equal(
        desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
    )

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
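

# A minimal sketch (illustrative, not part of the test suite) of the
# round-trip pattern the tests in this file exercise: any object exposing
# ``__dataframe__`` can be rebuilt as a pandas DataFrame via ``from_dataframe``.
def _roundtrip_sketch():
    df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"], ordered=True)})
    interchange_obj = df.__dataframe__()  # protocol-level view of the frame
    roundtripped = from_dataframe(interchange_obj)  # back to a concrete DataFrame
    assert roundtripped["A"].dtype == "category"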


def test_categorical_pyarrow():
    # GH 49889
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
    table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    weekday = pd.Categorical(
        arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    expected = pd.DataFrame({"weekday": weekday})
    tm.assert_frame_equal(result, expected)


def test_empty_categorical_pyarrow():
    # https://github.com/pandas-dev/pandas/issues/53077
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = [None]
    table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
    exchange_df = table.__dataframe__()
    result = pd.api.interchange.from_dataframe(exchange_df)
    expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
    tm.assert_frame_equal(result, expected)


def test_large_string_pyarrow():
    # GH 52795
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = ["Mon", "Tue"]
    table = pa.table({"weekday": pa.array(arr, "large_string")})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
    tm.assert_frame_equal(result, expected)

    # check round-trip
    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


@pytest.mark.parametrize(
    ("offset", "length", "expected_values"),
    [
        (0, None, [3.3, float("nan"), 2.1]),
        (1, None, [float("nan"), 2.1]),
        (2, None, [2.1]),
        (0, 2, [3.3, float("nan")]),
        (0, 1, [3.3]),
        (1, 1, [float("nan")]),
    ],
)
def test_bitmasks_pyarrow(offset, length, expected_values):
    # GH 52795
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = [3.3, None, 2.1]
    table = pa.table({"arr": arr}).slice(offset, length)
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    expected = pd.DataFrame({"arr": expected_values})
    tm.assert_frame_equal(result, expected)

    # check round-trip
    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
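

# Illustrative sketch (not a vendored test): slicing an Arrow table at a
# non-zero offset is what forces the validity-bitmask/offset handling that
# test_bitmasks_pyarrow checks above. Assumes pyarrow >= 11 is installed.
def _sliced_bitmask_sketch():
    pa = pytest.importorskip("pyarrow", "11.0.0")
    table = pa.table({"arr": [3.3, None, 2.1]}).slice(1, 2)  # offset=1, length=2
    result = from_dataframe(table.__dataframe__())
    assert result["arr"].isna().tolist() == [True, False]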


@pytest.mark.parametrize(
    "data",
    [
        lambda: np.random.default_rng(2).integers(-100, 100),
        lambda: np.random.default_rng(2).integers(1, 100),
        lambda: np.random.default_rng(2).random(),
        lambda: np.random.default_rng(2).choice([True, False]),
        lambda: datetime(
            year=np.random.default_rng(2).integers(1900, 2100),
            month=np.random.default_rng(2).integers(1, 12),
            day=np.random.default_rng(2).integers(1, 20),
        ),
    ],
)
def test_dataframe(data):
    NCOLS, NROWS = 10, 20
    data = {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [data() for _ in range(NROWS)]
        for i in range(NCOLS)
    }
    df = pd.DataFrame(data)

    df2 = df.__dataframe__()

    assert df2.num_columns() == NCOLS
    assert df2.num_rows() == NROWS

    assert list(df2.column_names()) == list(data.keys())

    indices = (0, 2)
    names = tuple(list(data.keys())[idx] for idx in indices)

    result = from_dataframe(df2.select_columns(indices))
    expected = from_dataframe(df2.select_columns_by_name(names))
    tm.assert_frame_equal(result, expected)

    assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
    assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)


def test_missing_from_masked():
    df = pd.DataFrame(
        {
            "x": np.array([1.0, 2.0, 3.0, 4.0, 0.0]),
            "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
            "z": np.array([1.0, 0.0, 1.0, 1.0, 1.0]),
        }
    )

    rng = np.random.default_rng(2)
    dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
    for col, num_nulls in dict_null.items():
        null_idx = df.index[
            rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
        ]
        df.loc[null_idx, col] = None

    df2 = df.__dataframe__()

    assert df2.get_column_by_name("x").null_count == dict_null["x"]
    assert df2.get_column_by_name("y").null_count == dict_null["y"]
    assert df2.get_column_by_name("z").null_count == dict_null["z"]


@pytest.mark.parametrize(
    "data",
    [
        {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
        {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
        {
            "x": np.array([True, True, False]),
            "y": np.array([1, 2, 0]),
            "z": np.array([9.2, 10.5, 11.8]),
        },
    ],
)
def test_mixed_data(data):
    df = pd.DataFrame(data)
    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 0


def test_mixed_missing():
    df = pd.DataFrame(
        {
            "x": np.array([True, None, False, None, True]),
            "y": np.array([None, 2, None, 1, 2]),
            "z": np.array([9.2, 10.5, None, 11.8, None]),
        }
    )

    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 2


def test_string(string_data):
    test_str_data = string_data["separator data"] + [""]
    df = pd.DataFrame({"A": test_str_data})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 6
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)

    df_sliced = df[1:]
    col = df_sliced.__dataframe__().get_column_by_name("A")
    assert col.size() == 5
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)


def test_nonstring_object():
    df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
    col = df.__dataframe__().get_column_by_name("A")
    with pytest.raises(NotImplementedError, match="not supported yet"):
        col.dtype


def test_datetime():
    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 2
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.DATETIME
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


@pytest.mark.skipif(np_version_lt1p23, reason="NumPy >= 1.23 required")
def test_categorical_to_numpy_dlpack():
    # https://github.com/pandas-dev/pandas/issues/48393
    df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
    col = df.__dataframe__().get_column_by_name("A")
    result = np.from_dlpack(col.get_buffers()["data"][0])
    expected = np.array([0, 1, 0], dtype="int8")
    tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("data", [{}, {"a": []}])
|
||||
def test_empty_pyarrow(data):
|
||||
# GH 53155
|
||||
pytest.importorskip("pyarrow", "11.0.0")
|
||||
from pyarrow.interchange import from_dataframe as pa_from_dataframe
|
||||
|
||||
expected = pd.DataFrame(data)
|
||||
arrow_df = pa_from_dataframe(expected)
|
||||
result = from_dataframe(arrow_df)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_chunk_pyarrow() -> None:
|
||||
pa = pytest.importorskip("pyarrow", "11.0.0")
|
||||
n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
|
||||
names = ["n_legs"]
|
||||
table = pa.table([n_legs], names=names)
|
||||
with pytest.raises(
|
||||
RuntimeError,
|
||||
match="To join chunks a copy is required which is "
|
||||
"forbidden by allow_copy=False",
|
||||
):
|
||||
pd.api.interchange.from_dataframe(table, allow_copy=False)
|
||||
|
||||
|
||||
def test_multi_chunk_column() -> None:
|
||||
pytest.importorskip("pyarrow", "11.0.0")
|
||||
ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]")
|
||||
df = pd.concat([ser, ser], ignore_index=True).to_frame("a")
|
||||
df_orig = df.copy()
|
||||
with pytest.raises(
|
||||
RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False"
|
||||
):
|
||||
pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
|
||||
result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
|
||||
# Interchange protocol defaults to creating numpy-backed columns, so currently this
|
||||
# is 'float64'.
|
||||
expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Check that the rechunking we did didn't modify the original DataFrame.
|
||||
tm.assert_frame_equal(df, df_orig)
|
||||
assert len(df["a"].array._pa_array.chunks) == 2
|
||||
assert len(df_orig["a"].array._pa_array.chunks) == 2
|
||||
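

# Illustrative sketch (not a vendored test): consolidating a multi-chunk
# Arrow column requires a copy, so ``allow_copy=False`` raises while
# ``allow_copy=True`` succeeds, as the two tests above assert.
def _allow_copy_sketch():
    pa = pytest.importorskip("pyarrow", "11.0.0")
    table = pa.table({"n": pa.chunked_array([[1, 2], [3]])})
    pd.api.interchange.from_dataframe(table, allow_copy=True)  # copies the chunks
    with pytest.raises(RuntimeError):
        pd.api.interchange.from_dataframe(table, allow_copy=False)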


def test_timestamp_ns_pyarrow():
    # GH 56712
    pytest.importorskip("pyarrow", "11.0.0")
    timestamp_args = {
        "year": 2000,
        "month": 1,
        "day": 1,
        "hour": 1,
        "minute": 1,
        "second": 1,
    }
    df = pd.Series(
        [datetime(**timestamp_args)],
        dtype="timestamp[ns][pyarrow]",
        name="col0",
    ).to_frame()

    dfi = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(dfi)["col0"].item()

    expected = pd.Timestamp(**timestamp_args)
    assert result == expected


@pytest.mark.parametrize("tz", ["UTC", "US/Pacific"])
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_datetimetzdtype(tz, unit):
    # GH 54239
    tz_data = (
        pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
    )
    df = pd.DataFrame({"ts_tz": tz_data})
    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


def test_interchange_from_non_pandas_tz_aware(request):
    # GH 54239, 54287
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.compute as pc

    if is_platform_windows() and is_ci_environment():
        mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                "on CI to path to the tzdata for pyarrow."
            ),
        )
        request.applymarker(mark)

    arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)])
    arr = pc.assume_timezone(arr, "Asia/Kathmandu")
    table = pa.table({"arr": arr})
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)

    expected = pd.DataFrame(
        ["2020-01-01 00:00:00+05:45", "NaT", "2020-01-02 00:00:00+05:45"],
        columns=["arr"],
        dtype="datetime64[us, Asia/Kathmandu]",
    )
    tm.assert_frame_equal(expected, result)


def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
    # https://github.com/pandas-dev/pandas/issues/54781
    df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
    interchange = df.__dataframe__()
    column = interchange.get_column_by_name("a")
    buffers = column.get_buffers()
    buffers_data = buffers["data"]
    buffer_dtype = buffers_data[1]
    buffer_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        buffer_dtype[3],
    )
    buffers["data"] = (buffers_data[0], buffer_dtype)
    column.get_buffers = lambda: buffers
    interchange.get_column_by_name = lambda _: column
    monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
    pd.api.interchange.from_dataframe(df)


def test_empty_string_column():
    # https://github.com/pandas-dev/pandas/issues/56703
    df = pd.DataFrame({"a": []}, dtype=str)
    df2 = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(df2)
    tm.assert_frame_equal(df, result)


def test_large_string():
    # GH#56702
    pytest.importorskip("pyarrow")
    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
    result = pd.api.interchange.from_dataframe(df.__dataframe__())
    expected = pd.DataFrame({"a": ["x"]}, dtype="object")
    tm.assert_frame_equal(result, expected)


def test_non_str_names():
    # https://github.com/pandas-dev/pandas/issues/56701
    df = pd.Series([1, 2, 3], name=0).to_frame()
    names = df.__dataframe__().column_names()
    assert names == ["0"]


def test_non_str_names_w_duplicates():
    # https://github.com/pandas-dev/pandas/issues/56701
    df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
    dfi = df.__dataframe__()
    with pytest.raises(
        TypeError,
        match=(
            "Expected a Series, got a DataFrame. This likely happened because you "
            "called __dataframe__ on a DataFrame which, after converting column "
            r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
            r"dtype='object'\). Please rename these columns before using the "
            "interchange protocol."
        ),
    ):
        pd.api.interchange.from_dataframe(dfi, allow_copy=False)


@pytest.mark.parametrize(
    ("data", "dtype", "expected_dtype"),
    [
        ([1, 2, None], "Int64", "int64"),
        ([1, 2, None], "Int64[pyarrow]", "int64"),
        ([1, 2, None], "Int8", "int8"),
        ([1, 2, None], "Int8[pyarrow]", "int8"),
        (
            [1, 2, None],
            "UInt64",
            "uint64",
        ),
        (
            [1, 2, None],
            "UInt64[pyarrow]",
            "uint64",
        ),
        ([1.0, 2.25, None], "Float32", "float32"),
        ([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
        ([True, False, None], "boolean", "bool"),
        ([True, False, None], "boolean[pyarrow]", "bool"),
        (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"),
        (["much ado", "about", None], "string[pyarrow]", "large_string"),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
            "timestamp[ns][pyarrow]",
            "timestamp[ns]",
        ),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
            "timestamp[us][pyarrow]",
            "timestamp[us]",
        ),
        (
            [
                datetime(2020, 1, 1, tzinfo=timezone.utc),
                datetime(2020, 1, 2, tzinfo=timezone.utc),
                None,
            ],
            "timestamp[us, Asia/Kathmandu][pyarrow]",
            "timestamp[us, tz=Asia/Kathmandu]",
        ),
    ],
)
def test_pandas_nullable_with_missing_values(
    data: list, dtype: str, expected_dtype: str
) -> None:
    # https://github.com/pandas-dev/pandas/issues/57643
    # https://github.com/pandas-dev/pandas/issues/57664
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

    df = pd.DataFrame({"a": data}, dtype=dtype)
    result = pai.from_dataframe(df.__dataframe__())["a"]
    assert result.type == expected_dtype
    assert result[0].as_py() == data[0]
    assert result[1].as_py() == data[1]
    assert result[2].as_py() is None
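

# Illustrative sketch (not a vendored test): pandas nullable ("masked") dtypes
# export their missing values through the protocol, and pyarrow reads them
# back as nulls; the parametrized tests above and below pin down the exact
# Arrow types produced.
def _nullable_export_sketch():
    pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

    df = pd.DataFrame({"a": [1, 2, None]}, dtype="Int64")
    result = pai.from_dataframe(df.__dataframe__())["a"]
    assert result.null_count == 1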


@pytest.mark.parametrize(
    ("data", "dtype", "expected_dtype"),
    [
        ([1, 2, 3], "Int64", "int64"),
        ([1, 2, 3], "Int64[pyarrow]", "int64"),
        ([1, 2, 3], "Int8", "int8"),
        ([1, 2, 3], "Int8[pyarrow]", "int8"),
        (
            [1, 2, 3],
            "UInt64",
            "uint64",
        ),
        (
            [1, 2, 3],
            "UInt64[pyarrow]",
            "uint64",
        ),
        ([1.0, 2.25, 5.0], "Float32", "float32"),
        ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
        ([True, False, False], "boolean", "bool"),
        ([True, False, False], "boolean[pyarrow]", "bool"),
        (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"),
        (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
            "timestamp[ns][pyarrow]",
            "timestamp[ns]",
        ),
        (
            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
            "timestamp[us][pyarrow]",
            "timestamp[us]",
        ),
        (
            [
                datetime(2020, 1, 1, tzinfo=timezone.utc),
                datetime(2020, 1, 2, tzinfo=timezone.utc),
                datetime(2020, 1, 3, tzinfo=timezone.utc),
            ],
            "timestamp[us, Asia/Kathmandu][pyarrow]",
            "timestamp[us, tz=Asia/Kathmandu]",
        ),
    ],
)
def test_pandas_nullable_without_missing_values(
    data: list, dtype: str, expected_dtype: str
) -> None:
    # https://github.com/pandas-dev/pandas/issues/57643
    pa = pytest.importorskip("pyarrow", "11.0.0")
    import pyarrow.interchange as pai

    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

    df = pd.DataFrame({"a": data}, dtype=dtype)
    result = pai.from_dataframe(df.__dataframe__())["a"]
    assert result.type == expected_dtype
    assert result[0].as_py() == data[0]
    assert result[1].as_py() == data[1]
    assert result[2].as_py() == data[2]


def test_string_validity_buffer() -> None:
    # https://github.com/pandas-dev/pandas/issues/57761
    pytest.importorskip("pyarrow", "11.0.0")
    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
    result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
    assert result is None


def test_string_validity_buffer_no_missing() -> None:
    # https://github.com/pandas-dev/pandas/issues/57762
    pytest.importorskip("pyarrow", "11.0.0")
    df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]")
    validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
    assert validity is not None
    result = validity[1]
    expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=")
    assert result == expected


def test_empty_dataframe():
    # https://github.com/pandas-dev/pandas/issues/56700
    df = pd.DataFrame({"a": []}, dtype="int8")
    dfi = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
    expected = pd.DataFrame({"a": []}, dtype="int8")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,175 @@
"""
A verbatim copy (vendored) of the spec tests.
Taken from https://github.com/data-apis/dataframe-api
"""
import ctypes
import math

import pytest

import pandas as pd


@pytest.fixture
def df_from_dict():
    def maker(dct, is_categorical=False):
        df = pd.DataFrame(dct)
        return df.astype("category") if is_categorical else df

    return maker


@pytest.mark.parametrize(
    "test_data",
    [
        {"a": ["foo", "bar"], "b": ["baz", "qux"]},
        {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]},
        {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]},
    ],
    ids=["str_data", "float_data", "int_data"],
)
def test_only_one_dtype(test_data, df_from_dict):
    columns = list(test_data.keys())
    df = df_from_dict(test_data)
    dfX = df.__dataframe__()

    column_size = len(test_data[columns[0]])
    for column in columns:
        null_count = dfX.get_column_by_name(column).null_count
        assert null_count == 0
        assert isinstance(null_count, int)
        assert dfX.get_column_by_name(column).size() == column_size
        assert dfX.get_column_by_name(column).offset == 0


def test_mixed_dtypes(df_from_dict):
    df = df_from_dict(
        {
            "a": [1, 2, 3],  # dtype kind INT = 0
            "b": [3, 4, 5],  # dtype kind INT = 0
            "c": [1.5, 2.5, 3.5],  # dtype kind FLOAT = 2
            "d": [9, 10, 11],  # dtype kind INT = 0
            "e": [True, False, True],  # dtype kind BOOLEAN = 20
            "f": ["a", "", "c"],  # dtype kind STRING = 21
        }
    )
    dfX = df.__dataframe__()
    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
    # file is expected to be vendored *anywhere*;
    # values for dtype[0] are explained above
    columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21}

    for column, kind in columns.items():
        colX = dfX.get_column_by_name(column)
        assert colX.null_count == 0
        assert isinstance(colX.null_count, int)
        assert colX.size() == 3
        assert colX.offset == 0

        assert colX.dtype[0] == kind

    assert dfX.get_column_by_name("c").dtype[1] == 64


def test_na_float(df_from_dict):
    df = df_from_dict({"a": [1.0, math.nan, 2.0]})
    dfX = df.__dataframe__()
    colX = dfX.get_column_by_name("a")
    assert colX.null_count == 1
    assert isinstance(colX.null_count, int)


def test_noncategorical(df_from_dict):
    df = df_from_dict({"a": [1, 2, 3]})
    dfX = df.__dataframe__()
    colX = dfX.get_column_by_name("a")
    with pytest.raises(TypeError, match=".*categorical.*"):
        colX.describe_categorical


def test_categorical(df_from_dict):
    df = df_from_dict(
        {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]},
        is_categorical=True,
    )

    colX = df.__dataframe__().get_column_by_name("weekday")
    categorical = colX.describe_categorical
    assert isinstance(categorical["is_ordered"], bool)
    assert isinstance(categorical["is_dictionary"], bool)


def test_dataframe(df_from_dict):
    df = df_from_dict(
        {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
    )
    dfX = df.__dataframe__()

    assert dfX.num_columns() == 3
    assert dfX.num_rows() == 3
    assert dfX.num_chunks() == 1
    assert list(dfX.column_names()) == ["x", "y", "z"]
    assert list(dfX.select_columns((0, 2)).column_names()) == list(
        dfX.select_columns_by_name(("x", "z")).column_names()
    )


@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
def test_df_get_chunks(size, n_chunks, df_from_dict):
    df = df_from_dict({"x": list(range(size))})
    dfX = df.__dataframe__()
    chunks = list(dfX.get_chunks(n_chunks))
    assert len(chunks) == n_chunks
    assert sum(chunk.num_rows() for chunk in chunks) == size


@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
def test_column_get_chunks(size, n_chunks, df_from_dict):
    df = df_from_dict({"x": list(range(size))})
    dfX = df.__dataframe__()
    chunks = list(dfX.get_column(0).get_chunks(n_chunks))
    assert len(chunks) == n_chunks
    assert sum(chunk.size() for chunk in chunks) == size
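

# Illustrative sketch (not a vendored test): get_chunks(n) splits a dataframe
# (or a single column) into n pieces whose row counts sum to the original
# length, as the two parametrized tests above verify.
def _chunking_sketch():
    dfX = pd.DataFrame({"x": list(range(10))}).__dataframe__()
    sizes = [chunk.num_rows() for chunk in dfX.get_chunks(3)]
    assert sum(sizes) == 10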


def test_get_columns(df_from_dict):
    df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
    dfX = df.__dataframe__()
    for colX in dfX.get_columns():
        assert colX.size() == 2
        assert colX.num_chunks() == 1
    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
    # file is expected to be vendored *anywhere*
    assert dfX.get_column(0).dtype[0] == 0  # INT
    assert dfX.get_column(1).dtype[0] == 2  # FLOAT


def test_buffer(df_from_dict):
    arr = [0, 1, -1]
    df = df_from_dict({"a": arr})
    dfX = df.__dataframe__()
    colX = dfX.get_column(0)
    bufX = colX.get_buffers()

    dataBuf, dataDtype = bufX["data"]

    assert dataBuf.bufsize > 0
    assert dataBuf.ptr != 0
    device, _ = dataBuf.__dlpack_device__()

    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
    # file is expected to be vendored *anywhere*
    assert dataDtype[0] == 0  # INT

    if device == 1:  # CPU-only as we're going to directly read memory here
        bitwidth = dataDtype[1]
        ctype = {
            8: ctypes.c_int8,
            16: ctypes.c_int16,
            32: ctypes.c_int32,
            64: ctypes.c_int64,
        }[bitwidth]

        for idx, truth in enumerate(arr):
            val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
            assert val == truth, f"Buffer at index {idx} mismatch"
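

# Illustrative sketch (not a vendored test): the raw pointer exposed through
# the protocol can be read directly with ctypes, exactly as test_buffer does
# above. Assumes pandas' default int64 dtype for Python ints.
def _buffer_read_sketch():
    col = pd.DataFrame({"a": [7]}).__dataframe__().get_column(0)
    buf, _ = col.get_buffers()["data"]
    assert ctypes.c_int64.from_address(buf.ptr).value == 7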
@@ -0,0 +1,89 @@
import numpy as np
import pytest

import pandas as pd
from pandas.core.interchange.utils import dtype_to_arrow_c_fmt

# TODO: use ArrowSchema to get reference C-string.
# At the time of writing, there is no way to access an ArrowSchema holding a
# type format string from Python. The only way to access it is to export the
# structure to a C-pointer, see DataType._export_to_c() method defined in
# https://github.com/apache/arrow/blob/master/python/pyarrow/types.pxi


@pytest.mark.parametrize(
    "pandas_dtype, c_string",
    [
        (np.dtype("bool"), "b"),
        (np.dtype("int8"), "c"),
        (np.dtype("uint8"), "C"),
        (np.dtype("int16"), "s"),
        (np.dtype("uint16"), "S"),
        (np.dtype("int32"), "i"),
        (np.dtype("uint32"), "I"),
        (np.dtype("int64"), "l"),
        (np.dtype("uint64"), "L"),
        (np.dtype("float16"), "e"),
        (np.dtype("float32"), "f"),
        (np.dtype("float64"), "g"),
        (pd.Series(["a"]).dtype, "u"),
        (
            pd.Series([0]).astype("datetime64[ns]").dtype,
            "tsn:",
        ),
        (pd.CategoricalDtype(["a"]), "l"),
        (np.dtype("O"), "u"),
    ],
)
def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string):  # PR01
    """Test ``dtype_to_arrow_c_fmt`` utility function."""
    assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string
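

# Illustrative sketch (not part of the parametrized tests): the returned
# strings are the Arrow C data interface format codes, e.g. "l" for int64
# and "u" for utf-8 strings.
def _format_string_sketch():
    assert dtype_to_arrow_c_fmt(np.dtype("int64")) == "l"
    assert dtype_to_arrow_c_fmt(pd.Series(["a"]).dtype) == "u"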


@pytest.mark.parametrize(
    "pa_dtype, args_kwargs, c_string",
    [
        ["null", {}, "n"],
        ["bool_", {}, "b"],
        ["uint8", {}, "C"],
        ["uint16", {}, "S"],
        ["uint32", {}, "I"],
        ["uint64", {}, "L"],
        ["int8", {}, "c"],
        ["int16", {}, "s"],
        ["int32", {}, "i"],
        ["int64", {}, "l"],
        ["float16", {}, "e"],
        ["float32", {}, "f"],
        ["float64", {}, "g"],
        ["string", {}, "u"],
        ["binary", {}, "z"],
        ["time32", ("s",), "tts"],
        ["time32", ("ms",), "ttm"],
        ["time64", ("us",), "ttu"],
        ["time64", ("ns",), "ttn"],
        ["date32", {}, "tdD"],
        ["date64", {}, "tdm"],
        ["timestamp", {"unit": "s"}, "tss:"],
        ["timestamp", {"unit": "ms"}, "tsm:"],
        ["timestamp", {"unit": "us"}, "tsu:"],
        ["timestamp", {"unit": "ns"}, "tsn:"],
        ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"],
        ["duration", ("s",), "tDs"],
        ["duration", ("ms",), "tDm"],
        ["duration", ("us",), "tDu"],
        ["duration", ("ns",), "tDn"],
        ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"],
    ],
)
def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string):
    # GH 52323
    pa = pytest.importorskip("pyarrow")
    if not args_kwargs:
        pa_type = getattr(pa, pa_dtype)()
    elif isinstance(args_kwargs, tuple):
        pa_type = getattr(pa, pa_dtype)(*args_kwargs)
    else:
        pa_type = getattr(pa, pa_dtype)(**args_kwargs)
    arrow_type = pd.ArrowDtype(pa_type)
    assert dtype_to_arrow_c_fmt(arrow_type) == c_string