Updated script that can be controlled by a Node.js web app
This commit is contained in:
436
lib/python3.13/site-packages/pandas/tests/io/test_orc.py
Normal file
436
lib/python3.13/site-packages/pandas/tests/io/test_orc.py
Normal file
@ -0,0 +1,436 @@
|
||||
""" test orc compat """
|
||||
import datetime
|
||||
from decimal import Decimal
|
||||
from io import BytesIO
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import read_orc
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import StringArray
|
||||
|
||||
pytest.importorskip("pyarrow.orc")
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
# Module-wide pytest mark: ignore DeprecationWarning messages that start
# with "Passing a BlockManager to DataFrame".
pytestmark = pytest.mark.filterwarnings(
    "ignore"
    ":Passing a BlockManager to DataFrame"
    ":DeprecationWarning"
)
|
||||
|
||||
|
||||
@pytest.fixture
def dirpath(datapath):
    # Location of the ORC sample files, resolved through the ``datapath``
    # fixture from the components "io" / "data" / "orc".
    orc_data_dir = datapath("io", "data", "orc")
    return orc_data_dir
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        np.array([1, 20], dtype="uint64"),
        pd.Series(["a", "b", "a"], dtype="category"),
        [pd.Interval(left=0, right=upper) for upper in (2, 5)],
        [pd.Period(day, freq="D") for day in ("2022-01-03", "2022-01-04")],
    ]
)
def orc_writer_dtypes_not_supported(request):
    # One-column DataFrames (column name "unimpl") holding data that the
    # ORC writer is expected to refuse: unsigned integers, categoricals,
    # intervals and periods.
    unsupported_values = request.param
    return pd.DataFrame({"unimpl": unsupported_values})
|
||||
|
||||
|
||||
def test_orc_reader_empty(dirpath):
    # Column name -> dtype expected in the (row-less) result frame.
    schema = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": "object",
    }
    columns = list(schema)

    # Build the expectation column by column on top of an empty RangeIndex.
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for name, dtype in schema.items():
        expected[name] = pd.Series(dtype=dtype)

    orc_path = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    result = read_orc(orc_path, columns=columns)

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_reader_basic(dirpath):
    # (column name, expected values, numpy dtype) for the two-row sample.
    column_specs = [
        ("boolean1", [False, True], "bool"),
        ("byte1", [1, 100], "int8"),
        ("short1", [1024, 2048], "int16"),
        ("int1", [65536, 65536], "int32"),
        ("long1", [9223372036854775807, 9223372036854775807], "int64"),
        ("float1", [1.0, 2.0], "float32"),
        ("double1", [-15.0, -5.0], "float64"),
        ("bytes1", [b"\x00\x01\x02\x03\x04", b""], "object"),
        ("string1", ["hi", "bye"], "object"),
    ]
    data = {
        name: np.array(values, dtype=dtype) for name, values, dtype in column_specs
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.test1.orc")
    # Only the columns listed above are requested from the file.
    result = read_orc(orc_path, columns=data.keys())

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_reader_decimal(dirpath):
    # Only the first 10 rows of the sample file are compared.
    first_ten = [
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    ]
    decimals = [Decimal(text) for text in first_ten]
    data = {"_col0": np.array(decimals, dtype="object")}
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    result = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_reader_date_low(dirpath):
    # Only the first 10 rows of the sample file are compared.
    # The fractional second runs from .100000 to .100900 in steps of 100
    # microseconds; the date column repeats the same day in every row.
    timestamps = [f"1900-05-05 12:34:56.100{step}00" for step in range(10)]
    dates = [datetime.date(1900, 12, 25)] * 10
    data = {
        "time": np.array(timestamps, dtype="datetime64[ns]"),
        "date": np.array(dates, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    result = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_reader_date_high(dirpath):
    # Only the first 10 rows of the sample file are compared.
    # The fractional second runs from .100000 to .100900 in steps of 100
    # microseconds; the date column repeats the same day in every row.
    timestamps = [f"2038-05-05 12:34:56.100{step}00" for step in range(10)]
    dates = [datetime.date(2038, 12, 25)] * 10
    data = {
        "time": np.array(timestamps, dtype="datetime64[ns]"),
        "date": np.array(dates, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    result = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_reader_snappy_compressed(dirpath):
    # Only the first 10 rows of the sample file are compared.
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    string_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    data = {
        "int1": np.array(int_values, dtype="int32"),
        "string1": np.array(string_values, dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    orc_path = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    result = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_roundtrip_file(dirpath):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    # (column name, values, numpy dtype) of the frame written and read back.
    column_specs = [
        ("boolean1", [False, True], "bool"),
        ("byte1", [1, 100], "int8"),
        ("short1", [1024, 2048], "int16"),
        ("int1", [65536, 65536], "int32"),
        ("long1", [9223372036854775807, 9223372036854775807], "int64"),
        ("float1", [1.0, 2.0], "float32"),
        ("double1", [-15.0, -5.0], "float64"),
        ("bytes1", [b"\x00\x01\x02\x03\x04", b""], "object"),
        ("string1", ["hi", "bye"], "object"),
    ]
    data = {
        name: np.array(values, dtype=dtype) for name, values, dtype in column_specs
    }
    expected = pd.DataFrame.from_dict(data)

    # Write to a temporary file, read it back and compare while it exists.
    with tm.ensure_clean() as tmp_file:
        expected.to_orc(tmp_file)
        result = read_orc(tmp_file)

        tm.assert_equal(expected, result)
|
||||
|
||||
|
||||
def test_orc_roundtrip_bytesio():
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    # to_orc() is called without a path and its return value is fed back
    # to read_orc through an in-memory buffer.  The local is deliberately
    # not called ``bytes`` so the builtin type is not shadowed.
    bytes_data = expected.to_orc()
    got = read_orc(BytesIO(bytes_data))

    tm.assert_equal(expected, got)
|
||||
|
||||
|
||||
def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    unsupported_frame = orc_writer_dtypes_not_supported
    expected_error = "The dtype of one or more columns is not supported yet."
    # Writing any of the fixture's frames must fail with NotImplementedError.
    with pytest.raises(NotImplementedError, match=expected_error):
        unsupported_frame.to_orc()
|
||||
|
||||
|
||||
def test_orc_dtype_backend_pyarrow():
    pytest.importorskip("pyarrow")
    source_columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "bytes": [b"foo", b"bar", None],
        "int": list(range(1, 4)),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
        "datetime": pd.date_range("20130101", periods=3),
        "datetime_with_nat": [
            pd.Timestamp("20130101"),
            pd.NaT,
            pd.Timestamp("20130103"),
        ],
    }
    df = pd.DataFrame(source_columns)

    # Serialise a copy to ORC bytes, then read them back with the
    # pyarrow dtype backend.
    orc_bytes = df.copy().to_orc()
    result = read_orc(BytesIO(orc_bytes), dtype_backend="pyarrow")

    # Expected frame: each source column converted to a pyarrow array and
    # wrapped in an ArrowExtensionArray.
    expected_columns = {}
    for col in df.columns:
        arrow_values = pa.array(df[col], from_pandas=True)
        expected_columns[col] = pd.arrays.ArrowExtensionArray(arrow_values)
    expected = pd.DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_orc_dtype_backend_numpy_nullable():
    # GH#50503
    pytest.importorskip("pyarrow")

    def string_array(values):
        # StringArray built from an object-dtype ndarray of ``values``.
        return StringArray(np.array(values, dtype=np.object_))

    source_columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "int": list(range(1, 4)),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
    }
    df = pd.DataFrame(source_columns)

    # Serialise a copy to ORC bytes, then read them back with the
    # numpy_nullable dtype backend.
    orc_bytes = df.copy().to_orc()
    result = read_orc(BytesIO(orc_bytes), dtype_backend="numpy_nullable")

    # Expected frame: missing values are pd.NA and every column uses a
    # nullable extension dtype.
    expected_columns = {
        "string": string_array(["a", "b", "c"]),
        "string_with_nan": string_array(["a", pd.NA, "c"]),
        "string_with_none": string_array(["a", pd.NA, "c"]),
        "int": pd.Series([1, 2, 3], dtype="Int64"),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
        "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
        "bool": pd.Series([True, False, True], dtype="boolean"),
        "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
    }
    expected = pd.DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_orc_uri_path():
    # Write a small frame to a temporary file and read it back through
    # the URI form of that file's path.
    expected = pd.DataFrame({"int": list(range(1, 4))})
    with tm.ensure_clean("tmp.orc") as tmp_file:
        expected.to_orc(tmp_file)
        file_uri = pathlib.Path(tmp_file).as_uri()
        result = read_orc(file_uri)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(start=2, stop=5, step=1),
        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
        pd.Index([1, 2, 3]),
    ],
)
def test_to_orc_non_default_index(index):
    frame = pd.DataFrame({"a": [1, 2, 3]}, index=index)
    # Either message satisfies the check: they are joined into a single
    # regex alternation.
    accepted_messages = [
        "orc does not support serializing a non-default index",
        "orc does not serialize index meta-data",
    ]
    with pytest.raises(ValueError, match="|".join(accepted_messages)):
        frame.to_orc()
|
||||
|
||||
|
||||
def test_invalid_dtype_backend():
    # read_orc must raise ValueError for the dtype_backend value "numpy".
    expected_error = (
        "dtype_backend numpy is invalid, "
        "only 'numpy_nullable' and 'pyarrow' are allowed."
    )
    frame = pd.DataFrame({"int": list(range(1, 4))})
    with tm.ensure_clean("tmp.orc") as tmp_file:
        frame.to_orc(tmp_file)
        with pytest.raises(ValueError, match=expected_error):
            read_orc(tmp_file, dtype_backend="numpy")
|
||||
|
||||
|
||||
def test_string_inference(tmp_path):
    # GH#54431
    orc_file = tmp_path / "test_string_inference.p"
    source = pd.DataFrame(data={"a": ["x", "y"]})
    source.to_orc(orc_file)

    # Only the read happens with the "future.infer_string" option enabled.
    with pd.option_context("future.infer_string", True):
        result = read_orc(orc_file)

    # Both the values and the column labels are expected to come back as
    # "string[pyarrow_numpy]".
    string_dtype = "string[pyarrow_numpy]"
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        dtype=string_dtype,
        columns=pd.Index(["a"], dtype=string_dtype),
    )
    tm.assert_frame_equal(result, expected)
|
Reference in New Issue
Block a user