Updated script that can be controled by Nodejs web app

2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions
--- a/lib/python3.13/site-packages/pandas/tests/io/test_orc.py
+++ b/lib/python3.13/site-packages/pandas/tests/io/test_orc.py
@ -0,0 +1,436 @@
+""" test orc compat """
+import datetime
+from decimal import Decimal
+from io import BytesIO
+import os
+import pathlib
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import read_orc
+import pandas._testing as tm
+from pandas.core.arrays import StringArray
+
+pytest.importorskip("pyarrow.orc")
+
+import pyarrow as pa
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+@pytest.fixture
+def dirpath(datapath):
+    return datapath("io", "data", "orc")
+
+
+@pytest.fixture(
+    params=[
+        np.array([1, 20], dtype="uint64"),
+        pd.Series(["a", "b", "a"], dtype="category"),
+        [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
+        [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
+    ]
+)
+def orc_writer_dtypes_not_supported(request):
+    # Examples of dataframes with dtypes for which conversion to ORC
+    # hasn't been implemented yet, that is, Category, unsigned integers,
+    # interval, period and sparse.
+    return pd.DataFrame({"unimpl": request.param})
+
+
+def test_orc_reader_empty(dirpath):
+    columns = [
+        "boolean1",
+        "byte1",
+        "short1",
+        "int1",
+        "long1",
+        "float1",
+        "double1",
+        "bytes1",
+        "string1",
+    ]
+    dtypes = [
+        "bool",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "object",
+        "object",
+    ]
+    expected = pd.DataFrame(index=pd.RangeIndex(0))
+    for colname, dtype in zip(columns, dtypes):
+        expected[colname] = pd.Series(dtype=dtype)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
+    got = read_orc(inputfile, columns=columns)
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_basic(dirpath):
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
+    got = read_orc(inputfile, columns=data.keys())
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_decimal(dirpath):
+    # Only testing the first 10 rows of data
+    data = {
+        "_col0": np.array(
+            [
+                Decimal("-1000.50000"),
+                Decimal("-999.60000"),
+                Decimal("-998.70000"),
+                Decimal("-997.80000"),
+                Decimal("-996.90000"),
+                Decimal("-995.10000"),
+                Decimal("-994.11000"),
+                Decimal("-993.12000"),
+                Decimal("-992.13000"),
+                Decimal("-991.14000"),
+            ],
+            dtype="object",
+        )
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_low(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "1900-05-05 12:34:56.100000",
+                "1900-05-05 12:34:56.100100",
+                "1900-05-05 12:34:56.100200",
+                "1900-05-05 12:34:56.100300",
+                "1900-05-05 12:34:56.100400",
+                "1900-05-05 12:34:56.100500",
+                "1900-05-05 12:34:56.100600",
+                "1900-05-05 12:34:56.100700",
+                "1900-05-05 12:34:56.100800",
+                "1900-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_high(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "2038-05-05 12:34:56.100000",
+                "2038-05-05 12:34:56.100100",
+                "2038-05-05 12:34:56.100200",
+                "2038-05-05 12:34:56.100300",
+                "2038-05-05 12:34:56.100400",
+                "2038-05-05 12:34:56.100500",
+                "2038-05-05 12:34:56.100600",
+                "2038-05-05 12:34:56.100700",
+                "2038-05-05 12:34:56.100800",
+                "2038-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_snappy_compressed(dirpath):
+    data = {
+        "int1": np.array(
+            [
+                -1160101563,
+                1181413113,
+                2065821249,
+                -267157795,
+                172111193,
+                1752363137,
+                1406072123,
+                1911809390,
+                -1308542224,
+                -467100286,
+            ],
+            dtype="int32",
+        ),
+        "string1": np.array(
+            [
+                "f50dcb8",
+                "382fdaaa",
+                "90758c6",
+                "9e8caf3f",
+                "ee97332b",
+                "d634da1",
+                "2bea4396",
+                "d67d89e8",
+                "ad71007e",
+                "e8c82066",
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_file(dirpath):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    with tm.ensure_clean() as path:
+        expected.to_orc(path)
+        got = read_orc(path)
+
+        tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_bytesio():
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    bytes = expected.to_orc()
+    got = read_orc(BytesIO(bytes))
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    msg = "The dtype of one or more columns is not supported yet."
+    with pytest.raises(NotImplementedError, match=msg):
+        orc_writer_dtypes_not_supported.to_orc()
+
+
+def test_orc_dtype_backend_pyarrow():
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "bytes": [b"foo", b"bar", None],
+            "int": list(range(1, 4)),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+            "datetime": pd.date_range("20130101", periods=3),
+            "datetime_with_nat": [
+                pd.Timestamp("20130101"),
+                pd.NaT,
+                pd.Timestamp("20130103"),
+            ],
+        }
+    )
+
+    bytes_data = df.copy().to_orc()
+    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")
+
+    expected = pd.DataFrame(
+        {
+            col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
+            for col in df.columns
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_orc_dtype_backend_numpy_nullable():
+    # GH#50503
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "int": list(range(1, 4)),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+        }
+    )
+
+    bytes_data = df.copy().to_orc()
+    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")
+
+    expected = pd.DataFrame(
+        {
+            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
+            "string_with_nan": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "string_with_none": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "int": pd.Series([1, 2, 3], dtype="Int64"),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
+            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
+            "bool": pd.Series([True, False, True], dtype="boolean"),
+            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_orc_uri_path():
+    expected = pd.DataFrame({"int": list(range(1, 4))})
+    with tm.ensure_clean("tmp.orc") as path:
+        expected.to_orc(path)
+        uri = pathlib.Path(path).as_uri()
+        result = read_orc(uri)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.RangeIndex(start=2, stop=5, step=1),
+        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
+        pd.Index([1, 2, 3]),
+    ],
+)
+def test_to_orc_non_default_index(index):
+    df = pd.DataFrame({"a": [1, 2, 3]}, index=index)
+    msg = (
+        "orc does not support serializing a non-default index|"
+        "orc does not serialize index meta-data"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_orc()
+
+
+def test_invalid_dtype_backend():
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    df = pd.DataFrame({"int": list(range(1, 4))})
+    with tm.ensure_clean("tmp.orc") as path:
+        df.to_orc(path)
+        with pytest.raises(ValueError, match=msg):
+            read_orc(path, dtype_backend="numpy")
+
+
+def test_string_inference(tmp_path):
+    # GH#54431
+    path = tmp_path / "test_string_inference.p"
+    df = pd.DataFrame(data={"a": ["x", "y"]})
+    df.to_orc(path)
+    with pd.option_context("future.infer_string", True):
+        result = read_orc(path)
+    expected = pd.DataFrame(
+        data={"a": ["x", "y"]},
+        dtype="string[pyarrow_numpy]",
+        columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+    )
+    tm.assert_frame_equal(result, expected)