Updated script so it can be controlled by the Node.js web app
lib/python3.13/site-packages/pandas/tests/io/test_gcs.py (new file, 228 additions)
@@ -0,0 +1,228 @@
from io import BytesIO
import os
import pathlib
import tarfile
import zipfile

import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under17p0

from pandas import (
    DataFrame,
    Index,
    date_range,
    read_csv,
    read_excel,
    read_json,
    read_parquet,
)
import pandas._testing as tm
from pandas.util import _test_decorators as td

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

@pytest.fixture
def gcs_buffer():
    """Emulate GCS using a binary buffer."""
    pytest.importorskip("gcsfs")
    fsspec = pytest.importorskip("fsspec")

    gcs_buffer = BytesIO()
    gcs_buffer.close = lambda: True

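    # Making close() a no-op keeps the shared buffer readable after each write.
    # The mock below stands in for gcsfs: every open() of a "gs://" path
    # returns this buffer (rewound to the start), and ls() reports a single
    # file entry, which pyarrow needs for dataset discovery.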
    class MockGCSFileSystem(fsspec.AbstractFileSystem):
        @staticmethod
        def open(*args, **kwargs):
            gcs_buffer.seek(0)
            return gcs_buffer

        def ls(self, path, **kwargs):
            # needed for pyarrow
            return [{"name": path, "type": "file"}]

    # Overwrite the default gcsfs implementation with our mock class
    fsspec.register_implementation("gs", MockGCSFileSystem, clobber=True)

    return gcs_buffer

# Patches pyarrow; other processes should not pick up change
@pytest.mark.single_cpu
@pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"])
def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request):
    """
    Test that many to/read functions support GCS.

    GH 33987
    """

    df1 = DataFrame(
        {
            "int": [1, 3],
            "float": [2.0, np.nan],
            "str": ["t", "s"],
            "dt": date_range("2018-06-18", periods=2),
        }
    )

    path = f"gs://test/test.{format}"

    if format == "csv":
        df1.to_csv(path, index=True)
        df2 = read_csv(path, parse_dates=["dt"], index_col=0)
    elif format == "excel":
        path = "gs://test/test.xlsx"
        df1.to_excel(path)
        df2 = read_excel(path, parse_dates=["dt"], index_col=0)
    elif format == "json":
        df1.to_json(path)
        df2 = read_json(path, convert_dates=["dt"])
    elif format == "parquet":
        pytest.importorskip("pyarrow")
        pa_fs = pytest.importorskip("pyarrow.fs")

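        # Patch pyarrow's FileSystem.from_uri so "gs://" URIs resolve to the
        # local filesystem; the print() output lets the assertion below confirm
        # the mock was hit once for the write and once for the read.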
        class MockFileSystem(pa_fs.FileSystem):
            @staticmethod
            def from_uri(path):
                print("Using pyarrow filesystem")
                to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri()
                return pa_fs.LocalFileSystem(to_local)

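        # Expected to fail with TypeError on pyarrow >= 17, where the mocked
        # filesystem no longer works.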
        request.applymarker(
            pytest.mark.xfail(
                not pa_version_under17p0,
                raises=TypeError,
                reason="pyarrow 17 broke the mocked filesystem",
            )
        )
        with monkeypatch.context() as m:
            m.setattr(pa_fs, "FileSystem", MockFileSystem)
            df1.to_parquet(path)
            df2 = read_parquet(path)
        captured = capsys.readouterr()
        assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n"
    elif format == "markdown":
        pytest.importorskip("tabulate")
        df1.to_markdown(path)
        df2 = df1

    tm.assert_frame_equal(df1, df2)


def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
    """
    For zip compression, only compare the CRC-32 checksum of the file contents
    to avoid checking the time-dependent last-modified timestamp which
    in some CI builds is off-by-one.

    See https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers
    """
    if compression == "zip":
        # Only compare the CRC checksum of the file contents
        with zipfile.ZipFile(BytesIO(result)) as res, zipfile.ZipFile(
            BytesIO(expected)
        ) as exp:
            for res_info, exp_info in zip(res.infolist(), exp.infolist()):
                assert res_info.CRC == exp_info.CRC
    elif compression == "tar":
        with tarfile.open(fileobj=BytesIO(result)) as tar_res, tarfile.open(
            fileobj=BytesIO(expected)
        ) as tar_exp:
            for tar_res_info, tar_exp_info in zip(
                tar_res.getmembers(), tar_exp.getmembers()
            ):
                actual_file = tar_res.extractfile(tar_res_info)
                expected_file = tar_exp.extractfile(tar_exp_info)
                assert (actual_file is None) == (expected_file is None)
                if actual_file is not None and expected_file is not None:
                    assert actual_file.read() == expected_file.read()
    else:
        assert result == expected


@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
def test_to_csv_compression_encoding_gcs(
    gcs_buffer, compression_only, encoding, compression_to_extension
):
    """
    Compression and encoding should work with GCS.

    GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and
    GH 32392 (read_csv, encoding)
    """
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
    )

    # reference of compressed and encoded file
    compression = {"method": compression_only}
    if compression_only == "gzip":
        compression["mtime"] = 1  # be reproducible
    buffer = BytesIO()
    df.to_csv(buffer, compression=compression, encoding=encoding, mode="wb")

    # write compressed file with explicit compression
    path_gcs = "gs://test/test.csv"
    df.to_csv(path_gcs, compression=compression, encoding=encoding)
    res = gcs_buffer.getvalue()
    expected = buffer.getvalue()
    assert_equal_zip_safe(res, expected, compression_only)

    read_df = read_csv(
        path_gcs, index_col=0, compression=compression_only, encoding=encoding
    )
    tm.assert_frame_equal(df, read_df)

    # write compressed file with implicit compression
    file_ext = compression_to_extension[compression_only]
    compression["method"] = "infer"
    path_gcs += f".{file_ext}"
    df.to_csv(path_gcs, compression=compression, encoding=encoding)

    res = gcs_buffer.getvalue()
    expected = buffer.getvalue()
    assert_equal_zip_safe(res, expected, compression_only)

    read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding)
    tm.assert_frame_equal(df, read_df)


def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
    """Regression test for writing to a not-yet-existent GCS Parquet file."""
    pytest.importorskip("fastparquet")
    pytest.importorskip("gcsfs")

    from fsspec import AbstractFileSystem

    df1 = DataFrame(
        {
            "int": [1, 3],
            "float": [2.0, np.nan],
            "str": ["t", "s"],
            "dt": date_range("2018-06-18", periods=2),
        }
    )

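    # Simulate a bucket where the target object does not exist yet: reads
    # raise FileNotFoundError, while writes are redirected to a local tmpdir file.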
    class MockGCSFileSystem(AbstractFileSystem):
        def open(self, path, mode="r", *args):
            if "w" not in mode:
                raise FileNotFoundError
            return open(os.path.join(tmpdir, "test.parquet"), mode, encoding="utf-8")

    monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
    df1.to_parquet(
        "gs://test/test.csv", index=True, engine="fastparquet", compression=None
    )


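# With gcsfs not installed, reading a gs:// path should raise ImportError.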
@td.skip_if_installed("gcsfs")
def test_gcs_not_present_exception():
    with tm.external_error_raised(ImportError):
        read_csv("gs://test/test.csv")