Updated script so it can be controlled by a Node.js web app

mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@@ -0,0 +1,9 @@
import pytest
@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
"""
Fixture for orients excluding the table format.
"""
return request.param
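A minimal sketch of the round trip this orient fixture parametrizes (illustrative only, not part of the committed file; assumes only that pandas is installed):

from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
payload = df.to_json(orient="split")  # any fixture value works: split/records/index/columns/values
roundtripped = pd.read_json(StringIO(payload), orient="split")
assert roundtripped.equals(df)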

View File

@@ -0,0 +1,130 @@
from io import (
BytesIO,
StringIO,
)
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def test_compression_roundtrip(compression):
df = pd.DataFrame(
[[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
index=["A", "B"],
columns=["X", "Y", "Z"],
)
with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
tm.assert_frame_equal(df, pd.read_json(path, compression=compression))
# explicitly ensure file was compressed.
with tm.decompress_file(path, compression) as fh:
result = fh.read().decode("utf8")
data = StringIO(result)
tm.assert_frame_equal(df, pd.read_json(data))
def test_read_zipped_json(datapath):
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
compressed_df = pd.read_json(compressed_path, compression="zip")
tm.assert_frame_equal(uncompressed_df, compressed_df)
@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
# Bucket created in tests/io/conftest.py
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
with open(path, "rb") as f:
s3_public_bucket.put_object(Key="test-1", Body=f)
roundtripped_df = pd.read_json(
f"s3://{s3_public_bucket.name}/test-1",
compression=compression,
storage_options=s3so,
)
tm.assert_frame_equal(df, roundtripped_df)
def test_lines_with_compression(compression):
with tm.ensure_clean() as path:
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
df.to_json(path, orient="records", lines=True, compression=compression)
roundtripped_df = pd.read_json(path, lines=True, compression=compression)
tm.assert_frame_equal(df, roundtripped_df)
def test_chunksize_with_compression(compression):
with tm.ensure_clean() as path:
df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
df.to_json(path, orient="records", lines=True, compression=compression)
with pd.read_json(
path, lines=True, chunksize=1, compression=compression
) as res:
roundtripped_df = pd.concat(res)
tm.assert_frame_equal(df, roundtripped_df)
def test_write_unsupported_compression_type():
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
with pytest.raises(ValueError, match=msg):
df.to_json(path, compression="unsupported")
def test_read_unsupported_compression_type():
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
with pytest.raises(ValueError, match=msg):
pd.read_json(path, compression="unsupported")
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
compression_only, read_infer, to_infer, compression_to_extension, infer_string
):
with pd.option_context("future.infer_string", infer_string):
# see gh-15008
compression = compression_only
# We'll complete file extension subsequently.
filename = "test."
filename += compression_to_extension[compression]
df = pd.DataFrame({"A": [1]})
to_compression = "infer" if to_infer else compression
read_compression = "infer" if read_infer else compression
with tm.ensure_clean(filename) as path:
df.to_json(path, compression=to_compression)
result = pd.read_json(path, compression=read_compression)
tm.assert_frame_equal(result, df)
def test_to_json_compression_mode(compression):
# GH 39985 (read_json does not support user-provided binary files)
expected = pd.DataFrame({"A": [1]})
with BytesIO() as buffer:
expected.to_json(buffer, compression=compression)
# df = pd.read_json(buffer, compression=compression)
# tm.assert_frame_equal(expected, df)
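A standalone sketch of the extension-based compression inference these tests rely on (illustrative; the temporary directory and frame are assumptions, not from the commit):

import os
import tempfile

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "frame.json.gz")  # ".gz" suffix => gzip is inferred
    df.to_json(path)  # compression defaults to "infer"
    back = pd.read_json(path)  # inferred again on read
assert back.equals(df)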

View File

@@ -0,0 +1,21 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""
from io import StringIO
import pandas as pd
import pandas._testing as tm
from pandas.io.json import read_json
def test_good_kwargs():
df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
with tm.assert_produces_warning(None):
data1 = StringIO(df.to_json(orient="split"))
tm.assert_frame_equal(df, read_json(data1, orient="split"))
data2 = StringIO(df.to_json(orient="columns"))
tm.assert_frame_equal(df, read_json(data2, orient="columns"))
data3 = StringIO(df.to_json(orient="index"))
tm.assert_frame_equal(df, read_json(data3, orient="index"))
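The StringIO wrapping above is deliberate: recent pandas deprecates passing a literal JSON string straight to read_json. A minimal sketch of the warning-free pattern (illustrative, assuming pandas 2.1+):

from io import StringIO

import pandas as pd

payload = '{"A":{"0":2,"1":4},"B":{"0":3,"1":6}}'
df = pd.read_json(StringIO(payload), orient="columns")  # no FutureWarning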

View File

@@ -0,0 +1,873 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
from io import StringIO
import json
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
PeriodDtype,
)
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
convert_json_field_to_pandas_type,
convert_pandas_type_to_json_field,
set_default_names,
)
@pytest.fixture
def df_schema():
return DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
"D": pd.timedelta_range("1h", periods=4, freq="min"),
},
index=pd.Index(range(4), name="idx"),
)
@pytest.fixture
def df_table():
return DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
"D": pd.timedelta_range("1h", periods=4, freq="min"),
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
"G": [1.0, 2.0, 3, 4.0],
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
},
index=pd.Index(range(4), name="idx"),
)
class TestBuildSchema:
def test_build_table_schema(self, df_schema, using_infer_string):
result = build_table_schema(df_schema, version=False)
expected = {
"fields": [
{"name": "idx", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
],
"primaryKey": ["idx"],
}
if using_infer_string:
expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected
result = build_table_schema(df_schema)
assert "pandas_version" in result
def test_series(self):
s = pd.Series([1, 2, 3], name="foo")
result = build_table_schema(s, version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "foo", "type": "integer"},
],
"primaryKey": ["index"],
}
assert result == expected
result = build_table_schema(s)
assert "pandas_version" in result
def test_series_unnamed(self):
result = build_table_schema(pd.Series([1, 2, 3]), version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
}
assert result == expected
def test_multiindex(self, df_schema, using_infer_string):
df = df_schema
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
df.index = idx
result = build_table_schema(df, version=False)
expected = {
"fields": [
{"name": "level_0", "type": "string"},
{"name": "level_1", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
],
"primaryKey": ["level_0", "level_1"],
}
if using_infer_string:
expected["fields"][0] = {
"name": "level_0",
"type": "any",
"extDtype": "string",
}
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected
df.index.names = ["idx0", None]
expected["fields"][0]["name"] = "idx0"
expected["primaryKey"] = ["idx0", "level_1"]
result = build_table_schema(df, version=False)
assert result == expected
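For reference, build_table_schema can be exercised directly; a sketch with abbreviated output (illustrative, and as the tests above note, the "B" mapping changes under future.infer_string):

import pandas as pd
from pandas.io.json._table_schema import build_table_schema

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
build_table_schema(df, version=False)
# {'fields': [{'name': 'index', 'type': 'integer'},
#             {'name': 'A', 'type': 'integer'},
#             {'name': 'B', 'type': 'string'}],
#  'primaryKey': ['index']}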
class TestTableSchemaType:
@pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
def test_as_json_table_type_int_data(self, int_type):
int_data = [1, 2, 3]
assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"
@pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
def test_as_json_table_type_float_data(self, float_type):
float_data = [1.0, 2.0, 3.0]
assert (
as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
)
@pytest.mark.parametrize("bool_type", [bool, np.bool_])
def test_as_json_table_type_bool_data(self, bool_type):
bool_data = [True, False]
assert (
as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
)
@pytest.mark.parametrize(
"date_data",
[
pd.to_datetime(["2016"]),
pd.to_datetime(["2016"], utc=True),
pd.Series(pd.to_datetime(["2016"])),
pd.Series(pd.to_datetime(["2016"], utc=True)),
pd.period_range("2016", freq="Y", periods=3),
],
)
def test_as_json_table_type_date_data(self, date_data):
assert as_json_table_type(date_data.dtype) == "datetime"
@pytest.mark.parametrize(
"str_data",
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
)
def test_as_json_table_type_string_data(self, str_data):
assert as_json_table_type(str_data.dtype) == "string"
@pytest.mark.parametrize(
"cat_data",
[
pd.Categorical(["a"]),
pd.Categorical([1]),
pd.Series(pd.Categorical([1])),
pd.CategoricalIndex([1]),
pd.Categorical([1]),
],
)
def test_as_json_table_type_categorical_data(self, cat_data):
assert as_json_table_type(cat_data.dtype) == "any"
# ------
# dtypes
# ------
@pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
def test_as_json_table_type_int_dtypes(self, int_dtype):
assert as_json_table_type(int_dtype) == "integer"
@pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
def test_as_json_table_type_float_dtypes(self, float_dtype):
assert as_json_table_type(float_dtype) == "number"
@pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
def test_as_json_table_type_bool_dtypes(self, bool_dtype):
assert as_json_table_type(bool_dtype) == "boolean"
@pytest.mark.parametrize(
"date_dtype",
[
np.dtype("<M8[ns]"),
PeriodDtype("D"),
DatetimeTZDtype("ns", "US/Central"),
],
)
def test_as_json_table_type_date_dtypes(self, date_dtype):
# TODO: datetime.date? datetime.time?
assert as_json_table_type(date_dtype) == "datetime"
@pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
assert as_json_table_type(td_dtype) == "duration"
@pytest.mark.parametrize("str_dtype", [object]) # TODO(GH#14904) flesh out dtypes?
def test_as_json_table_type_string_dtypes(self, str_dtype):
assert as_json_table_type(str_dtype) == "string"
def test_as_json_table_type_categorical_dtypes(self):
assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
assert as_json_table_type(CategoricalDtype()) == "any"
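The dtype-to-Table-Schema mapping exercised above, condensed into one runnable sketch (illustrative):

import numpy as np
from pandas.io.json._table_schema import as_json_table_type

assert as_json_table_type(np.dtype("int64")) == "integer"
assert as_json_table_type(np.dtype("float64")) == "number"
assert as_json_table_type(np.dtype("bool")) == "boolean"
assert as_json_table_type(np.dtype("M8[ns]")) == "datetime"
assert as_json_table_type(np.dtype("m8[ns]")) == "duration"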
class TestTableOrient:
def test_build_series(self):
s = pd.Series([1, 2], name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
(
"data",
[
OrderedDict([("id", 0), ("a", 1)]),
OrderedDict([("id", 1), ("a", 2)]),
],
),
]
)
assert result == expected
def test_read_json_from_to_json_results(self):
# GH32383
df = DataFrame(
{
"_id": {"row_0": 0},
"category": {"row_0": "Goods"},
"recommender_id": {"row_0": 3},
"recommender_name_jp": {"row_0": "浦田"},
"recommender_name_en": {"row_0": "Urata"},
"name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
"name_en": {"row_0": "Hakata Dolls Matsuo"},
}
)
result1 = pd.read_json(StringIO(df.to_json()))
result2 = DataFrame.from_dict(json.loads(df.to_json()))
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, df)
def test_to_json(self, df_table, using_infer_string):
df = df_table
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "idx", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
{
"constraints": {"enum": ["a", "b", "c"]},
"name": "E",
"ordered": False,
"type": "any",
},
{
"constraints": {"enum": ["a", "b", "c"]},
"name": "F",
"ordered": True,
"type": "any",
},
{"name": "G", "type": "number"},
{"name": "H", "type": "datetime", "tz": "US/Central"},
]
if using_infer_string:
fields[2] = {"name": "B", "type": "any", "extDtype": "string"}
schema = {"fields": fields, "primaryKey": ["idx"]}
data = [
OrderedDict(
[
("idx", 0),
("A", 1),
("B", "a"),
("C", "2016-01-01T00:00:00.000"),
("D", "P0DT1H0M0S"),
("E", "a"),
("F", "a"),
("G", 1.0),
("H", "2016-01-01T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 1),
("A", 2),
("B", "b"),
("C", "2016-01-02T00:00:00.000"),
("D", "P0DT1H1M0S"),
("E", "b"),
("F", "b"),
("G", 2.0),
("H", "2016-01-02T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 2),
("A", 3),
("B", "c"),
("C", "2016-01-03T00:00:00.000"),
("D", "P0DT1H2M0S"),
("E", "c"),
("F", "c"),
("G", 3.0),
("H", "2016-01-03T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 3),
("A", 4),
("B", "c"),
("C", "2016-01-04T00:00:00.000"),
("D", "P0DT1H3M0S"),
("E", "c"),
("F", "c"),
("G", 4.0),
("H", "2016-01-04T06:00:00.000Z"),
]
),
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_to_json_float_index(self):
data = pd.Series(1, index=[1.0, 2.0])
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
expected = OrderedDict(
[
(
"schema",
{
"fields": [
{"name": "index", "type": "number"},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
},
),
(
"data",
[
OrderedDict([("index", 1.0), ("values", 1)]),
OrderedDict([("index", 2.0), ("values", 1)]),
],
),
]
)
assert result == expected
def test_to_json_period_index(self):
idx = pd.period_range("2016", freq="Q-JAN", periods=2)
data = pd.Series(1, idx)
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
fields = [
{"freq": "QE-JAN", "name": "index", "type": "datetime"},
{"name": "values", "type": "integer"},
]
schema = {"fields": fields, "primaryKey": ["index"]}
data = [
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_to_json_categorical_index(self):
data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
expected = OrderedDict(
[
(
"schema",
{
"fields": [
{
"name": "index",
"type": "any",
"constraints": {"enum": ["a", "b"]},
"ordered": False,
},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
},
),
(
"data",
[
OrderedDict([("index", "a"), ("values", 1)]),
OrderedDict([("index", "b"), ("values", 1)]),
],
),
]
)
assert result == expected
def test_date_format_raises(self, df_table):
msg = (
"Trying to write with `orient='table'` and `date_format='epoch'`. Table "
"Schema requires dates to be formatted with `date_format='iso'`"
)
with pytest.raises(ValueError, match=msg):
df_table.to_json(orient="table", date_format="epoch")
# others work
df_table.to_json(orient="table", date_format="iso")
df_table.to_json(orient="table")
def test_convert_pandas_type_to_json_field_int(self, index_or_series):
kind = index_or_series
data = [1, 2, 3]
result = convert_pandas_type_to_json_field(kind(data, name="name"))
expected = {"name": "name", "type": "integer"}
assert result == expected
def test_convert_pandas_type_to_json_field_float(self, index_or_series):
kind = index_or_series
data = [1.0, 2.0, 3.0]
result = convert_pandas_type_to_json_field(kind(data, name="name"))
expected = {"name": "name", "type": "number"}
assert result == expected
@pytest.mark.parametrize(
"dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
)
@pytest.mark.parametrize("wrapper", [None, pd.Series])
def test_convert_pandas_type_to_json_field_datetime(
self, dt_args, extra_exp, wrapper
):
data = [1.0, 2.0, 3.0]
data = pd.to_datetime(data, **dt_args)
if wrapper is pd.Series:
data = pd.Series(data, name="values")
result = convert_pandas_type_to_json_field(data)
expected = {"name": "values", "type": "datetime"}
expected.update(extra_exp)
assert result == expected
def test_convert_pandas_type_to_json_period_range(self):
arr = pd.period_range("2016", freq="Y-DEC", periods=4)
result = convert_pandas_type_to_json_field(arr)
expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"}
assert result == expected
@pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
@pytest.mark.parametrize("ordered", [True, False])
def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
data = ["a", "b", "c"]
if kind is pd.Categorical:
arr = pd.Series(kind(data, ordered=ordered), name="cats")
elif kind is pd.CategoricalIndex:
arr = kind(data, ordered=ordered, name="cats")
result = convert_pandas_type_to_json_field(arr)
expected = {
"name": "cats",
"type": "any",
"constraints": {"enum": data},
"ordered": ordered,
}
assert result == expected
@pytest.mark.parametrize(
"inp,exp",
[
({"type": "integer"}, "int64"),
({"type": "number"}, "float64"),
({"type": "boolean"}, "bool"),
({"type": "duration"}, "timedelta64"),
({"type": "datetime"}, "datetime64[ns]"),
({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
({"type": "any"}, "object"),
(
{
"type": "any",
"constraints": {"enum": ["a", "b", "c"]},
"ordered": False,
},
CategoricalDtype(categories=["a", "b", "c"], ordered=False),
),
(
{
"type": "any",
"constraints": {"enum": ["a", "b", "c"]},
"ordered": True,
},
CategoricalDtype(categories=["a", "b", "c"], ordered=True),
),
({"type": "string"}, "object"),
],
)
def test_convert_json_field_to_pandas_type(self, inp, exp):
field = {"name": "foo"}
field.update(inp)
assert convert_json_field_to_pandas_type(field) == exp
@pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
def test_convert_json_field_to_pandas_type_raises(self, inp):
field = {"type": inp}
with pytest.raises(
ValueError, match=f"Unsupported or invalid field type: {inp}"
):
convert_json_field_to_pandas_type(field)
def test_categorical(self):
s = pd.Series(pd.Categorical(["a", "b", "a"]))
s.index.name = "idx"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
fields = [
{"name": "idx", "type": "integer"},
{
"constraints": {"enum": ["a", "b"]},
"name": "values",
"ordered": False,
"type": "any",
},
]
expected = OrderedDict(
[
("schema", {"fields": fields, "primaryKey": ["idx"]}),
(
"data",
[
OrderedDict([("idx", 0), ("values", "a")]),
OrderedDict([("idx", 1), ("values", "b")]),
OrderedDict([("idx", 2), ("values", "a")]),
],
),
]
)
assert result == expected
@pytest.mark.parametrize(
"idx,nm,prop",
[
(pd.Index([1]), "index", "name"),
(pd.Index([1], name="myname"), "myname", "name"),
(
pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
["level_0", "level_1"],
"names",
),
(
pd.MultiIndex.from_product(
[("a", "b"), ("c", "d")], names=["n1", "n2"]
),
["n1", "n2"],
"names",
),
(
pd.MultiIndex.from_product(
[("a", "b"), ("c", "d")], names=["n1", None]
),
["n1", "level_1"],
"names",
),
],
)
def test_set_names_unset(self, idx, nm, prop):
data = pd.Series(1, idx)
result = set_default_names(data)
assert getattr(result.index, prop) == nm
@pytest.mark.parametrize(
"idx",
[
pd.Index([], name="index"),
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
],
)
def test_warns_non_roundtrippable_names(self, idx):
# GH 19130
df = DataFrame(index=idx)
df.index.name = "index"
with tm.assert_produces_warning():
set_default_names(df)
def test_timestamp_in_columns(self):
df = DataFrame(
[[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
)
result = df.to_json(orient="table")
js = json.loads(result)
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
@pytest.mark.parametrize(
"case",
[
pd.Series([1], index=pd.Index([1], name="a"), name="a"),
DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
DataFrame(
{"A": [1]},
index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
),
],
)
def test_overlapping_names(self, case):
with pytest.raises(ValueError, match="Overlapping"):
case.to_json(orient="table")
def test_mi_falsey_name(self):
# GH 16203
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 4)),
index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
)
result = [x["name"] for x in build_table_schema(df)["fields"]]
assert result == ["level_0", "level_1", 0, 1, 2, 3]
class TestTableOrientReader:
@pytest.mark.parametrize(
"index_nm",
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
)
@pytest.mark.parametrize(
"vals",
[
{"ints": [1, 2, 3, 4]},
{"objects": ["a", "b", "c", "d"]},
{"objects": ["1", "2", "3", "4"]},
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
{
"ordered_cats": pd.Series(
pd.Categorical(["a", "b", "c", "c"], ordered=True)
)
},
{"floats": [1.0, 2.0, 3.0, 4.0]},
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"bools": [True, False, False, True]},
{
"timezones": pd.date_range(
"2016-01-01", freq="d", periods=4, tz="US/Central"
) # added in GH 35973
},
],
)
def test_read_json_table_orient(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize("index_nm", [None, "idx", "index"])
@pytest.mark.parametrize(
"vals",
[{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}],
)
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
with pytest.raises(NotImplementedError, match="can not yet read "):
pd.read_json(out, orient="table")
@pytest.mark.parametrize(
"index_nm",
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
)
@pytest.mark.parametrize(
"vals",
[
{"ints": [1, 2, 3, 4]},
{"objects": ["a", "b", "c", "d"]},
{"objects": ["1", "2", "3", "4"]},
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
{
"ordered_cats": pd.Series(
pd.Categorical(["a", "b", "c", "c"], ordered=True)
)
},
{"floats": [1.0, 2.0, 3.0, 4.0]},
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"bools": [True, False, False, True]},
{
"timezones": pd.date_range(
"2016-01-01", freq="d", periods=4, tz="US/Central"
) # added in GH 35973
},
],
)
def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
df = DataFrame(
vals,
index=pd.Index(
(pd.Period(f"2022Q{q}") for q in range(1, 5)), name=index_nm
),
)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize(
"idx",
[
pd.Index(range(4)),
pd.date_range(
"2020-08-30",
freq="d",
periods=4,
)._with_freq(None),
pd.date_range(
"2020-08-30", freq="d", periods=4, tz="US/Central"
)._with_freq(None),
pd.MultiIndex.from_product(
[
pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
["x", "y"],
],
),
],
)
@pytest.mark.parametrize(
"vals",
[
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
{
"timezones": pd.date_range(
"2020-08-30", freq="d", periods=4, tz="Europe/London"
)
},
],
)
def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
# GH 35973
df = DataFrame(vals, index=idx)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
def test_comprehensive(self):
df = DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
# 'D': pd.timedelta_range('1h', periods=4, freq='min'),
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
"G": [1.1, 2.2, 3.3, 4.4],
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
"I": [True, False, False, True],
},
index=pd.Index(range(4), name="idx"),
)
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize(
"index_names",
[[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
)
def test_multiindex(self, index_names):
# GH 18912
df = DataFrame(
[["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
index=[["A", "B"], ["Null", "Eins"]],
columns=["Aussprache", "Griechisch", "Args"],
)
df.index.names = index_names
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
def test_empty_frame_roundtrip(self):
# GH 21287
df = DataFrame(columns=["a", "b", "c"])
expected = df.copy()
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(expected, result)
def test_read_json_orient_table_old_schema_version(self):
df_json = """
{
"schema":{
"fields":[
{"name":"index","type":"integer"},
{"name":"a","type":"string"}
],
"primaryKey":["index"],
"pandas_version":"0.20.0"
},
"data":[
{"index":0,"a":1},
{"index":1,"a":2.0},
{"index":2,"a":"s"}
]
}
"""
expected = DataFrame({"a": [1, 2.0, "s"]})
result = pd.read_json(StringIO(df_json), orient="table")
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"])
def test_read_json_table_orient_period_depr_freq(self, freq, recwarn):
# GH#9586
df = DataFrame(
{"ints": [1, 2]},
index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq),
)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)

View File

@@ -0,0 +1,317 @@
"""Tests for ExtensionDtype Table Schema integration."""
from collections import OrderedDict
import datetime as dt
import decimal
from io import StringIO
import json
import pytest
from pandas import (
NA,
DataFrame,
Index,
array,
read_json,
)
import pandas._testing as tm
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.series import Series
from pandas.tests.extension.date import (
DateArray,
DateDtype,
)
from pandas.tests.extension.decimal.array import (
DecimalArray,
DecimalDtype,
)
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
)
class TestBuildSchema:
def test_build_table_schema(self):
df = DataFrame(
{
"A": DateArray([dt.date(2021, 10, 10)]),
"B": DecimalArray([decimal.Decimal(10)]),
"C": array(["pandas"], dtype="string"),
"D": array([10], dtype="Int64"),
}
)
result = build_table_schema(df, version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "A", "type": "any", "extDtype": "DateDtype"},
{"name": "B", "type": "number", "extDtype": "decimal"},
{"name": "C", "type": "any", "extDtype": "string"},
{"name": "D", "type": "integer", "extDtype": "Int64"},
],
"primaryKey": ["index"],
}
assert result == expected
result = build_table_schema(df)
assert "pandas_version" in result
class TestTableSchemaType:
@pytest.mark.parametrize(
"date_data",
[
DateArray([dt.date(2021, 10, 10)]),
DateArray(dt.date(2021, 10, 10)),
Series(DateArray(dt.date(2021, 10, 10))),
],
)
def test_as_json_table_type_ext_date_array_dtype(self, date_data):
assert as_json_table_type(date_data.dtype) == "any"
def test_as_json_table_type_ext_date_dtype(self):
assert as_json_table_type(DateDtype()) == "any"
@pytest.mark.parametrize(
"decimal_data",
[
DecimalArray([decimal.Decimal(10)]),
Series(DecimalArray([decimal.Decimal(10)])),
],
)
def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
assert as_json_table_type(decimal_data.dtype) == "number"
def test_as_json_table_type_ext_decimal_dtype(self):
assert as_json_table_type(DecimalDtype()) == "number"
@pytest.mark.parametrize(
"string_data",
[
array(["pandas"], dtype="string"),
Series(array(["pandas"], dtype="string")),
],
)
def test_as_json_table_type_ext_string_array_dtype(self, string_data):
assert as_json_table_type(string_data.dtype) == "any"
def test_as_json_table_type_ext_string_dtype(self):
assert as_json_table_type(StringDtype()) == "any"
@pytest.mark.parametrize(
"integer_data",
[
array([10], dtype="Int64"),
Series(array([10], dtype="Int64")),
],
)
def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
assert as_json_table_type(integer_data.dtype) == "integer"
def test_as_json_table_type_ext_integer_dtype(self):
assert as_json_table_type(Int64Dtype()) == "integer"
class TestTableOrient:
@pytest.fixture
def da(self):
return DateArray([dt.date(2021, 10, 10)])
@pytest.fixture
def dc(self):
return DecimalArray([decimal.Decimal(10)])
@pytest.fixture
def sa(self):
return array(["pandas"], dtype="string")
@pytest.fixture
def ia(self):
return array([10], dtype="Int64")
@pytest.fixture
def df(self, da, dc, sa, ia):
return DataFrame(
{
"A": da,
"B": dc,
"C": sa,
"D": ia,
}
)
def test_build_date_series(self, da):
s = Series(da, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "any", "extDtype": "DateDtype"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
]
)
assert result == expected
def test_build_decimal_series(self, dc):
s = Series(dc, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "number", "extDtype": "decimal"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", 10.0)])]),
]
)
assert result == expected
def test_build_string_series(self, sa):
s = Series(sa, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "any", "extDtype": "string"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", "pandas")])]),
]
)
assert result == expected
def test_build_int64_series(self, ia):
s = Series(ia, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "integer", "extDtype": "Int64"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", 10)])]),
]
)
assert result == expected
def test_to_json(self, df):
df = df.copy()
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
OrderedDict({"name": "idx", "type": "integer"}),
OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}),
OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}),
OrderedDict({"name": "C", "type": "any", "extDtype": "string"}),
OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}),
]
schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})
data = [
OrderedDict(
[
("idx", 0),
("A", "2021-10-10T00:00:00.000"),
("B", 10.0),
("C", "pandas"),
("D", 10),
]
)
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_json_ext_dtype_reading_roundtrip(self):
# GH#40255
df = DataFrame(
{
"a": Series([2, NA], dtype="Int64"),
"b": Series([1.5, NA], dtype="Float64"),
"c": Series([True, NA], dtype="boolean"),
},
index=Index([1, NA], dtype="Int64"),
)
expected = df.copy()
data_json = df.to_json(orient="table", indent=4)
result = read_json(StringIO(data_json), orient="table")
tm.assert_frame_equal(result, expected)
def test_json_ext_dtype_reading(self):
# GH#40255
data_json = """{
"schema":{
"fields":[
{
"name":"a",
"type":"integer",
"extDtype":"Int64"
}
]
},
"data":[
{
"a":2
},
{
"a":null
}
]
}"""
result = read_json(StringIO(data_json), orient="table")
expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
tm.assert_frame_equal(result, expected)
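A condensed sketch of the GH#40255 behavior covered here: extension dtypes survive an orient="table" round trip via the extDtype field (illustrative):

from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": pd.array([2, None], dtype="Int64")})
payload = df.to_json(orient="table")
back = pd.read_json(StringIO(payload), orient="table")
assert str(back["a"].dtype) == "Int64"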

View File

@@ -0,0 +1,907 @@
import json
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
json_normalize,
)
import pandas._testing as tm
from pandas.io.json._normalize import nested_to_record
@pytest.fixture
def deep_nested():
# deeply nested data
return [
{
"country": "USA",
"states": [
{
"name": "California",
"cities": [
{"name": "San Francisco", "pop": 12345},
{"name": "Los Angeles", "pop": 12346},
],
},
{
"name": "Ohio",
"cities": [
{"name": "Columbus", "pop": 1234},
{"name": "Cleveland", "pop": 1236},
],
},
],
},
{
"country": "Germany",
"states": [
{"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
{
"name": "Nordrhein-Westfalen",
"cities": [
{"name": "Duesseldorf", "pop": 1238},
{"name": "Koeln", "pop": 1239},
],
},
],
},
]
@pytest.fixture
def state_data():
return [
{
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
"info": {"governor": "Rick Scott"},
"shortname": "FL",
"state": "Florida",
},
{
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
"info": {"governor": "John Kasich"},
"shortname": "OH",
"state": "Ohio",
},
]
@pytest.fixture
def author_missing_data():
return [
{"info": None},
{
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
"author_name": {"first": "Jane", "last_name": "Doe"},
},
]
@pytest.fixture
def missing_metadata():
return [
{
"name": "Alice",
"addresses": [
{
"number": 9562,
"street": "Morris St.",
"city": "Massillon",
"state": "OH",
"zip": 44646,
}
],
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
},
{
"addresses": [
{
"number": 8449,
"street": "Spring St.",
"city": "Elizabethton",
"state": "TN",
"zip": 37643,
}
],
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
},
]
@pytest.fixture
def max_level_test_input_data():
"""
Input data to test json_normalize with the max_level param.
"""
return [
{
"CreatedBy": {"Name": "User001"},
"Lookup": {
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
"Image": {"a": "b"},
}
]
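For readers new to the API under test: json_normalize flattens nested mappings into dotted column names. A minimal sketch (illustrative):

import pandas as pd

records = [
    {"id": 1, "info": {"name": "Ann", "age": 30}},
    {"id": 2, "info": {"name": "Bob", "age": 40}},
]
flat = pd.json_normalize(records)
# columns: id, info.name, info.age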
class TestJSONNormalize:
def test_simple_records(self):
recs = [
{"a": 1, "b": 2, "c": 3},
{"a": 4, "b": 5, "c": 6},
{"a": 7, "b": 8, "c": 9},
{"a": 10, "b": 11, "c": 12},
]
result = json_normalize(recs)
expected = DataFrame(recs)
tm.assert_frame_equal(result, expected)
def test_simple_normalize(self, state_data):
result = json_normalize(state_data[0], "counties")
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
result = json_normalize(state_data, "counties")
expected = []
for rec in state_data:
expected.extend(rec["counties"])
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
result = json_normalize(state_data, "counties", meta="state")
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
tm.assert_frame_equal(result, expected)
def test_fields_list_type_normalize(self):
parse_metadata_fields_list_type = [
{"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
]
result = json_normalize(
parse_metadata_fields_list_type,
record_path=["values"],
meta=[["metadata", "listdata"]],
)
expected = DataFrame(
{0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
)
tm.assert_frame_equal(result, expected)
def test_empty_array(self):
result = json_normalize([])
expected = DataFrame()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data, record_path, exception_type",
[
([{"a": 0}, {"a": 1}], None, None),
({"a": [{"a": 0}, {"a": 1}]}, "a", None),
('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
(None, None, NotImplementedError),
],
)
def test_accepted_input(self, data, record_path, exception_type):
if exception_type is not None:
with pytest.raises(exception_type, match=""):
json_normalize(data, record_path=record_path)
else:
result = json_normalize(data, record_path=record_path)
expected = DataFrame([0, 1], columns=["a"])
tm.assert_frame_equal(result, expected)
def test_simple_normalize_with_separator(self, deep_nested):
# GH 14883
result = json_normalize({"A": {"A": 1, "B": 2}})
expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize(
deep_nested,
["states", "cities"],
meta=["country", ["states", "name"]],
sep="_",
)
expected = Index(["name", "pop", "country", "states_name"]).sort_values()
assert result.columns.sort_values().equals(expected)
def test_normalize_with_multichar_separator(self):
# GH #43831
data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
result = json_normalize(data, sep="__")
expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
tm.assert_frame_equal(result, expected)
def test_value_array_record_prefix(self):
# GH 21536
result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
expected = DataFrame([[1], [2]], columns=["Prefix.0"])
tm.assert_frame_equal(result, expected)
def test_nested_object_record_path(self):
# GH 22706
data = {
"state": "Florida",
"info": {
"governor": "Rick Scott",
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
}
result = json_normalize(data, record_path=["info", "counties"])
expected = DataFrame(
[["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
columns=["name", "population"],
)
tm.assert_frame_equal(result, expected)
def test_more_deeply_nested(self, deep_nested):
result = json_normalize(
deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
)
ex_data = {
"country": ["USA"] * 4 + ["Germany"] * 3,
"states.name": [
"California",
"California",
"Ohio",
"Ohio",
"Bayern",
"Nordrhein-Westfalen",
"Nordrhein-Westfalen",
],
"name": [
"San Francisco",
"Los Angeles",
"Columbus",
"Cleveland",
"Munich",
"Duesseldorf",
"Koeln",
],
"pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
}
expected = DataFrame(ex_data, columns=result.columns)
tm.assert_frame_equal(result, expected)
def test_shallow_nested(self):
data = [
{
"state": "Florida",
"shortname": "FL",
"info": {"governor": "Rick Scott"},
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
{
"state": "Ohio",
"shortname": "OH",
"info": {"governor": "John Kasich"},
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
},
]
result = json_normalize(
data, "counties", ["state", "shortname", ["info", "governor"]]
)
ex_data = {
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
"state": ["Florida"] * 3 + ["Ohio"] * 2,
"shortname": ["FL", "FL", "FL", "OH", "OH"],
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
"population": [12345, 40000, 60000, 1234, 1337],
}
expected = DataFrame(ex_data, columns=result.columns)
tm.assert_frame_equal(result, expected)
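record_path and meta combine as above; isolated to its essentials (illustrative):

import pandas as pd

data = [{"state": "FL", "counties": [{"name": "Dade", "population": 12345}]}]
pd.json_normalize(data, record_path="counties", meta="state")
#    name  population state
# 0  Dade       12345    FL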
def test_nested_meta_path_with_nested_record_path(self, state_data):
# GH 27220
result = json_normalize(
data=state_data,
record_path=["counties"],
meta=["state", "shortname", ["info", "governor"]],
errors="ignore",
)
ex_data = {
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
"population": [12345, 40000, 60000, 1234, 1337],
"state": ["Florida"] * 3 + ["Ohio"] * 2,
"shortname": ["FL"] * 3 + ["OH"] * 2,
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
}
expected = DataFrame(ex_data)
tm.assert_frame_equal(result, expected)
def test_meta_name_conflict(self):
data = [
{
"foo": "hello",
"bar": "there",
"data": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
with pytest.raises(ValueError, match=msg):
json_normalize(data, "data", meta=["foo", "bar"])
result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
for val in ["metafoo", "metabar", "foo", "bar"]:
assert val in result
def test_meta_parameter_not_modified(self):
# GH 18610
data = [
{
"foo": "hello",
"bar": "there",
"data": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
COLUMNS = ["foo", "bar"]
result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
assert COLUMNS == ["foo", "bar"]
for val in ["metafoo", "metabar", "foo", "bar"]:
assert val in result
def test_record_prefix(self, state_data):
result = json_normalize(state_data[0], "counties")
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
result = json_normalize(
state_data, "counties", meta="state", record_prefix="county_"
)
expected = []
for rec in state_data:
expected.extend(rec["counties"])
expected = DataFrame(expected)
expected = expected.rename(columns=lambda x: "county_" + x)
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
tm.assert_frame_equal(result, expected)
def test_non_ascii_key(self):
testjson = (
b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
).decode("utf8")
testdata = {
b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
"sub.A": [1, 3],
"sub.B": [2, 4],
}
expected = DataFrame(testdata)
result = json_normalize(json.loads(testjson))
tm.assert_frame_equal(result, expected)
def test_missing_field(self, author_missing_data):
# GH20030:
result = json_normalize(author_missing_data)
ex_data = [
{
"info": np.nan,
"info.created_at": np.nan,
"info.last_updated": np.nan,
"author_name.first": np.nan,
"author_name.last_name": np.nan,
},
{
"info": None,
"info.created_at": "11/08/1993",
"info.last_updated": "26/05/2012",
"author_name.first": "Jane",
"author_name.last_name": "Doe",
},
]
expected = DataFrame(ex_data)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"max_level,expected",
[
(
0,
[
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
],
),
(
1,
[
{
"TextField": "Some text",
"UserField.Id": "ID001",
"UserField.Name": "Name001",
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
{
"TextField": "Some text",
"UserField.Id": "ID001",
"UserField.Name": "Name001",
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
],
),
],
)
def test_max_level_with_records_path(self, max_level, expected):
# GH23843: Enhanced JSON normalize
test_input = [
{
"CreatedBy": {"Name": "User001"},
"Lookup": [
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
],
"Image": {"a": "b"},
"tags": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
result = json_normalize(
test_input,
record_path=["Lookup"],
meta=[["CreatedBy"], ["Image"]],
max_level=max_level,
)
expected_df = DataFrame(data=expected, columns=result.columns.values)
tm.assert_equal(expected_df, result)
def test_nested_flattening_consistent(self):
# see gh-21537
df1 = json_normalize([{"A": {"B": 1}}])
df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
# They should be the same.
tm.assert_frame_equal(df1, df2)
def test_nonetype_record_path(self, nulls_fixture):
# see gh-30148
# should not raise TypeError
result = json_normalize(
[
{"state": "Texas", "info": nulls_fixture},
{"state": "Florida", "info": [{"i": 2}]},
],
record_path=["info"],
)
expected = DataFrame({"i": 2}, index=[0])
tm.assert_equal(result, expected)
@pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
def test_non_list_record_path_errors(self, value):
# see gh-30148, GH 26284
parsed_value = json.loads(value)
test_input = {"state": "Texas", "info": parsed_value}
test_path = "info"
msg = (
f"{test_input} has non list value {parsed_value} for path {test_path}. "
"Must be list or null."
)
with pytest.raises(TypeError, match=msg):
json_normalize([test_input], record_path=[test_path])
def test_meta_non_iterable(self):
# GH 31507
data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
expected = DataFrame(
{"one": [1], "two": [2], "id": np.array([99], dtype=object)}
)
tm.assert_frame_equal(result, expected)
def test_generator(self, state_data):
# GH35923 Fix pd.json_normalize to not skip the first element of a
# generator input
def generator_data():
yield from state_data[0]["counties"]
result = json_normalize(generator_data())
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
def test_top_column_with_leading_underscore(self):
# 49861
data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
result = json_normalize(data, sep="_")
expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
tm.assert_frame_equal(result, expected)
class TestNestedToRecord:
def test_flat_stays_flat(self):
recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
result = nested_to_record(recs)
expected = recs
assert result == expected
def test_one_level_deep_flattens(self):
data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
result = nested_to_record(data)
expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
assert result == expected
def test_nested_flattens(self):
data = {
"flat1": 1,
"dict1": {"c": 1, "d": 2},
"nested": {"e": {"c": 1, "d": 2}, "d": 2},
}
result = nested_to_record(data)
expected = {
"dict1.c": 1,
"dict1.d": 2,
"flat1": 1,
"nested.d": 2,
"nested.e.c": 1,
"nested.e.d": 2,
}
assert result == expected
def test_json_normalize_errors(self, missing_metadata):
# GH14583:
# If meta keys are not always present a new option to set
# errors='ignore' has been implemented
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
record_path="addresses",
meta="name",
errors="raise",
)
def test_missing_meta(self, missing_metadata):
# GH25468
# If metadata is nullable with errors set to ignore, the null values
# should be numpy.nan values
result = json_normalize(
data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
)
ex_data = [
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
]
columns = ["number", "street", "city", "state", "zip", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)
def test_missing_nested_meta(self):
# GH44312
# If errors="ignore" and nested metadata is null, we should return nan
data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
result = json_normalize(
data,
record_path="value",
meta=["meta", ["nested_meta", "leaf"]],
errors="ignore",
)
ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
columns = ["rec", "meta", "nested_meta.leaf"]
expected = DataFrame(ex_data, columns=columns).astype(
{"nested_meta.leaf": object}
)
tm.assert_frame_equal(result, expected)
# If errors="raise" and nested metadata is null, we should raise with the
# key of the first missing level
with pytest.raises(KeyError, match="'leaf' not found"):
json_normalize(
data,
record_path="value",
meta=["meta", ["nested_meta", "leaf"]],
errors="raise",
)
def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
# GH41876
# Ensure errors='raise' works as intended even when a record_path of length
# greater than one is passed in
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="raise",
)
def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
# GH41876
# Ensure errors='ignore' works as intended even when a record_path of length
# greater than one is passed in
result = json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="ignore",
)
ex_data = [
["Foo York City", "Alice"],
["Barmingham", np.nan],
]
columns = ["city_name", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)
def test_donot_drop_nonevalues(self):
# GH21356
data = [
{"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
{
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
"author_name": {"first": "Jane", "last_name": "Doe"},
},
]
result = nested_to_record(data)
expected = [
{
"info": None,
"author_name.first": "Smith",
"author_name.last_name": "Appleseed",
},
{
"author_name.first": "Jane",
"author_name.last_name": "Doe",
"info.created_at": "11/08/1993",
"info.last_updated": "26/05/2012",
},
]
assert result == expected
def test_nonetype_top_level_bottom_level(self):
# GH21158: If inner level json has a key with a null value
# make sure it does not do a new_d.pop twice and except
data = {
"id": None,
"location": {
"country": {
"state": {
"id": None,
"town.info": {
"id": None,
"region": None,
"x": 49.151580810546875,
"y": -33.148521423339844,
"z": 27.572303771972656,
},
}
}
},
}
result = nested_to_record(data)
expected = {
"id": None,
"location.country.state.id": None,
"location.country.state.town.info.id": None,
"location.country.state.town.info.region": None,
"location.country.state.town.info.x": 49.151580810546875,
"location.country.state.town.info.y": -33.148521423339844,
"location.country.state.town.info.z": 27.572303771972656,
}
assert result == expected
def test_nonetype_multiple_levels(self):
# GH21158: If inner level json has a key with a null value
# make sure it does not do a new_d.pop twice and except
data = {
"id": None,
"location": {
"id": None,
"country": {
"id": None,
"state": {
"id": None,
"town.info": {
"region": None,
"x": 49.151580810546875,
"y": -33.148521423339844,
"z": 27.572303771972656,
},
},
},
},
}
result = nested_to_record(data)
expected = {
"id": None,
"location.id": None,
"location.country.id": None,
"location.country.state.id": None,
"location.country.state.town.info.region": None,
"location.country.state.town.info.x": 49.151580810546875,
"location.country.state.town.info.y": -33.148521423339844,
"location.country.state.town.info.z": 27.572303771972656,
}
assert result == expected
@pytest.mark.parametrize(
"max_level, expected",
[
(
None,
[
{
"CreatedBy.Name": "User001",
"Lookup.TextField": "Some text",
"Lookup.UserField.Id": "ID001",
"Lookup.UserField.Name": "Name001",
"Image.a": "b",
}
],
),
(
0,
[
{
"CreatedBy": {"Name": "User001"},
"Lookup": {
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
"Image": {"a": "b"},
}
],
),
(
1,
[
{
"CreatedBy.Name": "User001",
"Lookup.TextField": "Some text",
"Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
"Image.a": "b",
}
],
),
],
)
def test_with_max_level(self, max_level, expected, max_level_test_input_data):
# GH23843: Enhanced JSON normalize
output = nested_to_record(max_level_test_input_data, max_level=max_level)
assert output == expected
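max_level caps flattening depth, as the parametrization above shows; a minimal sketch (illustrative):

from pandas.io.json._normalize import nested_to_record

nested_to_record({"a": {"b": {"c": 1}}}, max_level=1)
# {'a.b': {'c': 1}}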
def test_with_large_max_level(self):
# GH23843: Enhanced JSON normalize
max_level = 100
input_data = [
{
"CreatedBy": {
"user": {
"name": {"firstname": "Leo", "LastName": "Thomson"},
"family_tree": {
"father": {
"name": "Father001",
"father": {
"Name": "Father002",
"father": {
"name": "Father003",
"father": {"Name": "Father004"},
},
},
}
},
}
}
}
]
expected = [
{
"CreatedBy.user.name.firstname": "Leo",
"CreatedBy.user.name.LastName": "Thomson",
"CreatedBy.user.family_tree.father.name": "Father001",
"CreatedBy.user.family_tree.father.father.Name": "Father002",
"CreatedBy.user.family_tree.father.father.father.name": "Father003",
"CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501
}
]
output = nested_to_record(input_data, max_level=max_level)
assert output == expected
def test_series_non_zero_index(self):
# GH 19020
data = {
0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
}
s = Series(data)
s.index = [1, 2, 3]
result = json_normalize(s)
expected = DataFrame(
{
"id": [1, 2, 3],
"name": ["Foo", "Bar", "Baz"],
"elements.a": [1.0, np.nan, np.nan],
"elements.b": [np.nan, 2.0, np.nan],
"elements.c": [np.nan, np.nan, 3.0],
}
)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large

View File

@@ -0,0 +1,543 @@
from collections.abc import Iterator
from io import StringIO
from pathlib import Path
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
read_json,
)
import pandas._testing as tm
from pandas.io.json._json import JsonReader
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def lines_json_df():
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
return df.to_json(lines=True, orient="records")
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
if request.param == "pyarrow":
pytest.importorskip("pyarrow.json")
return request.param
def test_read_jsonl():
# GH9180
result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_read_jsonl_engine_pyarrow(datapath, engine):
result = read_json(
datapath("io", "json", "data", "line_delimited.json"),
lines=True,
engine=engine,
)
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
tm.assert_frame_equal(result, expected)
def test_read_datetime(request, engine):
# GH33787
if engine == "pyarrow":
# GH 48893
reason = "Pyarrow only supports a file path as an input and line delimited json"
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
df = DataFrame(
[([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
columns=["accounts", "date", "name"],
)
json_line = df.to_json(lines=True, orient="records")
result = read_json(StringIO(json_line), engine=engine)
expected = DataFrame(
[[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
columns=["accounts", "date", "name"],
)
tm.assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars():
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK
# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(StringIO(json), lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_to_jsonl():
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
def test_to_jsonl_count_new_lines():
# GH36888
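    # to_json terminates every record with "\n", so two rows must yield
    # exactly two newline characters.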
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
expected_new_lines_count = 2
assert actual_new_lines_count == expected_new_lines_count
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
    # Basic test that read_json with a chunksize gives the same result as
    # reading without one
# GH17048: memory usage when lines=True
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
unchunked = read_json(StringIO(lines_json_df), lines=True)
with read_json(
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
tm.assert_frame_equal(chunked, unchunked)
def test_readjson_chunksize_requires_lines(lines_json_df, engine):
msg = "chunksize can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
with read_json(
StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
) as _:
pass
def test_readjson_chunks_series(request, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason))
# Test reading line-format JSON to Series with chunksize param
s = pd.Series({"A": 1, "B": 2})
strio = StringIO(s.to_json(lines=True, orient="records"))
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
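    # Re-create the buffer: the unchunked read above consumed the StringIO.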
strio = StringIO(s.to_json(lines=True, orient="records"))
with read_json(
strio, lines=True, typ="Series", chunksize=1, engine=engine
) as reader:
chunked = pd.concat(reader)
tm.assert_series_equal(chunked, unchunked)
def test_readjson_each_chunk(request, lines_json_df, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
    # Other tests check that the final result of reading with a chunksize
    # is correct. This checks the intermediate chunks.
with read_json(
StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
) as reader:
chunks = list(reader)
assert chunks[0].shape == (2, 2)
assert chunks[1].shape == (1, 2)
def test_readjson_chunks_from_file(request, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
chunked = pd.concat(reader)
unchunked = read_json(path, lines=True, engine=engine)
tm.assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
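        # Construct JsonReader directly so the underlying handle can be
        # inspected; it must be closed after both eager (chunksize=None)
        # and chunked reads.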
reader = JsonReader(
path,
orient=None,
typ="frame",
dtype=True,
convert_axes=True,
convert_dates=True,
keep_default_dates=True,
precise_float=False,
date_unit=None,
encoding=None,
lines=True,
chunksize=chunksize,
compression=None,
nrows=None,
)
with reader:
reader.read()
assert (
reader.handles.handle.closed
), f"didn't close stream with chunksize = {chunksize}"
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
msg = r"'chunksize' must be an integer >=1"
with pytest.raises(ValueError, match=msg):
with read_json(
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
) as _:
pass
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
j = """
{"A":1,"B":4}
{"A":2,"B":5}
{"A":3,"B":6}
"""
orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
test = read_json(StringIO(j), lines=True, chunksize=chunksize)
if chunksize is not None:
with test:
test = pd.concat(test)
tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
def test_readjson_unicode(request, monkeypatch, engine):
if engine == "pyarrow":
# GH 48893
        reason = "Pyarrow only supports a file path as an input and line delimited json"
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
with tm.ensure_clean("test.json") as path:
monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
with open(path, "w", encoding="utf-8") as f:
f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
result = read_json(path, engine=engine)
expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows, engine):
# GH 33916
    # Test reading line-format JSON with the nrows parameter
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
result = read_json(StringIO(jsonl), lines=True, nrows=nrows)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
# GH 33916
    # Test reading line-format JSON with nrows and chunksize parameters
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
if engine != "pyarrow":
with read_json(
StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
else:
with read_json(
jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(chunked, expected)
def test_readjson_nrows_requires_lines(engine):
# GH 33916
# Test ValueError raised if nrows is set without setting lines in read_json
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
msg = "nrows can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
        read_json(StringIO(jsonl), lines=False, nrows=2, engine=engine)
def test_readjson_lines_chunks_fileurl(request, datapath, engine):
# GH 27135
# Test reading line-format JSON from file url
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
df_list_expected = [
DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
]
os_path = datapath("io", "json", "data", "line_delimited.json")
file_url = Path(os_path).as_uri()
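    # Path.as_uri() yields a file:// URL, exercising the URL-handling code
    # path rather than a plain filesystem path.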
with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
        for index, chunk in enumerate(url_reader):
            tm.assert_frame_equal(chunk, df_list_expected[index])
def test_chunksize_is_incremental():
# See https://github.com/pandas-dev/pandas/issues/34548
jsonl = (
"""{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
* 1000
)
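    # Wrapper that counts read() calls: a chunked reader should issue many
    # small reads instead of slurping the whole buffer in one call.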
class MyReader:
def __init__(self, contents) -> None:
self.read_count = 0
self.stringio = StringIO(contents)
def read(self, *args):
self.read_count += 1
return self.stringio.read(*args)
def __iter__(self) -> Iterator:
self.read_count += 1
return iter(self.stringio)
reader = MyReader(jsonl)
assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
assert reader.read_count > 10
@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
# GH 35849
# Test ValueError when orient is not 'records'
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
r"mode='a' \(append\) is only supported when "
"lines is True and orient is 'records'"
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode="a", orient=orient_)
def test_to_json_append_lines():
# GH 35849
# Test ValueError when lines is not True
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
r"mode='a' \(append\) is only supported when "
"lines is True and orient is 'records'"
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode="a", lines=False, orient="records")
@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
# GH 35849
# Test ValueError when mode is not supported option
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
f"mode={mode_} is not a valid option."
"Only 'w' and 'a' are currently supported."
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode=mode_, lines=False, orient="records")
def test_to_json_append_output_consistent_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing same columns, new rows
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_inconsistent_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing one new column, one old column, new rows
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
expected = DataFrame(
{
"col1": [1, 2, None, None],
"col2": ["a", "b", "e", "f"],
"col3": [np.nan, np.nan, "!", "#"],
}
)
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_different_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing same, differing and new columns
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
df4 = DataFrame({"col4": [True, False]})
expected = DataFrame(
{
"col1": [1, 2, 3, 4, None, None, None, None],
"col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
"col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
"col4": [None, None, None, None, None, None, True, False],
}
).astype({"col4": "float"})
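    # Rows written before df4 read back with col4 missing (NaN), so the
    # boolean values are upcast to float (True -> 1.0, False -> 0.0).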
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, mode="a", lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
df4.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_different_columns_reordered():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing specific result column order.
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
df4 = DataFrame({"col4": [True, False]})
# df4, df3, df2, df1 (in that order)
expected = DataFrame(
{
"col4": [True, False, None, None, None, None, None, None],
"col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
"col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
"col1": [None, None, None, None, 3, 4, 1, 2],
}
).astype({"col4": "float"})
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df4.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
df1.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large