Updated script that can be controlled by a Node.js web app

2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions
--- a/lib/python3.13/site-packages/pandas/io/json/_normalize.py
+++ b/lib/python3.13/site-packages/pandas/io/json/_normalize.py
@ -0,0 +1,544 @@
+# ---------------------------------------------------------------------
+# JSON normalization routines
+from __future__ import annotations
+
+from collections import (
+    abc,
+    defaultdict,
+)
+import copy
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    DefaultDict,
+)
+
+import numpy as np
+
+from pandas._libs.writers import convert_json_to_lines
+
+import pandas as pd
+from pandas import DataFrame
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from pandas._typing import (
+        IgnoreRaise,
+        Scalar,
+    )
+
+
+def convert_to_line_delimits(s: str) -> str:
+    """
+    Helper function that converts JSON lists to line delimited JSON.
+
+    Parameters
+    ----------
+    s : str
+        Serialized JSON.
+
+    Returns
+    -------
+    str
+        ``s`` unchanged when it is not wrapped in ``[`` ... ``]`` (this
+        includes the empty string); otherwise the result of
+        ``convert_json_to_lines`` applied to the text between the brackets.
+    """
+    # Only a JSON list can be turned into lines, anything else is returned
+    # untouched.  The bracket test must be negated as a whole: the unparenthesized
+    # ``not a and b`` parses as ``(not a) and b`` and lets objects such as
+    # '{"a": 1}' fall through and lose their first and last characters.
+    if not (s.startswith("[") and s.endswith("]")):
+        return s
+    s = s[1:-1]
+
+    return convert_json_to_lines(s)
+
+
+def nested_to_record(
+    ds,
+    prefix: str = "",
+    sep: str = ".",
+    level: int = 0,
+    max_level: int | None = None,
+):
+    """
+    A simplified json_normalize
+
+    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+    prefix: the prefix, optional, default: ""
+        Flattened name of the parent; prepended (with ``sep``) to every key
+        when ``level`` is not 0.
+    sep : str, default '.'
+        Nested records will generate names separated by sep,
+        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+    level: int, optional, default: 0
+        Current nesting depth; 0 for the outermost call. Used by the
+        recursion, keys are only renamed when it is not 0.
+
+    max_level: int, optional, default: None
+        The max depth to normalize. Dict values found at ``level >= max_level``
+        are kept as they are instead of being flattened.
+
+    Returns
+    -------
+    d - dict or list of dicts, matching `ds`
+        The input is not modified; every record is built from a deep copy.
+
+    Notes
+    -----
+    Keys that are not strings are converted with ``str`` when a flattened
+    name is built. Top-level keys whose value is not flattened keep their
+    original type.
+
+    Examples
+    --------
+    >>> nested_to_record(
+    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
+    ... )
+    {\
+'flat1': 1, \
+'dict1.c': 1, \
+'dict1.d': 2, \
+'nested.e.c': 1, \
+'nested.e.d': 2, \
+'nested.d': 2\
+}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+    new_ds = []
+    for d in ds:
+        new_d = copy.deepcopy(d)
+        for orig_key, v in d.items():
+            # ``new_d`` is still keyed by the original object, so the original
+            # key is kept for ``pop``; the stringified form is only used to
+            # build the flattened name.  Popping the stringified key raised
+            # KeyError for non-string keys.
+            k = orig_key if isinstance(orig_key, str) else str(orig_key)
+            if level == 0:
+                newkey = k
+            else:
+                newkey = prefix + sep + k
+
+            # flatten if type is dict and
+            # current dict level  < maximum level provided and
+            # only dicts gets recurse-flattened
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict) or (
+                max_level is not None and level >= max_level
+            ):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = new_d.pop(orig_key)
+                    new_d[newkey] = v
+                continue
+
+            # Replace the nested dict by its flattened entries.
+            v = new_d.pop(orig_key)
+            new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
+        new_ds.append(new_d)
+
+    if singleton:
+        return new_ds[0]
+    return new_ds
+
+
+def _normalise_json(
+    data: Any,
+    key_string: str,
+    normalized_dict: dict[str, Any],
+    separator: str,
+) -> dict[str, Any]:
+    """
+    Recursively flatten ``data`` into ``normalized_dict``.
+
+    Fast path for the plain ``pd.json_normalize(data)`` call, see #15621.
+
+    Parameters
+    ----------
+    data : Any
+        Value to flatten. A dict is descended into, anything else is stored
+        as a leaf.
+    key_string : str
+        Flattened key accumulated so far; empty for the outermost call.
+    normalized_dict : dict
+        Accumulator, filled in place.
+    separator : str
+        Nested records will generate names separated by sep,
+        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+
+    Returns
+    -------
+    dict
+        The same ``normalized_dict`` object that was passed in. An empty dict
+        in ``data`` contributes no entry.
+    """
+    # Leaf: store the value under the key built up so far.
+    if not isinstance(data, dict):
+        normalized_dict[key_string] = data
+        return normalized_dict
+
+    at_root = not key_string
+    for child_key, child_value in data.items():
+        flat_key = f"{key_string}{separator}{child_key}"
+        if at_root:
+            # There is no parent name yet, drop the separator that the
+            # f-string just put in front of the key.
+            flat_key = flat_key.removeprefix(separator)
+        _normalise_json(child_value, flat_key, normalized_dict, separator)
+    return normalized_dict
+
+
+def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
+    """
+    Flatten one record, keeping its non-dict top level entries first.
+
+    Parameters
+    ----------
+    data : dict
+        The record to flatten.
+    separator : str
+        Nested records will generate names separated by sep,
+        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+
+    Returns
+    -------
+    dict
+        Top level entries whose value is not a dict, in their original order,
+        followed by the flattened entries of the dict-valued ones.
+    """
+    # Split the record in one pass: plain values vs. values that need recursion.
+    plain_part: dict = {}
+    nested_part: dict = {}
+    for key, value in data.items():
+        bucket = nested_part if isinstance(value, dict) else plain_part
+        bucket[key] = value
+
+    flattened = _normalise_json(nested_part, "", {}, separator)
+    # Flattened names are merged last and win over an equal plain key.
+    plain_part.update(flattened)
+    return plain_part
+
+
+def _simple_json_normalize(
+    ds: dict | list[dict],
+    sep: str = ".",
+) -> dict | list[dict] | Any:
+    """
+    An optimized basic json_normalize
+
+    Converts a nested dict into a flat dict ("record"), unlike
+    json_normalize and nested_to_record it doesn't do anything clever.
+    But for the most basic use cases it enhances performance.
+    E.g. pd.json_normalize(data)
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+    sep : str, default '.'
+        Nested records will generate names separated by sep,
+        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+
+    Returns
+    -------
+    dict or list
+        A flat dict for a dict input, a list with one normalized entry per
+        element for a list input, and an empty dict for any other input.
+
+    Examples
+    --------
+    >>> _simple_json_normalize(
+    ...     {
+    ...         "flat1": 1,
+    ...         "dict1": {"c": 1, "d": 2},
+    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
+    ...     }
+    ... )
+    {\
+'flat1': 1, \
+'dict1.c': 1, \
+'dict1.d': 2, \
+'nested.e.c': 1, \
+'nested.e.d': 2, \
+'nested.d': 2\
+}
+
+    """
+    # A dictionary is the expected shape, as most jsons are.
+    if isinstance(ds, dict):
+        return _normalise_json_ordered(ds, sep)
+    # Lists are perfectly valid too: normalize element by element.
+    if isinstance(ds, list):
+        return [_simple_json_normalize(row, sep=sep) for row in ds]
+    # Anything else yields an empty record.
+    return {}
+
+
+def json_normalize(
+    data: dict | list[dict],
+    record_path: str | list | None = None,
+    meta: str | list[str | list[str]] | None = None,
+    meta_prefix: str | None = None,
+    record_prefix: str | None = None,
+    errors: IgnoreRaise = "raise",
+    sep: str = ".",
+    max_level: int | None = None,
+) -> DataFrame:
+    """
+    Normalize semi-structured JSON data into a flat table.
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects.
+    record_path : str or list of str, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records.
+    meta : list of paths (str or list of str), default None
+        Fields to use as metadata for each record in resulting table.
+    meta_prefix : str, default None
+        String prepended to the name of every metadata column.
+    record_prefix : str, default None
+        String prepended to the name of every record column.
+    errors : {'raise', 'ignore'}, default 'raise'
+        Configures error handling.
+
+        * 'ignore' : will ignore KeyError if keys listed in meta are not
+          always present.
+        * 'raise' : will raise KeyError if keys listed in meta are not
+          always present.
+    sep : str, default '.'
+        Nested records will generate names separated by sep.
+        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
+    max_level : int, default None
+        Max number of levels(depth of dict) to normalize.
+        if None, normalizes all levels.
+
+    Returns
+    -------
+    DataFrame
+        The flattened table.
+
+    Raises
+    ------
+    KeyError
+        If a key of ``record_path`` is missing, or a key of ``meta`` is
+        missing and ``errors`` is not 'ignore'.
+    TypeError
+        If ``record_path`` leads to a value that is neither a list nor null.
+    ValueError
+        If a metadata column name (after ``meta_prefix``) is already a
+        column of the records.
+    NotImplementedError
+        If ``data`` is a string or is not iterable.
+
+    Examples
+    --------
+    >>> data = [
+    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
+    ...     {"name": {"given": "Mark", "family": "Regner"}},
+    ...     {"id": 2, "name": "Faye Raker"},
+    ... ]
+    >>> pd.json_normalize(data)
+        id name.first name.last name.given name.family        name
+    0  1.0     Coleen      Volk        NaN         NaN         NaN
+    1  NaN        NaN       NaN       Mark      Regner         NaN
+    2  2.0        NaN       NaN        NaN         NaN  Faye Raker
+
+    >>> data = [
+    ...     {
+    ...         "id": 1,
+    ...         "name": "Cole Volk",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
+    ...     {
+    ...         "id": 2,
+    ...         "name": "Faye Raker",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ... ]
+    >>> pd.json_normalize(data, max_level=0)
+        id        name                        fitness
+    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
+    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
+    2  2.0  Faye Raker  {'height': 130, 'weight': 60}
+
+    Normalizes nested data up to level 1.
+
+    >>> data = [
+    ...     {
+    ...         "id": 1,
+    ...         "name": "Cole Volk",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
+    ...     {
+    ...         "id": 2,
+    ...         "name": "Faye Raker",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ... ]
+    >>> pd.json_normalize(data, max_level=1)
+        id        name  fitness.height  fitness.weight
+    0  1.0   Cole Volk             130              60
+    1  NaN    Mark Reg             130              60
+    2  2.0  Faye Raker             130              60
+
+    >>> data = [
+    ...     {
+    ...         "state": "Florida",
+    ...         "shortname": "FL",
+    ...         "info": {"governor": "Rick Scott"},
+    ...         "counties": [
+    ...             {"name": "Dade", "population": 12345},
+    ...             {"name": "Broward", "population": 40000},
+    ...             {"name": "Palm Beach", "population": 60000},
+    ...         ],
+    ...     },
+    ...     {
+    ...         "state": "Ohio",
+    ...         "shortname": "OH",
+    ...         "info": {"governor": "John Kasich"},
+    ...         "counties": [
+    ...             {"name": "Summit", "population": 1234},
+    ...             {"name": "Cuyahoga", "population": 1337},
+    ...         ],
+    ...     },
+    ... ]
+    >>> result = pd.json_normalize(
+    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
+    ... )
+    >>> result
+             name  population    state shortname info.governor
+    0        Dade       12345   Florida    FL    Rick Scott
+    1     Broward       40000   Florida    FL    Rick Scott
+    2  Palm Beach       60000   Florida    FL    Rick Scott
+    3      Summit        1234   Ohio       OH    John Kasich
+    4    Cuyahoga        1337   Ohio       OH    John Kasich
+
+    >>> data = {"A": [1, 2]}
+    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
+        Prefix.0
+    0          1
+    1          2
+
+    Returns normalized data with columns prefixed with the given string.
+    """
+
+    def _pull_field(
+        js: dict[str, Any], spec: list | str, extract_record: bool = False
+    ) -> Scalar | Iterable:
+        """Internal function to pull field"""
+        result = js
+        try:
+            if isinstance(spec, list):
+                for field in spec:
+                    # A null in the middle of the path counts as a missing key.
+                    if result is None:
+                        raise KeyError(field)
+                    result = result[field]
+            else:
+                result = result[spec]
+        except KeyError as e:
+            # A missing record path is always an error, ``errors`` only
+            # applies to metadata.
+            if extract_record:
+                raise KeyError(
+                    f"Key {e} not found. If specifying a record_path, all elements of "
+                    f"data should have the path."
+                ) from e
+            if errors == "ignore":
+                return np.nan
+            else:
+                raise KeyError(
+                    f"Key {e} not found. To replace missing values of {e} with "
+                    f"np.nan, pass in errors='ignore'"
+                ) from e
+
+        return result
+
+    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
+        """
+        Internal function to pull field for records, and similar to
+        _pull_field, but require to return list. And will raise error
+        if has non iterable value.
+        """
+        result = _pull_field(js, spec, extract_record=True)
+
+        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
+        # null, otherwise return an empty list
+        if not isinstance(result, list):
+            if pd.isnull(result):
+                result = []
+            else:
+                raise TypeError(
+                    f"{js} has non list value {result} for path {spec}. "
+                    "Must be list or null."
+                )
+        return result
+
+    if isinstance(data, list) and not data:
+        return DataFrame()
+    elif isinstance(data, dict):
+        # A bit of a hackjob
+        data = [data]
+    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
+        # GH35923 Fix pd.json_normalize to not skip the first element of a
+        # generator input
+        data = list(data)
+    else:
+        raise NotImplementedError
+
+    # check to see if a simple recursive function is possible to
+    # improve performance (see #15621) but only for cases such
+    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
+    if (
+        record_path is None
+        and meta is None
+        and meta_prefix is None
+        and record_prefix is None
+        and max_level is None
+    ):
+        return DataFrame(_simple_json_normalize(data, sep=sep))
+
+    if record_path is None:
+        # Test the values themselves.  Wrapping the inner test in a list made
+        # ``any`` look at the truthiness of that list, which is True for every
+        # non-empty record, so flat data was always sent through the
+        # deep-copying nested_to_record pass.
+        if any(isinstance(x, dict) for y in data for x in y.values()):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
+            #
+            # TODO: handle record value which are lists, at least error
+            #       reasonably
+            data = nested_to_record(data, sep=sep, max_level=max_level)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    # Every metadata path as a list of keys.
+    _meta = [m if isinstance(m, list) else [m] for m in meta]
+
+    # Disastrously inefficient for now
+    records: list = []
+    # Number of records pulled from each object, used to repeat the metadata.
+    lengths = []
+
+    meta_vals: DefaultDict = defaultdict(list)
+    meta_keys = [sep.join(val) for val in _meta]
+
+    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
+        if isinstance(data, dict):
+            data = [data]
+        if len(path) > 1:
+            for obj in data:
+                # Remember metadata that lives on this level before going down.
+                for val, key in zip(_meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_records(obj, path[0])
+                recs = [
+                    nested_to_record(r, sep=sep, max_level=max_level)
+                    if isinstance(r, dict)
+                    else r
+                    for r in recs
+                ]
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+                for val, key in zip(_meta, meta_keys):
+                    if level + 1 > len(val):
+                        # Shallower than the records: collected on the way down.
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result = result.rename(columns=lambda x: f"{record_prefix}{x}")
+
+    # Data types, a problem
+    for k, v in meta_vals.items():
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError(
+                f"Conflicting metadata name {k}, need distinguishing prefix "
+            )
+        # GH 37782
+
+        values = np.array(v, dtype=object)
+
+        if values.ndim > 1:
+            # GH 37782
+            # Equal-length sequences were expanded into extra dimensions;
+            # rebuild a 1-D object array holding one metadata value per cell.
+            values = np.empty((len(v),), dtype=object)
+            for i, item in enumerate(v):
+                values[i] = item
+
+        result[k] = values.repeat(lengths)
+    return result