Updated script that can be controlled by the Node.js web app
@@ -0,0 +1,9 @@
from pandas.io.parsers.readers import (
    TextFileReader,
    TextParser,
    read_csv,
    read_fwf,
    read_table,
)

__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]
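
This `__init__.py` only re-exports the parser entry points from `pandas.io.parsers.readers`. A minimal sketch of how they are normally reached through the top-level pandas namespace (file names are illustrative):

import pandas as pd

# read_csv / read_table / read_fwf are the same callables re-exported above
df = pd.read_csv("data.csv")                      # comma-separated
tbl = pd.read_table("data.tsv")                   # tab-separated by default
fixed = pd.read_fwf("data.txt", widths=[10, 5])   # fixed-width fields

# TextFileReader is what read_csv returns when iterating in chunks
with pd.read_csv("data.csv", chunksize=1000) as reader:
    for chunk in reader:
        print(chunk.shape)
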
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,303 @@
from __future__ import annotations

from typing import TYPE_CHECKING
import warnings

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_integer

import pandas as pd
from pandas import DataFrame

from pandas.io._util import (
    _arrow_dtype_mapping,
    arrow_string_types_mapper,
)
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
    from pandas._typing import ReadBuffer


class ArrowParserWrapper(ParserBase):
    """
    Wrapper for the pyarrow engine for read_csv()
    """

    def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
        super().__init__(kwds)
        self.kwds = kwds
        self.src = src

        self._parse_kwds()

    def _parse_kwds(self) -> None:
        """
        Validates keywords before passing to pyarrow.
        """
        encoding: str | None = self.kwds.get("encoding")
        self.encoding = "utf-8" if encoding is None else encoding

        na_values = self.kwds["na_values"]
        if isinstance(na_values, dict):
            raise ValueError(
                "The pyarrow engine doesn't support passing a dict for na_values"
            )
        self.na_values = list(self.kwds["na_values"])

    def _get_pyarrow_options(self) -> None:
        """
        Rename some arguments to pass to pyarrow
        """
        mapping = {
            "usecols": "include_columns",
            "na_values": "null_values",
            "escapechar": "escape_char",
            "skip_blank_lines": "ignore_empty_lines",
            "decimal": "decimal_point",
            "quotechar": "quote_char",
        }
        for pandas_name, pyarrow_name in mapping.items():
            if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
                self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)

        # Date format handling
        # If we get a string, we need to convert it into a list for pyarrow
        # If we get a dict, we want to parse those separately
        date_format = self.date_format
        if isinstance(date_format, str):
            date_format = [date_format]
        else:
            # In case of dict, we don't want to propagate through, so
            # just set to pyarrow default of None

            # Ideally, in future we disable pyarrow dtype inference (read in as string)
            # to prevent misreads.
            date_format = None
        self.kwds["timestamp_parsers"] = date_format

        self.parse_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None
            and option_name
            in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
        }

        on_bad_lines = self.kwds.get("on_bad_lines")
        if on_bad_lines is not None:
            if callable(on_bad_lines):
                self.parse_options["invalid_row_handler"] = on_bad_lines
            elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
                self.parse_options[
                    "invalid_row_handler"
                ] = None  # PyArrow raises an exception by default
            elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:

                def handle_warning(invalid_row) -> str:
                    warnings.warn(
                        f"Expected {invalid_row.expected_columns} columns, but found "
                        f"{invalid_row.actual_columns}: {invalid_row.text}",
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )
                    return "skip"

                self.parse_options["invalid_row_handler"] = handle_warning
            elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
                self.parse_options["invalid_row_handler"] = lambda _: "skip"

        self.convert_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None
            and option_name
            in (
                "include_columns",
                "null_values",
                "true_values",
                "false_values",
                "decimal_point",
                "timestamp_parsers",
            )
        }
        self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
        # autogenerated column names are prefixed with 'f' in pyarrow.csv
        if self.header is None and "include_columns" in self.convert_options:
            self.convert_options["include_columns"] = [
                f"f{n}" for n in self.convert_options["include_columns"]
            ]

        self.read_options = {
            "autogenerate_column_names": self.header is None,
            "skip_rows": self.header
            if self.header is not None
            else self.kwds["skiprows"],
            "encoding": self.encoding,
        }
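
A minimal sketch of where the renamed options above land on the pyarrow side, assuming pyarrow is installed; the path and option values are illustrative:

from pyarrow import csv as pyarrow_csv

# pandas kwarg           -> pyarrow.csv option (as in `mapping` above)
# usecols                 -> ConvertOptions(include_columns=...)
# na_values               -> ConvertOptions(null_values=...)
# escapechar / quotechar  -> ParseOptions(escape_char=..., quote_char=...)
# skip_blank_lines        -> ParseOptions(ignore_empty_lines=...)
# decimal                 -> ConvertOptions(decimal_point=...)
table = pyarrow_csv.read_csv(
    "data.csv",
    read_options=pyarrow_csv.ReadOptions(
        autogenerate_column_names=False, skip_rows=0, encoding="utf-8"
    ),
    parse_options=pyarrow_csv.ParseOptions(
        delimiter=",", quote_char='"', ignore_empty_lines=True
    ),
    convert_options=pyarrow_csv.ConvertOptions(
        null_values=["", "NA"], strings_can_be_null=True
    ),
)
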

    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.header is None:
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names(they will get
                # removed anyways) to expected length then.
                self.names = list(range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        _, frame = self._do_date_conversions(frame.columns, frame)
        if self.index_col is not None:
            index_to_set = self.index_col.copy()
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    index_to_set[i] = frame.columns[item]
                # String case
                elif item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")

                # Process dtype for index_col and drop from dtypes
                if self.dtype is not None:
                    key, new_dtype = (
                        (item, self.dtype.get(item))
                        if self.dtype.get(item) is not None
                        else (frame.columns[item], self.dtype.get(frame.columns[item]))
                    )
                    if new_dtype is not None:
                        frame[key] = frame[key].astype(new_dtype)
                        del self.dtype[key]

            frame.set_index(index_to_set, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.dtype is not None:
            # Ignore non-existent columns from dtype mapping
            # like other parsers do
            if isinstance(self.dtype, dict):
                self.dtype = {
                    k: pandas_dtype(v)
                    for k, v in self.dtype.items()
                    if k in frame.columns
                }
            else:
                self.dtype = pandas_dtype(self.dtype)
            try:
                frame = frame.astype(self.dtype)
            except TypeError as e:
                # GH#44901 reraise to keep api consistent
                raise ValueError(e)
        return frame
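
The post-processing above amounts to applying index_col, dtype, and date conversion after pyarrow has produced the frame. A rough equivalent on a plain DataFrame, with illustrative column names:

import pandas as pd

frame = pd.DataFrame({"id": [1, 2], "value": [3, 4]})

# index_col=["id"], dtype={"value": "Int64"} roughly reduce to:
frame["value"] = frame["value"].astype("Int64")
frame.set_index(["id"], drop=True, inplace=True)
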

    def _validate_usecols(self, usecols) -> None:
        if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
            raise ValueError(
                "The pyarrow engine does not allow 'usecols' to be integer "
                "column positions. Pass a list of string column names instead."
            )
        elif callable(usecols):
            raise ValueError(
                "The pyarrow engine does not allow 'usecols' to be a callable."
            )

    def read(self) -> DataFrame:
        """
        Reads the contents of a CSV file into a DataFrame and
        processes it according to the kwargs passed in the
        constructor.

        Returns
        -------
        DataFrame
            The DataFrame created from the CSV file.
        """
        pa = import_optional_dependency("pyarrow")
        pyarrow_csv = import_optional_dependency("pyarrow.csv")
        self._get_pyarrow_options()

        try:
            convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
        except TypeError:
            include = self.convert_options.get("include_columns", None)
            if include is not None:
                self._validate_usecols(include)

            nulls = self.convert_options.get("null_values", set())
            if not lib.is_list_like(nulls) or not all(
                isinstance(x, str) for x in nulls
            ):
                raise TypeError(
                    "The 'pyarrow' engine requires all na_values to be strings"
                )

            raise

        try:
            table = pyarrow_csv.read_csv(
                self.src,
                read_options=pyarrow_csv.ReadOptions(**self.read_options),
                parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
                convert_options=convert_options,
            )
        except pa.ArrowInvalid as e:
            raise ParserError(e) from e

        dtype_backend = self.kwds["dtype_backend"]

        # Convert all pa.null() cols -> float64 (non nullable)
        # else Int64 (nullable case, see below)
        if dtype_backend is lib.no_default:
            new_schema = table.schema
            new_type = pa.float64()
            for i, arrow_type in enumerate(table.schema.types):
                if pa.types.is_null(arrow_type):
                    new_schema = new_schema.set(
                        i, new_schema.field(i).with_type(new_type)
                    )

            table = table.cast(new_schema)

        if dtype_backend == "pyarrow":
            frame = table.to_pandas(types_mapper=pd.ArrowDtype)
        elif dtype_backend == "numpy_nullable":
            # Modify the default mapping to also
            # map null to Int64 (to match other engines)
            dtype_mapping = _arrow_dtype_mapping()
            dtype_mapping[pa.null()] = pd.Int64Dtype()
            frame = table.to_pandas(types_mapper=dtype_mapping.get)
        elif using_pyarrow_string_dtype():
            frame = table.to_pandas(types_mapper=arrow_string_types_mapper())

        else:
            frame = table.to_pandas()
        return self._finalize_pandas_output(frame)
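
This wrapper is what read_csv dispatches to when the pyarrow engine is requested. A minimal usage sketch (file name illustrative; pyarrow must be installed):

import pandas as pd

df = pd.read_csv("data.csv", engine="pyarrow")

# dtype_backend selects the to_pandas() branch at the end of read():
df_arrow = pd.read_csv("data.csv", engine="pyarrow", dtype_backend="pyarrow")
df_nullable = pd.read_csv("data.csv", engine="pyarrow", dtype_backend="numpy_nullable")
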
lib/python3.13/site-packages/pandas/io/parsers/base_parser.py (new file, 1448 lines)
File diff suppressed because it is too large
@@ -0,0 +1,410 @@
from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import DtypeWarning
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.concat import (
    concat_compat,
    union_categoricals,
)
from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core.indexes.api import ensure_index_from_sequences

from pandas.io.common import (
    dedup_names,
    is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
    ParserBase,
    ParserError,
    is_index_col,
)

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        DtypeArg,
        DtypeObj,
        ReadCsvBuffer,
    )

    from pandas import (
        Index,
        MultiIndex,
    )


class CParserWrapper(ParserBase):
    low_memory: bool
    _reader: parsers.TextReader

    def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
        super().__init__(kwds)
        self.kwds = kwds
        kwds = kwds.copy()

        self.low_memory = kwds.pop("low_memory", False)

        # #2442
        # error: Cannot determine type of 'index_col'
        kwds["allow_leading_cols"] = (
            self.index_col is not False  # type: ignore[has-type]
        )

        # GH20529, validate usecol arg before TextReader
        kwds["usecols"] = self.usecols

        # Have to pass int, would break tests using TextReader directly otherwise :(
        kwds["on_bad_lines"] = self.on_bad_lines.value

        for key in (
            "storage_options",
            "encoding",
            "memory_map",
            "compression",
        ):
            kwds.pop(key, None)

        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
        if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
            kwds["dtype_backend"] = "numpy"
        if kwds["dtype_backend"] == "pyarrow":
            # Fail here loudly instead of in cython after reading
            import_optional_dependency("pyarrow")
        self._reader = parsers.TextReader(src, **kwds)

        self.unnamed_cols = self._reader.unnamed_cols

        # error: Cannot determine type of 'names'
        passed_names = self.names is None  # type: ignore[has-type]

        if self._reader.header is None:
            self.names = None
        else:
            # error: Cannot determine type of 'names'
            # error: Cannot determine type of 'index_names'
            (
                self.names,  # type: ignore[has-type]
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,  # type: ignore[has-type]
                passed_names,
            )

        # error: Cannot determine type of 'names'
        if self.names is None:  # type: ignore[has-type]
            self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        # error: Cannot determine type of 'names'
        self.orig_names = self.names[:]  # type: ignore[has-type]

        if self.usecols:
            usecols = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            # error: Cannot determine type of 'names'
            if len(self.names) > len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    n
                    # error: Cannot determine type of 'names'
                    for i, n in enumerate(self.names)  # type: ignore[has-type]
                    if (i in usecols or n in usecols)
                ]

            # error: Cannot determine type of 'names'
            if len(self.names) < len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self._validate_usecols_names(
                    usecols,
                    self.names,  # type: ignore[has-type]
                )

        # error: Cannot determine type of 'names'
        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
        self._set_noconvert_columns()

        # error: Cannot determine type of 'names'
        self.orig_names = self.names  # type: ignore[has-type]

        if not self._has_complex_date_col:
            # error: Cannot determine type of 'index_col'
            if self._reader.leading_cols == 0 and is_index_col(
                self.index_col  # type: ignore[has-type]
            ):
                self._name_processed = True
                (
                    index_names,
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    self.index_col,
                ) = self._clean_index_names(
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    # error: Cannot determine type of 'index_col'
                    self.index_col,  # type: ignore[has-type]
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0

    def close(self) -> None:
        # close handles opened by C parser
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self) -> None:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        assert self.orig_names is not None
        # error: Cannot determine type of 'names'

        # much faster than using orig_names.index(x) xref GH#44106
        names_dict = {x: i for i, x in enumerate(self.orig_names)}
        col_indices = [names_dict[x] for x in self.names]  # type: ignore[has-type]
        # error: Cannot determine type of 'names'
        noconvert_columns = self._set_noconvert_dtype_columns(
            col_indices,
            self.names,  # type: ignore[has-type]
        )
        for col in noconvert_columns:
            self._reader.set_noconvert(col)

    def read(
        self,
        nrows: int | None = None,
    ) -> tuple[
        Index | MultiIndex | None,
        Sequence[Hashable] | MultiIndex,
        Mapping[Hashable, ArrayLike],
    ]:
        index: Index | MultiIndex | None
        column_names: Sequence[Hashable] | MultiIndex
        try:
            if self.low_memory:
                chunks = self._reader.read_low_memory(nrows)
                # destructive to chunks
                data = _concatenate_chunks(chunks)

            else:
                data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                names = dedup_names(
                    self.orig_names,
                    is_potential_multi_index(self.orig_names, self.index_col),
                )
                index, columns, col_dict = self._get_empty_meta(
                    names,
                    dtype=self.dtype,
                )
                columns = self._maybe_make_multi_index_columns(columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = {k: v for k, v in col_dict.items() if k in columns}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        # error: Cannot determine type of 'names'
        names = self.names  # type: ignore[has-type]

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError("file structure not yet supported")

            # implicit index, no index names
            arrays = []

            if self.index_col and self._reader.leading_cols != len(self.index_col):
                raise ParserError(
                    "Could not construct index. Requested to use "
                    f"{len(self.index_col)} number of columns, but "
                    f"{self._reader.leading_cols} left to parse."
                )

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            # rename dict keys
            data_tups = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data_tups)}

            column_names, date_data = self._do_date_conversions(names, data)

            # maybe create a mi on the columns
            column_names = self._maybe_make_multi_index_columns(
                column_names, self.col_names
            )

        else:
            # rename dict keys
            data_tups = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data_tups]
            if self.usecols is None:
                self._check_data_length(names, alldata)

            data = {k: v for k, (i, v) in zip(names, data_tups)}

            names, date_data = self._do_date_conversions(names, data)
            index, column_names = self._make_index(date_data, alldata, names)

        return index, column_names, date_data

    def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
        # hackish
        usecols = self._evaluate_usecols(self.usecols, names)
        if usecols is not None and len(names) != len(usecols):
            names = [
                name for i, name in enumerate(names) if i in usecols or name in usecols
            ]
        return names

    def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
        if try_parse_dates and self._should_parse_dates(index):
            values = self._date_conv(
                values,
                col=self.index_names[index] if self.index_names is not None else None,
            )
        return values


def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
    """
    Concatenate chunks of data read with low_memory=True.

    The tricky part is handling Categoricals, where different chunks
    may have different inferred categories.
    """
    names = list(chunks[0].keys())
    warning_columns = []

    result: dict = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)}

        dtype = dtypes.pop()
        if isinstance(dtype, CategoricalDtype):
            result[name] = union_categoricals(arrs, sort_categories=False)
        else:
            result[name] = concat_compat(arrs)
            if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
                warning_columns.append(str(name))

    if warning_columns:
        warning_names = ",".join(warning_columns)
        warning_message = " ".join(
            [
                f"Columns ({warning_names}) have mixed types. "
                f"Specify dtype option on import or set low_memory=False."
            ]
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
    return result
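
The DtypeWarning assembled above is the familiar "Columns (...) have mixed types" message, and the two remedies it names map directly onto read_csv arguments. A minimal sketch (path and column name illustrative):

import pandas as pd

# either pin the dtype of the offending column up front...
df = pd.read_csv("big.csv", dtype={"mixed_col": "string"})

# ...or read the file in one pass so a single dtype is inferred per column
df = pd.read_csv("big.csv", low_memory=False)
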


def ensure_dtype_objs(
    dtype: DtypeArg | dict[Hashable, DtypeArg] | None
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
    """
    Ensure we have either None, a dtype object, or a dictionary mapping to
    dtype objects.
    """
    if isinstance(dtype, defaultdict):
        # "None" not callable  [misc]
        default_dtype = pandas_dtype(dtype.default_factory())  # type: ignore[misc]
        dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
        for key in dtype.keys():
            dtype_converted[key] = pandas_dtype(dtype[key])
        return dtype_converted
    elif isinstance(dtype, dict):
        return {k: pandas_dtype(dtype[k]) for k in dtype}
    elif dtype is not None:
        return pandas_dtype(dtype)
    return dtype
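
ensure_dtype_objs leans entirely on pandas_dtype to turn user-supplied dtype arguments into dtype objects. A minimal sketch of the same normalization using the public pandas.api.types.pandas_dtype (values illustrative):

from collections import defaultdict

from pandas.api.types import pandas_dtype

# plain dict: every value becomes a proper dtype object
spec = {"a": "int64", "b": "category"}
normalized = {k: pandas_dtype(v) for k, v in spec.items()}

# defaultdict: the default factory's dtype is normalized too, so columns not
# listed explicitly fall back to it (here, float64)
spec_dd = defaultdict(lambda: "float64", {"a": "int64"})
default = pandas_dtype(spec_dd.default_factory())
normalized_dd = defaultdict(
    lambda: default, {k: pandas_dtype(v) for k, v in spec_dd.items()}
)
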
lib/python3.13/site-packages/pandas/io/parsers/python_parser.py (new file, 1387 lines)
File diff suppressed because it is too large
lib/python3.13/site-packages/pandas/io/parsers/readers.py (new file, 2383 lines)
File diff suppressed because it is too large