Updated script so it can be controlled by a Node.js web app
3 binary files not shown.
lib/python3.13/site-packages/pandas/core/methods/describe.py (Normal file, 416 lines added)
@@ -0,0 +1,416 @@
"""
Module responsible for execution of NDFrame.describe() method.

Method NDFrame.describe() delegates actual execution to function describe_ndframe().
"""
from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from typing import (
    TYPE_CHECKING,
    Callable,
    cast,
)

import numpy as np

from pandas._libs.tslibs import Timestamp
from pandas._typing import (
    DtypeObj,
    NDFrameT,
    npt,
)
from pandas.util._validators import validate_percentile

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_numeric_dtype,
)
from pandas.core.dtypes.dtypes import (
    ArrowDtype,
    DatetimeTZDtype,
    ExtensionDtype,
)

from pandas.core.arrays.floating import Float64Dtype
from pandas.core.reshape.concat import concat

from pandas.io.formats.format import format_percentiles

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Sequence,
    )

    from pandas import (
        DataFrame,
        Series,
    )


def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    percentiles = _refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim == 1:
        describer = SeriesDescriber(
            obj=cast("Series", obj),
        )
    else:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
        )

    result = describer.describe(percentiles=percentiles)
    return cast(NDFrameT, result)


class NDFrameDescriberAbstract(ABC):
    """Abstract class for describing dataframe or series.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to be described.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Do describe either series or dataframe.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """


class SeriesDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating series description."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        describe_func = select_describe_func(
            self.obj,
        )
        return describe_func(self.obj, percentiles)


class DataFrameDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating dataobj description.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        A white list of data types to include in the result.
    exclude : list-like of dtypes or None
        A black list of data types to omit from the result.
    """

    obj: DataFrame

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        if obj.ndim == 2 and obj.columns.size == 0:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        data = self._select_data()

        ldesc: list[Series] = []
        for _, series in data.items():
            describe_func = select_describe_func(series)
            ldesc.append(describe_func(series, percentiles))

        col_names = reorder_columns(ldesc)
        d = concat(
            [x.reindex(col_names, copy=False) for x in ldesc],
            axis=1,
            sort=False,
        )
        d.columns = data.columns.copy()
        return d

    def _select_data(self) -> DataFrame:
        """Select columns to be described."""
        if (self.include is None) and (self.exclude is None):
            # when some numerics are found, keep only numerics
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            data = self.obj.select_dtypes(include=default_include)
            if len(data.columns) == 0:
                data = self.obj
        elif self.include == "all":
            if self.exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            data = self.obj
        else:
            data = self.obj.select_dtypes(
                include=self.include,
                exclude=self.exclude,
            )
        return data


def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Set a convenient order for rows for display."""
    names: list[Hashable] = []
    seen_names: set[Hashable] = set()
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in seen_names:
                seen_names.add(name)
                names.append(name)
    return names


def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    # GH#48340 - always return float on non-complex numeric data
    dtype: DtypeObj | None
    if isinstance(series.dtype, ExtensionDtype):
        if isinstance(series.dtype, ArrowDtype):
            if series.dtype.kind == "m":
                # GH53001: describe timedeltas with object dtype
                dtype = None
            else:
                import pyarrow as pa

                dtype = ArrowDtype(pa.float64())
        else:
            dtype = Float64Dtype()
    elif series.dtype.kind in "iufb":
        # i.e. numeric but exclude complex dtype
        dtype = np.dtype("float")
    else:
        dtype = None
    return Series(d, index=stat_index, name=series.name, dtype=dtype)


def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    from pandas import Series

    return Series(result, index=names, name=data.name, dtype=dtype)


def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result: list[float | Timestamp] = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        asint = data.dropna().values.view("i8")
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        result += [
            top,
            freq,
            Timestamp(asint.min(), tz=tz),
            Timestamp(asint.max(), tz=tz),
        ]

    # If the DataFrame is empty, set 'top' and 'freq' to None
    # to maintain output shape consistency
    else:
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    from pandas import Series

    return Series(result, index=names, name=data.name, dtype=dtype)


def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = (
        [data.count(), data.mean(), data.min()]
        + data.quantile(percentiles).tolist()
        + [data.max()]
    )
    return Series(d, index=stat_index, name=data.name)


def select_describe_func(
    data: Series,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.
    """
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d
    elif is_numeric_dtype(data):
        return describe_numeric_1d
    elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    elif data.dtype.kind == "m":
        return describe_numeric_1d
    else:
        return describe_categorical_1d


def _refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> npt.NDArray[np.float64]:
    """
    Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # explicit conversion of `percentiles` to list
    percentiles = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(percentiles)

    # median should always be included
    if 0.5 not in percentiles:
        percentiles.append(0.5)

    percentiles = np.asarray(percentiles)

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts
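For reference, the module above backs the public describe() API rather than being called directly; a minimal sketch of how it is reached (illustrative values):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "z"]})

# Default: numeric columns only; describe_ndframe() adds the median even
# when it is not among the requested percentiles.
print(df.describe(percentiles=[0.1, 0.9]))

# include="all" routes each column through select_describe_func(), so "b"
# gets count/unique/top/freq from describe_categorical_1d().
print(df.describe(include="all"))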
lib/python3.13/site-packages/pandas/core/methods/selectn.py (Normal file, 269 lines added)
@@ -0,0 +1,269 @@
"""
Implementation of nlargest and nsmallest.
"""

from __future__ import annotations

from collections.abc import (
    Hashable,
    Sequence,
)
from typing import (
    TYPE_CHECKING,
    cast,
    final,
)

import numpy as np

from pandas._libs import algos as libalgos

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_complex_dtype,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import BaseMaskedDtype

if TYPE_CHECKING:
    from pandas._typing import (
        DtypeObj,
        IndexLabel,
    )

    from pandas import (
        DataFrame,
        Series,
    )


class SelectN:
    def __init__(self, obj, n: int, keep: str) -> None:
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')

    def compute(self, method: str) -> DataFrame | Series:
        raise NotImplementedError

    @final
    def nlargest(self):
        return self.compute("nlargest")

    @final
    def nsmallest(self):
        return self.compute("nsmallest")

    @final
    @staticmethod
    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        if is_numeric_dtype(dtype):
            return not is_complex_dtype(dtype)
        return needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method: str) -> Series:
        from pandas.core.reshape.concat import concat

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()
        nan_index = self.obj.drop(dropped.index)

        # slow method
        if n >= len(self.obj):
            ascending = method == "nsmallest"
            return self.obj.sort_values(ascending=ascending).head(n)

        # fast method
        new_dtype = dropped.dtype

        # Similar to algorithms._ensure_data
        arr = dropped._values
        if needs_i8_conversion(arr.dtype):
            arr = arr.view("i8")
        elif isinstance(arr.dtype, BaseMaskedDtype):
            arr = arr._data
        else:
            arr = np.asarray(arr)
            if arr.dtype.kind == "b":
                arr = arr.view(np.uint8)

        if method == "nlargest":
            arr = -arr
            if is_integer_dtype(new_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(new_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            arr = arr[::-1]

        nbase = n
        narr = len(arr)
        n = min(n, narr)

        # arr passed into kth_smallest must be contiguous. We copy
        # here because kth_smallest will modify its input
        # avoid OOB access with kth_smallest_c when n <= 0
        if len(arr) > 0:
            kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
        else:
            kth_val = np.nan
        (ns,) = np.nonzero(arr <= kth_val)
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]
            findex = nbase
        else:
            if len(inds) < nbase <= len(nan_index) + len(inds):
                findex = len(nan_index) + len(inds)
            else:
                findex = len(inds)

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]


class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
        super().__init__(obj, n, keep)
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]

        columns = cast(Sequence[Hashable], columns)
        columns = list(columns)
        self.columns = columns

    def compute(self, method: str) -> DataFrame:
        from pandas.core.api import Index

        n = self.n
        frame = self.obj
        columns = self.columns

        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Index([], dtype=np.int64)

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
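For reference, SelectNSeries and SelectNFrame sit behind the public nlargest/nsmallest methods; a minimal sketch (illustrative values):

import pandas as pd

s = pd.Series([5, 1, 5, 3])
# keep="first" (default) truncates ties; keep="all" returns every row tied
# at the boundary value, as handled in SelectNSeries.compute().
print(s.nlargest(2))
print(s.nlargest(2, keep="all"))

df = pd.DataFrame({"a": [1, 1, 2], "b": [9, 8, 7]})
# Ties in "a" are resolved by looking at the next column, as in
# SelectNFrame.compute().
print(df.nsmallest(2, ["a", "b"]))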
lib/python3.13/site-packages/pandas/core/methods/to_dict.py (Normal file, 272 lines added)
@@ -0,0 +1,272 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Literal,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_box_native
from pandas.core.dtypes.dtypes import (
    BaseMaskedDtype,
    ExtensionDtype,
)

from pandas.core import common as com

if TYPE_CHECKING:
    from pandas._typing import MutableMappingT

    from pandas import DataFrame


@overload
def to_dict(
    df: DataFrame,
    orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
    *,
    into: type[MutableMappingT] | MutableMappingT,
    index: bool = ...,
) -> MutableMappingT:
    ...


@overload
def to_dict(
    df: DataFrame,
    orient: Literal["records"],
    *,
    into: type[MutableMappingT] | MutableMappingT,
    index: bool = ...,
) -> list[MutableMappingT]:
    ...


@overload
def to_dict(
    df: DataFrame,
    orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
    *,
    into: type[dict] = ...,
    index: bool = ...,
) -> dict:
    ...


@overload
def to_dict(
    df: DataFrame,
    orient: Literal["records"],
    *,
    into: type[dict] = ...,
    index: bool = ...,
) -> list[dict]:
    ...


# error: Incompatible default for argument "into" (default has type "type[dict
# [Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
def to_dict(
    df: DataFrame,
    orient: Literal[
        "dict", "list", "series", "split", "tight", "records", "index"
    ] = "dict",
    *,
    into: type[MutableMappingT] | MutableMappingT = dict,  # type: ignore[assignment]
    index: bool = True,
) -> MutableMappingT | list[MutableMappingT]:
    """
    Convert the DataFrame to a dictionary.

    The type of the key-value pairs can be customized with the parameters
    (see below).

    Parameters
    ----------
    orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
        Determines the type of the values of the dictionary.

        - 'dict' (default) : dict like {column -> {index -> value}}
        - 'list' : dict like {column -> [values]}
        - 'series' : dict like {column -> Series(values)}
        - 'split' : dict like
          {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
        - 'tight' : dict like
          {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
          'index_names' -> [index.names], 'column_names' -> [column.names]}
        - 'records' : list like
          [{column -> value}, ... , {column -> value}]
        - 'index' : dict like {index -> {column -> value}}

        .. versionadded:: 1.4.0
            'tight' as an allowed value for the ``orient`` argument

    into : class, default dict
        The collections.abc.MutableMapping subclass used for all Mappings
        in the return value. Can be the actual class or an empty
        instance of the mapping type you want. If you want a
        collections.defaultdict, you must pass it initialized.

    index : bool, default True
        Whether to include the index item (and index_names item if `orient`
        is 'tight') in the returned dictionary. Can only be ``False``
        when `orient` is 'split' or 'tight'.

        .. versionadded:: 2.0.0

    Returns
    -------
    dict, list or collections.abc.Mapping
        Return a collections.abc.MutableMapping object representing the
        DataFrame. The resulting transformation depends on the `orient` parameter.
    """
    if not df.columns.is_unique:
        warnings.warn(
            "DataFrame columns are not unique, some columns will be omitted.",
            UserWarning,
            stacklevel=find_stack_level(),
        )
    # GH16122
    into_c = com.standardize_mapping(into)

    # error: Incompatible types in assignment (expression has type "str",
    # variable has type "Literal['dict', 'list', 'series', 'split', 'tight',
    # 'records', 'index']")
    orient = orient.lower()  # type: ignore[assignment]

    if not index and orient not in ["split", "tight"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'tight'"
        )

    if orient == "series":
        # GH46470 Return quickly if orient series to avoid creating dtype objects
        return into_c((k, v) for k, v in df.items())

    box_native_indices = [
        i
        for i, col_dtype in enumerate(df.dtypes.values)
        if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
    ]
    box_na_values = [
        lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
        for i, col_dtype in enumerate(df.dtypes.values)
    ]
    are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

    if orient == "dict":
        return into_c((k, v.to_dict(into=into)) for k, v in df.items())

    elif orient == "list":
        object_dtype_indices_as_set: set[int] = set(box_native_indices)
        return into_c(
            (
                k,
                list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i])))
                if i in object_dtype_indices_as_set
                else list(map(maybe_box_native, v.to_numpy())),
            )
            for i, (k, v) in enumerate(df.items())
        )

    elif orient == "split":
        data = df._create_data_for_split_and_tight_to_dict(
            are_all_object_dtype_cols, box_native_indices
        )

        return into_c(
            ((("index", df.index.tolist()),) if index else ())
            + (
                ("columns", df.columns.tolist()),
                ("data", data),
            )
        )

    elif orient == "tight":
        data = df._create_data_for_split_and_tight_to_dict(
            are_all_object_dtype_cols, box_native_indices
        )

        return into_c(
            ((("index", df.index.tolist()),) if index else ())
            + (
                ("columns", df.columns.tolist()),
                (
                    "data",
                    [
                        list(map(maybe_box_native, t))
                        for t in df.itertuples(index=False, name=None)
                    ],
                ),
            )
            + ((("index_names", list(df.index.names)),) if index else ())
            + (("column_names", list(df.columns.names)),)
        )

    elif orient == "records":
        columns = df.columns.tolist()
        if are_all_object_dtype_cols:
            rows = (
                dict(zip(columns, row)) for row in df.itertuples(index=False, name=None)
            )
            return [
                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
            ]
        else:
            data = [
                into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None)
            ]
            if box_native_indices:
                object_dtype_indices_as_set = set(box_native_indices)
                object_dtype_cols = {
                    col
                    for i, col in enumerate(df.columns)
                    if i in object_dtype_indices_as_set
                }
                for row in data:
                    for col in object_dtype_cols:
                        row[col] = maybe_box_native(row[col])
            return data

    elif orient == "index":
        if not df.index.is_unique:
            raise ValueError("DataFrame index must be unique for orient='index'.")
        columns = df.columns.tolist()
        if are_all_object_dtype_cols:
            return into_c(
                (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:]))))
                for t in df.itertuples(name=None)
            )
        elif box_native_indices:
            object_dtype_indices_as_set = set(box_native_indices)
            is_object_dtype_by_index = [
                i in object_dtype_indices_as_set for i in range(len(df.columns))
            ]
            return into_c(
                (
                    t[0],
                    {
                        columns[i]: maybe_box_native(v)
                        if is_object_dtype_by_index[i]
                        else v
                        for i, v in enumerate(t[1:])
                    },
                )
                for t in df.itertuples(name=None)
            )
        else:
            return into_c(
                (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None)
            )

    else:
        raise ValueError(f"orient '{orient}' not understood")
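For reference, the function above is reached through DataFrame.to_dict(); a short sketch of the main orients (illustrative values):

from collections import OrderedDict

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["r1", "r2"])

print(df.to_dict())                  # {'a': {'r1': 1, 'r2': 2}, ...}
print(df.to_dict(orient="records"))  # [{'a': 1, 'b': 0.5}, ...]
print(df.to_dict(orient="tight", index=False))  # index=False: split/tight only
print(df.to_dict(into=OrderedDict))  # any MutableMapping subclass works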