Updated script so that it can be controlled by a Node.js web app
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
lib/python3.13/site-packages/pandas/core/reshape/api.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from pandas.core.reshape.concat import concat
from pandas.core.reshape.encoding import (
    from_dummies,
    get_dummies,
)
from pandas.core.reshape.melt import (
    lreshape,
    melt,
    wide_to_long,
)
from pandas.core.reshape.merge import (
    merge,
    merge_asof,
    merge_ordered,
)
from pandas.core.reshape.pivot import (
    crosstab,
    pivot,
    pivot_table,
)
from pandas.core.reshape.tile import (
    cut,
    qcut,
)

__all__ = [
    "concat",
    "crosstab",
    "cut",
    "from_dummies",
    "get_dummies",
    "lreshape",
    "melt",
    "merge",
    "merge_asof",
    "merge_ordered",
    "pivot",
    "pivot_table",
    "qcut",
    "wide_to_long",
]
lib/python3.13/site-packages/pandas/core/reshape/concat.py (new file, 894 lines)
@@ -0,0 +1,894 @@
|
||||
"""
|
||||
Concat routines.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Literal,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import using_copy_on_write
|
||||
|
||||
from pandas.util._decorators import cache_readonly
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool,
|
||||
is_iterator,
|
||||
)
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.core.arrays.categorical import (
|
||||
factorize_from_iterable,
|
||||
factorize_from_iterables,
|
||||
)
|
||||
import pandas.core.common as com
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
all_indexes_same,
|
||||
default_index,
|
||||
ensure_index,
|
||||
get_objs_combined_axis,
|
||||
get_unanimous_names,
|
||||
)
|
||||
from pandas.core.internals import concatenate_managers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
Axis,
|
||||
AxisInt,
|
||||
HashableT,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Concatenate DataFrame objects
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[DataFrame] | Mapping[HashableT, DataFrame],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series] | Mapping[HashableT, Series],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Literal[1, "columns"],
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Axis = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys: Iterable[Hashable] | None = ...,
|
||||
levels=...,
|
||||
names: list[HashableT] | None = ...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool | None = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
def concat(
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
*,
|
||||
axis: Axis = 0,
|
||||
join: str = "outer",
|
||||
ignore_index: bool = False,
|
||||
keys: Iterable[Hashable] | None = None,
|
||||
levels=None,
|
||||
names: list[HashableT] | None = None,
|
||||
verify_integrity: bool = False,
|
||||
sort: bool = False,
|
||||
copy: bool | None = None,
|
||||
) -> DataFrame | Series:
|
||||
"""
|
||||
Concatenate pandas objects along a particular axis.
|
||||
|
||||
Allows optional set logic along the other axes.
|
||||
|
||||
Can also add a layer of hierarchical indexing on the concatenation axis,
|
||||
which may be useful if the labels are the same (or overlapping) on
|
||||
the passed axis number.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
objs : a sequence or mapping of Series or DataFrame objects
|
||||
If a mapping is passed, the sorted keys will be used as the `keys`
|
||||
argument, unless it is passed, in which case the values will be
|
||||
selected (see below). Any None objects will be dropped silently unless
|
||||
they are all None in which case a ValueError will be raised.
|
||||
axis : {0/'index', 1/'columns'}, default 0
|
||||
The axis to concatenate along.
|
||||
join : {'inner', 'outer'}, default 'outer'
|
||||
How to handle indexes on other axis (or axes).
|
||||
ignore_index : bool, default False
|
||||
If True, do not use the index values along the concatenation axis. The
|
||||
resulting axis will be labeled 0, ..., n - 1. This is useful if you are
|
||||
concatenating objects where the concatenation axis does not have
|
||||
meaningful indexing information. Note the index values on the other
|
||||
axes are still respected in the join.
|
||||
keys : sequence, default None
|
||||
If multiple levels passed, should contain tuples. Construct
|
||||
hierarchical index using the passed keys as the outermost level.
|
||||
levels : list of sequences, default None
|
||||
Specific levels (unique values) to use for constructing a
|
||||
MultiIndex. Otherwise they will be inferred from the keys.
|
||||
names : list, default None
|
||||
Names for the levels in the resulting hierarchical index.
|
||||
verify_integrity : bool, default False
|
||||
Check whether the new concatenated axis contains duplicates. This can
|
||||
be very expensive relative to the actual data concatenation.
|
||||
sort : bool, default False
|
||||
Sort non-concatenation axis if it is not already aligned. One exception to
|
||||
this is when the non-concatenation axis is a DatetimeIndex and join='outer'
|
||||
and the axis is not already aligned. In that case, the non-concatenation
|
||||
axis is always sorted lexicographically.
|
||||
copy : bool, default True
|
||||
If False, do not copy data unnecessarily.
|
||||
|
||||
Returns
|
||||
-------
|
||||
object, type of objs
|
||||
When concatenating all ``Series`` along the index (axis=0), a
|
||||
``Series`` is returned. When ``objs`` contains at least one
|
||||
``DataFrame``, a ``DataFrame`` is returned. When concatenating along
|
||||
the columns (axis=1), a ``DataFrame`` is returned.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.join : Join DataFrames using indexes.
|
||||
DataFrame.merge : Merge DataFrames by indexes or columns.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The keys, levels, and names arguments are all optional.
|
||||
|
||||
A walkthrough of how this method fits in with other tools for combining
|
||||
pandas objects can be found `here
|
||||
<https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
|
||||
|
||||
It is not recommended to build DataFrames by adding single rows in a
|
||||
for loop. Build a list of rows and make a DataFrame in a single concat.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Combine two ``Series``.
|
||||
|
||||
>>> s1 = pd.Series(['a', 'b'])
|
||||
>>> s2 = pd.Series(['c', 'd'])
|
||||
>>> pd.concat([s1, s2])
|
||||
0 a
|
||||
1 b
|
||||
0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Clear the existing index and reset it in the result
|
||||
by setting the ``ignore_index`` option to ``True``.
|
||||
|
||||
>>> pd.concat([s1, s2], ignore_index=True)
|
||||
0 a
|
||||
1 b
|
||||
2 c
|
||||
3 d
|
||||
dtype: object
|
||||
|
||||
Add a hierarchical index at the outermost level of
|
||||
the data with the ``keys`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'])
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Label the index keys you create with the ``names`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'],
|
||||
... names=['Series name', 'Row ID'])
|
||||
Series name Row ID
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Combine two ``DataFrame`` objects with identical columns.
|
||||
|
||||
>>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df1
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
>>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df2
|
||||
letter number
|
||||
0 c 3
|
||||
1 d 4
|
||||
>>> pd.concat([df1, df2])
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return everything. Columns outside the intersection will
|
||||
be filled with ``NaN`` values.
|
||||
|
||||
>>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
|
||||
... columns=['letter', 'number', 'animal'])
|
||||
>>> df3
|
||||
letter number animal
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
>>> pd.concat([df1, df3], sort=False)
|
||||
letter number animal
|
||||
0 a 1 NaN
|
||||
1 b 2 NaN
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return only those that are shared by passing ``inner`` to
|
||||
the ``join`` keyword argument.
|
||||
|
||||
>>> pd.concat([df1, df3], join="inner")
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects horizontally along the x axis by
|
||||
passing in ``axis=1``.
|
||||
|
||||
>>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
|
||||
... columns=['animal', 'name'])
|
||||
>>> pd.concat([df1, df4], axis=1)
|
||||
letter number animal name
|
||||
0 a 1 bird polly
|
||||
1 b 2 monkey george
|
||||
|
||||
Prevent the result from including duplicate index values with the
|
||||
``verify_integrity`` option.
|
||||
|
||||
>>> df5 = pd.DataFrame([1], index=['a'])
|
||||
>>> df5
|
||||
0
|
||||
a 1
|
||||
>>> df6 = pd.DataFrame([2], index=['a'])
|
||||
>>> df6
|
||||
0
|
||||
a 2
|
||||
>>> pd.concat([df5, df6], verify_integrity=True)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Indexes have overlapping values: ['a']
|
||||
|
||||
Append a single row to the end of a ``DataFrame`` object.
|
||||
|
||||
>>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
|
||||
>>> df7
|
||||
a b
|
||||
0 1 2
|
||||
>>> new_row = pd.Series({'a': 3, 'b': 4})
|
||||
>>> new_row
|
||||
a 3
|
||||
b 4
|
||||
dtype: int64
|
||||
>>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
|
||||
a b
|
||||
0 1 2
|
||||
1 3 4
|
||||
"""
|
||||
if copy is None:
|
||||
if using_copy_on_write():
|
||||
copy = False
|
||||
else:
|
||||
copy = True
|
||||
elif copy and using_copy_on_write():
|
||||
copy = False
|
||||
|
||||
op = _Concatenator(
|
||||
objs,
|
||||
axis=axis,
|
||||
ignore_index=ignore_index,
|
||||
join=join,
|
||||
keys=keys,
|
||||
levels=levels,
|
||||
names=names,
|
||||
verify_integrity=verify_integrity,
|
||||
copy=copy,
|
||||
sort=sort,
|
||||
)
|
||||
|
||||
return op.get_result()
|
||||
|
||||
|
||||
class _Concatenator:
|
||||
"""
|
||||
Orchestrates a concatenation operation for BlockManagers
|
||||
"""
|
||||
|
||||
sort: bool
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
axis: Axis = 0,
|
||||
join: str = "outer",
|
||||
keys: Iterable[Hashable] | None = None,
|
||||
levels=None,
|
||||
names: list[HashableT] | None = None,
|
||||
ignore_index: bool = False,
|
||||
verify_integrity: bool = False,
|
||||
copy: bool = True,
|
||||
sort: bool = False,
|
||||
) -> None:
|
||||
if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
|
||||
raise TypeError(
|
||||
"first argument must be an iterable of pandas "
|
||||
f'objects, you passed an object of type "{type(objs).__name__}"'
|
||||
)
|
||||
|
||||
if join == "outer":
|
||||
self.intersect = False
|
||||
elif join == "inner":
|
||||
self.intersect = True
|
||||
else: # pragma: no cover
|
||||
raise ValueError(
|
||||
"Only can inner (intersect) or outer (union) join the other axis"
|
||||
)
|
||||
|
||||
if not is_bool(sort):
|
||||
raise ValueError(
|
||||
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
|
||||
)
|
||||
# Incompatible types in assignment (expression has type "Union[bool, bool_]",
|
||||
# variable has type "bool")
|
||||
self.sort = sort # type: ignore[assignment]
|
||||
|
||||
self.ignore_index = ignore_index
|
||||
self.verify_integrity = verify_integrity
|
||||
self.copy = copy
|
||||
|
||||
objs, keys = self._clean_keys_and_objs(objs, keys)
|
||||
|
||||
# figure out what our result ndim is going to be
|
||||
ndims = self._get_ndims(objs)
|
||||
sample, objs = self._get_sample_object(objs, ndims, keys, names, levels)
|
||||
|
||||
# Standardize axis parameter to int
|
||||
if sample.ndim == 1:
|
||||
from pandas import DataFrame
|
||||
|
||||
axis = DataFrame._get_axis_number(axis)
|
||||
self._is_frame = False
|
||||
self._is_series = True
|
||||
else:
|
||||
axis = sample._get_axis_number(axis)
|
||||
self._is_frame = True
|
||||
self._is_series = False
|
||||
|
||||
# Need to flip BlockManager axis in the DataFrame special case
|
||||
axis = sample._get_block_manager_axis(axis)
|
||||
|
||||
# if we have mixed ndims, then convert to highest ndim
|
||||
# creating column numbers as needed
|
||||
if len(ndims) > 1:
|
||||
objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis)
|
||||
|
||||
self.objs = objs
|
||||
|
||||
# note: this is the BlockManager axis (since DataFrame is transposed)
|
||||
self.bm_axis = axis
|
||||
self.axis = 1 - self.bm_axis if self._is_frame else 0
|
||||
self.keys = keys
|
||||
self.names = names or getattr(keys, "names", None)
|
||||
self.levels = levels
|
||||
|
||||
def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]:
|
||||
# figure out what our result ndim is going to be
|
||||
ndims = set()
|
||||
for obj in objs:
|
||||
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
|
||||
msg = (
|
||||
f"cannot concatenate object of type '{type(obj)}'; "
|
||||
"only Series and DataFrame objs are valid"
|
||||
)
|
||||
raise TypeError(msg)
|
||||
|
||||
ndims.add(obj.ndim)
|
||||
return ndims
|
||||
|
||||
def _clean_keys_and_objs(
|
||||
self,
|
||||
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
|
||||
keys,
|
||||
) -> tuple[list[Series | DataFrame], Index | None]:
|
||||
if isinstance(objs, abc.Mapping):
|
||||
if keys is None:
|
||||
keys = list(objs.keys())
|
||||
objs_list = [objs[k] for k in keys]
|
||||
else:
|
||||
objs_list = list(objs)
|
||||
|
||||
if len(objs_list) == 0:
|
||||
raise ValueError("No objects to concatenate")
|
||||
|
||||
if keys is None:
|
||||
objs_list = list(com.not_none(*objs_list))
|
||||
else:
|
||||
# GH#1649
|
||||
clean_keys = []
|
||||
clean_objs = []
|
||||
if is_iterator(keys):
|
||||
keys = list(keys)
|
||||
if len(keys) != len(objs_list):
|
||||
# GH#43485
|
||||
warnings.warn(
|
||||
"The behavior of pd.concat with len(keys) != len(objs) is "
|
||||
"deprecated. In a future version this will raise instead of "
|
||||
"truncating to the smaller of the two sequences",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
for k, v in zip(keys, objs_list):
|
||||
if v is None:
|
||||
continue
|
||||
clean_keys.append(k)
|
||||
clean_objs.append(v)
|
||||
objs_list = clean_objs
|
||||
|
||||
if isinstance(keys, MultiIndex):
|
||||
# TODO: retain levels?
|
||||
keys = type(keys).from_tuples(clean_keys, names=keys.names)
|
||||
else:
|
||||
name = getattr(keys, "name", None)
|
||||
keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
|
||||
|
||||
if len(objs_list) == 0:
|
||||
raise ValueError("All objects passed were None")
|
||||
|
||||
return objs_list, keys
|
||||
|
||||
def _get_sample_object(
|
||||
self,
|
||||
objs: list[Series | DataFrame],
|
||||
ndims: set[int],
|
||||
keys,
|
||||
names,
|
||||
levels,
|
||||
) -> tuple[Series | DataFrame, list[Series | DataFrame]]:
|
||||
# get the sample
|
||||
# want the highest ndim that we have, and must be non-empty
|
||||
# unless all objs are empty
|
||||
sample: Series | DataFrame | None = None
|
||||
if len(ndims) > 1:
|
||||
max_ndim = max(ndims)
|
||||
for obj in objs:
|
||||
if obj.ndim == max_ndim and np.sum(obj.shape):
|
||||
sample = obj
|
||||
break
|
||||
|
||||
else:
|
||||
# filter out the empties if we do not have multi-index possibilities
|
||||
# note: keep empty Series, as they affect the result columns / name
|
||||
non_empties = [obj for obj in objs if sum(obj.shape) > 0 or obj.ndim == 1]
|
||||
|
||||
if len(non_empties) and (
|
||||
keys is None and names is None and levels is None and not self.intersect
|
||||
):
|
||||
objs = non_empties
|
||||
sample = objs[0]
|
||||
|
||||
if sample is None:
|
||||
sample = objs[0]
|
||||
return sample, objs
|
||||
|
||||
def _sanitize_mixed_ndim(
|
||||
self,
|
||||
objs: list[Series | DataFrame],
|
||||
sample: Series | DataFrame,
|
||||
ignore_index: bool,
|
||||
axis: AxisInt,
|
||||
) -> list[Series | DataFrame]:
|
||||
# if we have mixed ndims, then convert to highest ndim
|
||||
# creating column numbers as needed
|
||||
|
||||
new_objs = []
|
||||
|
||||
current_column = 0
|
||||
max_ndim = sample.ndim
|
||||
for obj in objs:
|
||||
ndim = obj.ndim
|
||||
if ndim == max_ndim:
|
||||
pass
|
||||
|
||||
elif ndim != max_ndim - 1:
|
||||
raise ValueError(
|
||||
"cannot concatenate unaligned mixed dimensional NDFrame objects"
|
||||
)
|
||||
|
||||
else:
|
||||
name = getattr(obj, "name", None)
|
||||
if ignore_index or name is None:
|
||||
if axis == 1:
|
||||
# doing a row-wise concatenation so need everything
|
||||
# to line up
|
||||
name = 0
|
||||
else:
|
||||
# doing a column-wise concatenation so need series
|
||||
# to have unique names
|
||||
name = current_column
|
||||
current_column += 1
|
||||
|
||||
obj = sample._constructor({name: obj}, copy=False)
|
||||
|
||||
new_objs.append(obj)
|
||||
|
||||
return new_objs
|
||||
|
||||
def get_result(self):
|
||||
cons: Callable[..., DataFrame | Series]
|
||||
sample: DataFrame | Series
|
||||
|
||||
# series only
|
||||
if self._is_series:
|
||||
sample = cast("Series", self.objs[0])
|
||||
|
||||
# stack blocks
|
||||
if self.bm_axis == 0:
|
||||
name = com.consensus_name_attr(self.objs)
|
||||
cons = sample._constructor
|
||||
|
||||
arrs = [ser._values for ser in self.objs]
|
||||
|
||||
res = concat_compat(arrs, axis=0)
|
||||
|
||||
new_index: Index
|
||||
if self.ignore_index:
|
||||
# We can avoid surprisingly-expensive _get_concat_axis
|
||||
new_index = default_index(len(res))
|
||||
else:
|
||||
new_index = self.new_axes[0]
|
||||
|
||||
mgr = type(sample._mgr).from_array(res, index=new_index)
|
||||
|
||||
result = sample._constructor_from_mgr(mgr, axes=mgr.axes)
|
||||
result._name = name
|
||||
return result.__finalize__(self, method="concat")
|
||||
|
||||
# combine as columns in a frame
|
||||
else:
|
||||
data = dict(zip(range(len(self.objs)), self.objs))
|
||||
|
||||
# GH28330 Preserves subclassed objects through concat
|
||||
cons = sample._constructor_expanddim
|
||||
|
||||
index, columns = self.new_axes
|
||||
df = cons(data, index=index, copy=self.copy)
|
||||
df.columns = columns
|
||||
return df.__finalize__(self, method="concat")
|
||||
|
||||
# combine block managers
|
||||
else:
|
||||
sample = cast("DataFrame", self.objs[0])
|
||||
|
||||
mgrs_indexers = []
|
||||
for obj in self.objs:
|
||||
indexers = {}
|
||||
for ax, new_labels in enumerate(self.new_axes):
|
||||
# ::-1 to convert BlockManager ax to DataFrame ax
|
||||
if ax == self.bm_axis:
|
||||
# Suppress reindexing on concat axis
|
||||
continue
|
||||
|
||||
# 1-ax to convert BlockManager axis to DataFrame axis
|
||||
obj_labels = obj.axes[1 - ax]
|
||||
if not new_labels.equals(obj_labels):
|
||||
indexers[ax] = obj_labels.get_indexer(new_labels)
|
||||
|
||||
mgrs_indexers.append((obj._mgr, indexers))
|
||||
|
||||
new_data = concatenate_managers(
|
||||
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
|
||||
)
|
||||
if not self.copy and not using_copy_on_write():
|
||||
new_data._consolidate_inplace()
|
||||
|
||||
out = sample._constructor_from_mgr(new_data, axes=new_data.axes)
|
||||
return out.__finalize__(self, method="concat")
|
||||
|
||||
def _get_result_dim(self) -> int:
|
||||
if self._is_series and self.bm_axis == 1:
|
||||
return 2
|
||||
else:
|
||||
return self.objs[0].ndim
|
||||
|
||||
@cache_readonly
|
||||
def new_axes(self) -> list[Index]:
|
||||
ndim = self._get_result_dim()
|
||||
return [
|
||||
self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
|
||||
for i in range(ndim)
|
||||
]
|
||||
|
||||
def _get_comb_axis(self, i: AxisInt) -> Index:
|
||||
data_axis = self.objs[0]._get_block_manager_axis(i)
|
||||
return get_objs_combined_axis(
|
||||
self.objs,
|
||||
axis=data_axis,
|
||||
intersect=self.intersect,
|
||||
sort=self.sort,
|
||||
copy=self.copy,
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def _get_concat_axis(self) -> Index:
|
||||
"""
|
||||
Return index to be used along concatenation axis.
|
||||
"""
|
||||
if self._is_series:
|
||||
if self.bm_axis == 0:
|
||||
indexes = [x.index for x in self.objs]
|
||||
elif self.ignore_index:
|
||||
idx = default_index(len(self.objs))
|
||||
return idx
|
||||
elif self.keys is None:
|
||||
names: list[Hashable] = [None] * len(self.objs)
|
||||
num = 0
|
||||
has_names = False
|
||||
for i, x in enumerate(self.objs):
|
||||
if x.ndim != 1:
|
||||
raise TypeError(
|
||||
f"Cannot concatenate type 'Series' with "
|
||||
f"object of type '{type(x).__name__}'"
|
||||
)
|
||||
if x.name is not None:
|
||||
names[i] = x.name
|
||||
has_names = True
|
||||
else:
|
||||
names[i] = num
|
||||
num += 1
|
||||
if has_names:
|
||||
return Index(names)
|
||||
else:
|
||||
return default_index(len(self.objs))
|
||||
else:
|
||||
return ensure_index(self.keys).set_names(self.names)
|
||||
else:
|
||||
indexes = [x.axes[self.axis] for x in self.objs]
|
||||
|
||||
if self.ignore_index:
|
||||
idx = default_index(sum(len(i) for i in indexes))
|
||||
return idx
|
||||
|
||||
if self.keys is None:
|
||||
if self.levels is not None:
|
||||
raise ValueError("levels supported only when keys is not None")
|
||||
concat_axis = _concat_indexes(indexes)
|
||||
else:
|
||||
concat_axis = _make_concat_multiindex(
|
||||
indexes, self.keys, self.levels, self.names
|
||||
)
|
||||
|
||||
self._maybe_check_integrity(concat_axis)
|
||||
|
||||
return concat_axis
|
||||
|
||||
def _maybe_check_integrity(self, concat_index: Index):
|
||||
if self.verify_integrity:
|
||||
if not concat_index.is_unique:
|
||||
overlap = concat_index[concat_index.duplicated()].unique()
|
||||
raise ValueError(f"Indexes have overlapping values: {overlap}")
|
||||
|
||||
|
||||
def _concat_indexes(indexes) -> Index:
|
||||
return indexes[0].append(indexes[1:])
|
||||
|
||||
|
||||
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
|
||||
if (levels is None and isinstance(keys[0], tuple)) or (
|
||||
levels is not None and len(levels) > 1
|
||||
):
|
||||
zipped = list(zip(*keys))
|
||||
if names is None:
|
||||
names = [None] * len(zipped)
|
||||
|
||||
if levels is None:
|
||||
_, levels = factorize_from_iterables(zipped)
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
else:
|
||||
zipped = [keys]
|
||||
if names is None:
|
||||
names = [None]
|
||||
|
||||
if levels is None:
|
||||
levels = [ensure_index(keys).unique()]
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
|
||||
for level in levels:
|
||||
if not level.is_unique:
|
||||
raise ValueError(f"Level values not unique: {level.tolist()}")
|
||||
|
||||
if not all_indexes_same(indexes) or not all(level.is_unique for level in levels):
|
||||
codes_list = []
|
||||
|
||||
# things are potentially different sizes, so compute the exact codes
|
||||
# for each level and pass those to MultiIndex.from_arrays
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
to_concat = []
|
||||
if isinstance(hlevel, Index) and hlevel.equals(level):
|
||||
lens = [len(idx) for idx in indexes]
|
||||
codes_list.append(np.repeat(np.arange(len(hlevel)), lens))
|
||||
else:
|
||||
for key, index in zip(hlevel, indexes):
|
||||
# Find matching codes, include matching nan values as equal.
|
||||
mask = (isna(level) & isna(key)) | (level == key)
|
||||
if not mask.any():
|
||||
raise ValueError(f"Key {key} not in level {level}")
|
||||
i = np.nonzero(mask)[0][0]
|
||||
|
||||
to_concat.append(np.repeat(i, len(index)))
|
||||
codes_list.append(np.concatenate(to_concat))
|
||||
|
||||
concat_index = _concat_indexes(indexes)
|
||||
|
||||
# these go at the end
|
||||
if isinstance(concat_index, MultiIndex):
|
||||
levels.extend(concat_index.levels)
|
||||
codes_list.extend(concat_index.codes)
|
||||
else:
|
||||
codes, categories = factorize_from_iterable(concat_index)
|
||||
levels.append(categories)
|
||||
codes_list.append(codes)
|
||||
|
||||
if len(names) == len(levels):
|
||||
names = list(names)
|
||||
else:
|
||||
# make sure that all of the passed indices have the same nlevels
|
||||
if not len({idx.nlevels for idx in indexes}) == 1:
|
||||
raise AssertionError(
|
||||
"Cannot concat indices that do not have the same number of levels"
|
||||
)
|
||||
|
||||
# also copies
|
||||
names = list(names) + list(get_unanimous_names(*indexes))
|
||||
|
||||
return MultiIndex(
|
||||
levels=levels, codes=codes_list, names=names, verify_integrity=False
|
||||
)
|
||||
|
||||
new_index = indexes[0]
|
||||
n = len(new_index)
|
||||
kpieces = len(indexes)
|
||||
|
||||
# also copies
|
||||
new_names = list(names)
|
||||
new_levels = list(levels)
|
||||
|
||||
# construct codes
|
||||
new_codes = []
|
||||
|
||||
# do something a bit more speedy
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
hlevel_index = ensure_index(hlevel)
|
||||
mapped = level.get_indexer(hlevel_index)
|
||||
|
||||
mask = mapped == -1
|
||||
if mask.any():
|
||||
raise ValueError(
|
||||
f"Values not found in passed level: {hlevel_index[mask]!s}"
|
||||
)
|
||||
|
||||
new_codes.append(np.repeat(mapped, n))
|
||||
|
||||
if isinstance(new_index, MultiIndex):
|
||||
new_levels.extend(new_index.levels)
|
||||
new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
|
||||
else:
|
||||
new_levels.append(new_index.unique())
|
||||
single_codes = new_index.unique().get_indexer(new_index)
|
||||
new_codes.append(np.tile(single_codes, kpieces))
|
||||
|
||||
if len(new_names) < len(new_levels):
|
||||
new_names.extend(new_index.names)
|
||||
|
||||
return MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
lib/python3.13/site-packages/pandas/core/reshape/encoding.py (new file, 570 lines)
@@ -0,0 +1,570 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
)
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.sparse import IntIndex
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer_dtype,
|
||||
is_list_like,
|
||||
is_object_dtype,
|
||||
pandas_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ArrowDtype,
|
||||
CategoricalDtype,
|
||||
)
|
||||
|
||||
from pandas.core.arrays import SparseArray
|
||||
from pandas.core.arrays.categorical import factorize_from_iterable
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
default_index,
|
||||
)
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import NpDtype
|
||||
|
||||
|
||||
def get_dummies(
|
||||
data,
|
||||
prefix=None,
|
||||
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
|
||||
dummy_na: bool = False,
|
||||
columns=None,
|
||||
sparse: bool = False,
|
||||
drop_first: bool = False,
|
||||
dtype: NpDtype | None = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Convert categorical variable into dummy/indicator variables.
|
||||
|
||||
Each variable is converted in as many 0/1 variables as there are different
|
||||
values. Columns in the output are each named after a value; if the input is
|
||||
a DataFrame, the name of the original variable is prepended to the value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like, Series, or DataFrame
|
||||
Data of which to get dummy indicators.
|
||||
prefix : str, list of str, or dict of str, default None
|
||||
String to append DataFrame column names.
|
||||
Pass a list with length equal to the number of columns
|
||||
when calling get_dummies on a DataFrame. Alternatively, `prefix`
|
||||
can be a dictionary mapping column names to prefixes.
|
||||
prefix_sep : str, default '_'
|
||||
If appending prefix, separator/delimiter to use. Or pass a
|
||||
list or dictionary as with `prefix`.
|
||||
dummy_na : bool, default False
|
||||
Add a column to indicate NaNs, if False NaNs are ignored.
|
||||
columns : list-like, default None
|
||||
Column names in the DataFrame to be encoded.
|
||||
If `columns` is None then all the columns with
|
||||
`object`, `string`, or `category` dtype will be converted.
|
||||
sparse : bool, default False
|
||||
Whether the dummy-encoded columns should be backed by
|
||||
a :class:`SparseArray` (True) or a regular NumPy array (False).
|
||||
drop_first : bool, default False
|
||||
Whether to get k-1 dummies out of k categorical levels by removing the
|
||||
first level.
|
||||
dtype : dtype, default bool
|
||||
Data type for new columns. Only a single dtype is allowed.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Dummy-coded data. If `data` contains other columns than the
|
||||
dummy-coded one(s), these will be prepended, unaltered, to the result.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.str.get_dummies : Convert Series of strings to dummy codes.
|
||||
:func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Reference :ref:`the user guide <reshaping.dummies>` for more examples.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> s = pd.Series(list('abca'))
|
||||
|
||||
>>> pd.get_dummies(s)
|
||||
a b c
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
3 True False False
|
||||
|
||||
>>> s1 = ['a', 'b', np.nan]
|
||||
|
||||
>>> pd.get_dummies(s1)
|
||||
a b
|
||||
0 True False
|
||||
1 False True
|
||||
2 False False
|
||||
|
||||
>>> pd.get_dummies(s1, dummy_na=True)
|
||||
a b NaN
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
|
||||
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
|
||||
... 'C': [1, 2, 3]})
|
||||
|
||||
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
|
||||
C col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 True False False True False
|
||||
1 2 False True True False False
|
||||
2 3 True False False False True
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abcaa')))
|
||||
a b c
|
||||
0 True False False
|
||||
1 False True False
|
||||
2 False False True
|
||||
3 True False False
|
||||
4 True False False
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
|
||||
b c
|
||||
0 False False
|
||||
1 True False
|
||||
2 False True
|
||||
3 False False
|
||||
4 False False
|
||||
|
||||
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
|
||||
a b c
|
||||
0 1.0 0.0 0.0
|
||||
1 0.0 1.0 0.0
|
||||
2 0.0 0.0 1.0
|
||||
"""
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
dtypes_to_encode = ["object", "string", "category"]
|
||||
|
||||
if isinstance(data, DataFrame):
|
||||
# determine columns being encoded
|
||||
if columns is None:
|
||||
data_to_encode = data.select_dtypes(include=dtypes_to_encode)
|
||||
elif not is_list_like(columns):
|
||||
raise TypeError("Input must be a list-like for parameter `columns`")
|
||||
else:
|
||||
data_to_encode = data[columns]
|
||||
|
||||
# validate prefixes and separator to avoid silently dropping cols
|
||||
def check_len(item, name: str):
|
||||
if is_list_like(item):
|
||||
if not len(item) == data_to_encode.shape[1]:
|
||||
len_msg = (
|
||||
f"Length of '{name}' ({len(item)}) did not match the "
|
||||
"length of the columns being encoded "
|
||||
f"({data_to_encode.shape[1]})."
|
||||
)
|
||||
raise ValueError(len_msg)
|
||||
|
||||
check_len(prefix, "prefix")
|
||||
check_len(prefix_sep, "prefix_sep")
|
||||
|
||||
if isinstance(prefix, str):
|
||||
prefix = itertools.cycle([prefix])
|
||||
if isinstance(prefix, dict):
|
||||
prefix = [prefix[col] for col in data_to_encode.columns]
|
||||
|
||||
if prefix is None:
|
||||
prefix = data_to_encode.columns
|
||||
|
||||
# validate separators
|
||||
if isinstance(prefix_sep, str):
|
||||
prefix_sep = itertools.cycle([prefix_sep])
|
||||
elif isinstance(prefix_sep, dict):
|
||||
prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
|
||||
|
||||
with_dummies: list[DataFrame]
|
||||
if data_to_encode.shape == data.shape:
|
||||
# Encoding the entire df, do not prepend any dropped columns
|
||||
with_dummies = []
|
||||
elif columns is not None:
|
||||
# Encoding only cols specified in columns. Get all cols not in
|
||||
# columns to prepend to result.
|
||||
with_dummies = [data.drop(columns, axis=1)]
|
||||
else:
|
||||
# Encoding only object and category dtype columns. Get remaining
|
||||
# columns to prepend to result.
|
||||
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
|
||||
|
||||
for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
|
||||
# col is (column_name, column), use just column data here
|
||||
dummy = _get_dummies_1d(
|
||||
col[1],
|
||||
prefix=pre,
|
||||
prefix_sep=sep,
|
||||
dummy_na=dummy_na,
|
||||
sparse=sparse,
|
||||
drop_first=drop_first,
|
||||
dtype=dtype,
|
||||
)
|
||||
with_dummies.append(dummy)
|
||||
result = concat(with_dummies, axis=1)
|
||||
else:
|
||||
result = _get_dummies_1d(
|
||||
data,
|
||||
prefix,
|
||||
prefix_sep,
|
||||
dummy_na,
|
||||
sparse=sparse,
|
||||
drop_first=drop_first,
|
||||
dtype=dtype,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _get_dummies_1d(
|
||||
data,
|
||||
prefix,
|
||||
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
|
||||
dummy_na: bool = False,
|
||||
sparse: bool = False,
|
||||
drop_first: bool = False,
|
||||
dtype: NpDtype | None = None,
|
||||
) -> DataFrame:
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
# Series avoids inconsistent NaN handling
|
||||
codes, levels = factorize_from_iterable(Series(data, copy=False))
|
||||
|
||||
if dtype is None and hasattr(data, "dtype"):
|
||||
input_dtype = data.dtype
|
||||
if isinstance(input_dtype, CategoricalDtype):
|
||||
input_dtype = input_dtype.categories.dtype
|
||||
|
||||
if isinstance(input_dtype, ArrowDtype):
|
||||
import pyarrow as pa
|
||||
|
||||
dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
|
||||
elif (
|
||||
isinstance(input_dtype, StringDtype)
|
||||
and input_dtype.storage != "pyarrow_numpy"
|
||||
):
|
||||
dtype = pandas_dtype("boolean") # type: ignore[assignment]
|
||||
else:
|
||||
dtype = np.dtype(bool)
|
||||
elif dtype is None:
|
||||
dtype = np.dtype(bool)
|
||||
|
||||
_dtype = pandas_dtype(dtype)
|
||||
|
||||
if is_object_dtype(_dtype):
|
||||
raise ValueError("dtype=object is not a valid dtype for get_dummies")
|
||||
|
||||
def get_empty_frame(data) -> DataFrame:
|
||||
index: Index | np.ndarray
|
||||
if isinstance(data, Series):
|
||||
index = data.index
|
||||
else:
|
||||
index = default_index(len(data))
|
||||
return DataFrame(index=index)
|
||||
|
||||
# if all NaN
|
||||
if not dummy_na and len(levels) == 0:
|
||||
return get_empty_frame(data)
|
||||
|
||||
codes = codes.copy()
|
||||
if dummy_na:
|
||||
codes[codes == -1] = len(levels)
|
||||
levels = levels.insert(len(levels), np.nan)
|
||||
|
||||
# if dummy_na, we just fake a nan level. drop_first will drop it again
|
||||
if drop_first and len(levels) == 1:
|
||||
return get_empty_frame(data)
|
||||
|
||||
number_of_cols = len(levels)
|
||||
|
||||
if prefix is None:
|
||||
dummy_cols = levels
|
||||
else:
|
||||
dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
|
||||
|
||||
index: Index | None
|
||||
if isinstance(data, Series):
|
||||
index = data.index
|
||||
else:
|
||||
index = None
|
||||
|
||||
if sparse:
|
||||
fill_value: bool | float
|
||||
if is_integer_dtype(dtype):
|
||||
fill_value = 0
|
||||
elif dtype == np.dtype(bool):
|
||||
fill_value = False
|
||||
else:
|
||||
fill_value = 0.0
|
||||
|
||||
sparse_series = []
|
||||
N = len(data)
|
||||
sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
|
||||
mask = codes != -1
|
||||
codes = codes[mask]
|
||||
n_idx = np.arange(N)[mask]
|
||||
|
||||
for ndx, code in zip(n_idx, codes):
|
||||
sp_indices[code].append(ndx)
|
||||
|
||||
if drop_first:
|
||||
# remove first categorical level to avoid perfect collinearity
|
||||
# GH12042
|
||||
sp_indices = sp_indices[1:]
|
||||
dummy_cols = dummy_cols[1:]
|
||||
for col, ixs in zip(dummy_cols, sp_indices):
|
||||
sarr = SparseArray(
|
||||
np.ones(len(ixs), dtype=dtype),
|
||||
sparse_index=IntIndex(N, ixs),
|
||||
fill_value=fill_value,
|
||||
dtype=dtype,
|
||||
)
|
||||
sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))
|
||||
|
||||
return concat(sparse_series, axis=1, copy=False)
|
||||
|
||||
else:
|
||||
# ensure ndarray layout is column-major
|
||||
shape = len(codes), number_of_cols
|
||||
dummy_dtype: NpDtype
|
||||
if isinstance(_dtype, np.dtype):
|
||||
dummy_dtype = _dtype
|
||||
else:
|
||||
dummy_dtype = np.bool_
|
||||
dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
|
||||
dummy_mat[np.arange(len(codes)), codes] = 1
|
||||
|
||||
if not dummy_na:
|
||||
# reset NaN GH4446
|
||||
dummy_mat[codes == -1] = 0
|
||||
|
||||
if drop_first:
|
||||
# remove first GH12042
|
||||
dummy_mat = dummy_mat[:, 1:]
|
||||
dummy_cols = dummy_cols[1:]
|
||||
return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
|
||||
|
||||
|
||||
def from_dummies(
|
||||
data: DataFrame,
|
||||
sep: None | str = None,
|
||||
default_category: None | Hashable | dict[str, Hashable] = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
|
||||
|
||||
Inverts the operation performed by :func:`~pandas.get_dummies`.
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DataFrame
|
||||
Data which contains dummy-coded variables in form of integer columns of
|
||||
1's and 0's.
|
||||
sep : str, default None
|
||||
Separator used in the column names of the dummy categories; it is the
|
||||
character indicating the separation of the categorical names from the prefixes.
|
||||
For example, if your column names are 'prefix_A' and 'prefix_B',
|
||||
you can strip the underscore by specifying sep='_'.
|
||||
default_category : None, Hashable or dict of Hashables, default None
|
||||
The default category is the implied category when a value has none of the
|
||||
listed categories specified with a one, i.e. if all dummies in a row are
|
||||
zero. Can be a single value for all variables or a dict directly mapping
|
||||
the default categories to a prefix of a variable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Categorical data decoded from the dummy input-data.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* When the input ``DataFrame`` ``data`` contains NA values.
|
||||
* When the input ``DataFrame`` ``data`` contains column names with separators
|
||||
that do not match the separator specified with ``sep``.
|
||||
* When a ``dict`` passed to ``default_category`` does not include an implied
|
||||
category for each prefix.
|
||||
* When a value in ``data`` has more than one category assigned to it.
|
||||
* When ``default_category=None`` and a value in ``data`` has no category
|
||||
assigned to it.
|
||||
TypeError
|
||||
* When the input ``data`` is not of type ``DataFrame``.
|
||||
* When the input ``DataFrame`` ``data`` contains non-dummy data.
|
||||
* When the passed ``sep`` is of a wrong data type.
|
||||
* When the passed ``default_category`` is of a wrong data type.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
|
||||
:class:`~pandas.Categorical` : Represent a categorical variable in classic.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The columns of the passed dummy data should only include 1's and 0's,
|
||||
or boolean values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
|
||||
... "c": [0, 0, 1, 0]})
|
||||
|
||||
>>> df
|
||||
a b c
|
||||
0 1 0 0
|
||||
1 0 1 0
|
||||
2 0 0 1
|
||||
3 1 0 0
|
||||
|
||||
>>> pd.from_dummies(df)
|
||||
0 a
|
||||
1 b
|
||||
2 c
|
||||
3 a
|
||||
|
||||
>>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
|
||||
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
|
||||
... "col2_c": [0, 0, 1]})
|
||||
|
||||
>>> df
|
||||
col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 0 0 1 0
|
||||
1 0 1 1 0 0
|
||||
2 1 0 0 0 1
|
||||
|
||||
>>> pd.from_dummies(df, sep="_")
|
||||
col1 col2
|
||||
0 a b
|
||||
1 b a
|
||||
2 a c
|
||||
|
||||
>>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
|
||||
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
|
||||
... "col2_c": [0, 0, 0]})
|
||||
|
||||
>>> df
|
||||
col1_a col1_b col2_a col2_b col2_c
|
||||
0 1 0 0 1 0
|
||||
1 0 1 1 0 0
|
||||
2 0 0 0 0 0
|
||||
|
||||
>>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
|
||||
col1 col2
|
||||
0 a b
|
||||
1 b a
|
||||
2 d e
|
||||
"""
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
if not isinstance(data, DataFrame):
|
||||
raise TypeError(
|
||||
"Expected 'data' to be a 'DataFrame'; "
|
||||
f"Received 'data' of type: {type(data).__name__}"
|
||||
)
|
||||
|
||||
col_isna_mask = cast(Series, data.isna().any())
|
||||
|
||||
if col_isna_mask.any():
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains NA value in column: "
|
||||
f"'{col_isna_mask.idxmax()}'"
|
||||
)
|
||||
|
||||
# index data with a list of all columns that are dummies
|
||||
try:
|
||||
data_to_decode = data.astype("boolean", copy=False)
|
||||
except TypeError:
|
||||
raise TypeError("Passed DataFrame contains non-dummy data")
|
||||
|
||||
# collect prefixes and get lists to slice data for each prefix
|
||||
variables_slice = defaultdict(list)
|
||||
if sep is None:
|
||||
variables_slice[""] = list(data.columns)
|
||||
elif isinstance(sep, str):
|
||||
for col in data_to_decode.columns:
|
||||
prefix = col.split(sep)[0]
|
||||
if len(prefix) == len(col):
|
||||
raise ValueError(f"Separator not specified for column: {col}")
|
||||
variables_slice[prefix].append(col)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Expected 'sep' to be of type 'str' or 'None'; "
|
||||
f"Received 'sep' of type: {type(sep).__name__}"
|
||||
)
|
||||
|
||||
if default_category is not None:
|
||||
if isinstance(default_category, dict):
|
||||
if not len(default_category) == len(variables_slice):
|
||||
len_msg = (
|
||||
f"Length of 'default_category' ({len(default_category)}) "
|
||||
f"did not match the length of the columns being encoded "
|
||||
f"({len(variables_slice)})"
|
||||
)
|
||||
raise ValueError(len_msg)
|
||||
elif isinstance(default_category, Hashable):
|
||||
default_category = dict(
|
||||
zip(variables_slice, [default_category] * len(variables_slice))
|
||||
)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Expected 'default_category' to be of type "
|
||||
"'None', 'Hashable', or 'dict'; "
|
||||
"Received 'default_category' of type: "
|
||||
f"{type(default_category).__name__}"
|
||||
)
|
||||
|
||||
cat_data = {}
|
||||
for prefix, prefix_slice in variables_slice.items():
|
||||
if sep is None:
|
||||
cats = prefix_slice.copy()
|
||||
else:
|
||||
cats = [col[len(prefix + sep) :] for col in prefix_slice]
|
||||
assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
|
||||
if any(assigned > 1):
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains multi-assignment(s); "
|
||||
f"First instance in row: {assigned.idxmax()}"
|
||||
)
|
||||
if any(assigned == 0):
|
||||
if isinstance(default_category, dict):
|
||||
cats.append(default_category[prefix])
|
||||
else:
|
||||
raise ValueError(
|
||||
"Dummy DataFrame contains unassigned value(s); "
|
||||
f"First instance in row: {assigned.idxmin()}"
|
||||
)
|
||||
data_slice = concat(
|
||||
(data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
|
||||
)
|
||||
else:
|
||||
data_slice = data_to_decode.loc[:, prefix_slice]
|
||||
cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
|
||||
# get indices of True entries along axis=1
|
||||
true_values = data_slice.idxmax(axis=1)
|
||||
indexer = data_slice.columns.get_indexer_for(true_values)
|
||||
cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)
|
||||
|
||||
result = DataFrame(cat_data)
|
||||
if sep is not None:
|
||||
result.columns = result.columns.astype(data.columns.dtype)
|
||||
return result
|
lib/python3.13/site-packages/pandas/core/reshape/melt.py (new file, 512 lines)
@@ -0,0 +1,512 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import Appender
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.indexes.api import MultiIndex
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import tile_compat
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Hashable
|
||||
|
||||
from pandas._typing import AnyArrayLike
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
def ensure_list_vars(arg_vars, variable: str, columns) -> list:
|
||||
if arg_vars is not None:
|
||||
if not is_list_like(arg_vars):
|
||||
return [arg_vars]
|
||||
elif isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
|
||||
raise ValueError(
|
||||
f"{variable} must be a list of tuples when columns are a MultiIndex"
|
||||
)
|
||||
else:
|
||||
return list(arg_vars)
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
|
||||
def melt(
|
||||
frame: DataFrame,
|
||||
id_vars=None,
|
||||
value_vars=None,
|
||||
var_name=None,
|
||||
value_name: Hashable = "value",
|
||||
col_level=None,
|
||||
ignore_index: bool = True,
|
||||
) -> DataFrame:
|
||||
if value_name in frame.columns:
|
||||
raise ValueError(
|
||||
f"value_name ({value_name}) cannot match an element in "
|
||||
"the DataFrame columns."
|
||||
)
|
||||
id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
|
||||
value_vars_was_not_none = value_vars is not None
|
||||
value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)
|
||||
|
||||
if id_vars or value_vars:
|
||||
if col_level is not None:
|
||||
level = frame.columns.get_level_values(col_level)
|
||||
else:
|
||||
level = frame.columns
|
||||
labels = id_vars + value_vars
|
||||
idx = level.get_indexer_for(labels)
|
||||
missing = idx == -1
|
||||
if missing.any():
|
||||
missing_labels = [
|
||||
lab for lab, not_found in zip(labels, missing) if not_found
|
||||
]
|
||||
raise KeyError(
|
||||
"The following id_vars or value_vars are not present in "
|
||||
f"the DataFrame: {missing_labels}"
|
||||
)
|
||||
if value_vars_was_not_none:
|
||||
frame = frame.iloc[:, algos.unique(idx)]
|
||||
else:
|
||||
frame = frame.copy()
|
||||
else:
|
||||
frame = frame.copy()
|
||||
|
||||
if col_level is not None: # allow list or other?
|
||||
# frame is a copy
|
||||
frame.columns = frame.columns.get_level_values(col_level)
|
||||
|
||||
if var_name is None:
|
||||
if isinstance(frame.columns, MultiIndex):
|
||||
if len(frame.columns.names) == len(set(frame.columns.names)):
|
||||
var_name = frame.columns.names
|
||||
else:
|
||||
var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
|
||||
else:
|
||||
var_name = [
|
||||
frame.columns.name if frame.columns.name is not None else "variable"
|
||||
]
|
||||
elif is_list_like(var_name):
|
||||
raise ValueError(f"{var_name=} must be a scalar.")
|
||||
else:
|
||||
var_name = [var_name]
|
||||
|
||||
num_rows, K = frame.shape
|
||||
num_cols_adjusted = K - len(id_vars)
|
||||
|
||||
mdata: dict[Hashable, AnyArrayLike] = {}
|
||||
for col in id_vars:
|
||||
id_data = frame.pop(col)
|
||||
if not isinstance(id_data.dtype, np.dtype):
|
||||
# i.e. ExtensionDtype
|
||||
if num_cols_adjusted > 0:
|
||||
mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
|
||||
else:
|
||||
# We can't concat empty list. (GH 46044)
|
||||
mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
|
||||
else:
|
||||
mdata[col] = np.tile(id_data._values, num_cols_adjusted)
|
||||
|
||||
mcolumns = id_vars + var_name + [value_name]
|
||||
|
||||
if frame.shape[1] > 0 and not any(
|
||||
not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
|
||||
):
|
||||
mdata[value_name] = concat(
|
||||
[frame.iloc[:, i] for i in range(frame.shape[1])]
|
||||
).values
|
||||
else:
|
||||
mdata[value_name] = frame._values.ravel("F")
|
||||
for i, col in enumerate(var_name):
|
||||
mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)
|
||||
|
||||
result = frame._constructor(mdata, columns=mcolumns)
|
||||
|
||||
if not ignore_index:
|
||||
result.index = tile_compat(frame.index, num_cols_adjusted)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
|
||||
"""
|
||||
Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
|
||||
|
||||
Accepts a dictionary, ``groups``, in which each key is a new column name
|
||||
and each value is a list of old column names that will be "melted" under
|
||||
the new column name as part of the reshape.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DataFrame
|
||||
The wide-format DataFrame.
|
||||
groups : dict
|
||||
{new_name : list_of_columns}.
|
||||
dropna : bool, default True
|
||||
Do not include columns whose entries are all NaN.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Reshaped DataFrame.
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
wide_to_long : Wide panel to long format. Less flexible but more
|
||||
user-friendly than melt.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
|
||||
... 'team': ['Red Sox', 'Yankees'],
|
||||
... 'year1': [2007, 2007], 'year2': [2008, 2008]})
|
||||
>>> data
|
||||
hr1 hr2 team year1 year2
|
||||
0 514 545 Red Sox 2007 2008
|
||||
1 573 526 Yankees 2007 2008
|
||||
|
||||
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
|
||||
team year hr
|
||||
0 Red Sox 2007 514
|
||||
1 Yankees 2007 573
|
||||
2 Red Sox 2008 545
|
||||
3 Yankees 2008 526
|
||||
"""
|
||||
mdata = {}
|
||||
pivot_cols = []
|
||||
all_cols: set[Hashable] = set()
|
||||
K = len(next(iter(groups.values())))
|
||||
for target, names in groups.items():
|
||||
if len(names) != K:
|
||||
raise ValueError("All column lists must be same length")
|
||||
to_concat = [data[col]._values for col in names]
|
||||
|
||||
mdata[target] = concat_compat(to_concat)
|
||||
pivot_cols.append(target)
|
||||
all_cols = all_cols.union(names)
|
||||
|
||||
id_cols = list(data.columns.difference(all_cols))
|
||||
for col in id_cols:
|
||||
mdata[col] = np.tile(data[col]._values, K)
|
||||
|
||||
if dropna:
|
||||
mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
|
||||
for c in pivot_cols:
|
||||
mask &= notna(mdata[c])
|
||||
if not mask.all():
|
||||
mdata = {k: v[mask] for k, v in mdata.items()}
|
||||
|
||||
return data._constructor(mdata, columns=id_cols + pivot_cols)
|
||||
|
||||
|
||||
def wide_to_long(
|
||||
df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
|
||||
) -> DataFrame:
|
||||
r"""
|
||||
Unpivot a DataFrame from wide to long format.
|
||||
|
||||
Less flexible but more user-friendly than melt.
|
||||
|
||||
With stubnames ['A', 'B'], this function expects to find one or more
|
||||
group of columns with format
|
||||
A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
|
||||
You specify what you want to call this suffix in the resulting long format
|
||||
with `j` (for example `j='year'`)
|
||||
|
||||
Each row of these wide variables are assumed to be uniquely identified by
|
||||
`i` (can be a single column name or a list of column names)
|
||||
|
||||
All remaining variables in the data frame are left intact.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
The wide-format DataFrame.
|
||||
stubnames : str or list-like
|
||||
The stub name(s). The wide format variables are assumed to
|
||||
start with the stub names.
|
||||
i : str or list-like
|
||||
Column(s) to use as id variable(s).
|
||||
j : str
|
||||
The name of the sub-observation variable. What you wish to name your
|
||||
suffix in the long format.
|
||||
sep : str, default ""
|
||||
A character indicating the separation of the variable names
|
||||
in the wide format, to be stripped from the names in the long format.
|
||||
For example, if your column names are A-suffix1, A-suffix2, you
|
||||
can strip the hyphen by specifying `sep='-'`.
|
||||
suffix : str, default '\\d+'
|
||||
A regular expression capturing the wanted suffixes. '\\d+' captures
|
||||
numeric suffixes. Suffixes with no numbers could be specified with the
|
||||
negated character class '\\D+'. You can also further disambiguate
|
||||
suffixes, for example, if your wide variables are of the form A-one,
B-two, ..., and you have an unrelated column A-rating, you can ignore the
last one by specifying `suffix='(one|two)'`. When all suffixes are
|
||||
numeric, they are cast to int64/float64.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
A DataFrame that contains each stub name as a variable, with new index
|
||||
(i, j).
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
|
||||
Notes
|
||||
-----
|
||||
All extra variables are left untouched. This simply uses
|
||||
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
|
||||
in a typical case.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> np.random.seed(123)
|
||||
>>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
|
||||
... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
|
||||
... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
|
||||
... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
|
||||
... "X" : dict(zip(range(3), np.random.randn(3)))
|
||||
... })
|
||||
>>> df["id"] = df.index
|
||||
>>> df
|
||||
A1970 A1980 B1970 B1980 X id
|
||||
0 a d 2.5 3.2 -1.085631 0
|
||||
1 b e 1.2 1.3 0.997345 1
|
||||
2 c f 0.7 0.1 0.282978 2
|
||||
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A B
|
||||
id year
|
||||
0 1970 -1.085631 a 2.5
|
||||
1 1970 0.997345 b 1.2
|
||||
2 1970 0.282978 c 0.7
|
||||
0 1980 -1.085631 d 3.2
|
||||
1 1980 0.997345 e 1.3
|
||||
2 1980 0.282978 f 0.1
|
||||
|
||||
With multiple id columns
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 1 2.8
|
||||
2 3.4
|
||||
2 1 2.9
|
||||
2 3.8
|
||||
3 1 2.2
|
||||
2 2.9
|
||||
2 1 1 2.0
|
||||
2 3.2
|
||||
2 1 1.8
|
||||
2 2.8
|
||||
3 1 1.9
|
||||
2 2.4
|
||||
3 1 1 2.2
|
||||
2 3.3
|
||||
2 1 2.3
|
||||
2 3.4
|
||||
3 1 2.1
|
||||
2 2.9
|
||||
|
||||
Going from long back to wide just takes some creative use of `unstack`
|
||||
|
||||
>>> w = l.unstack()
|
||||
>>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
|
||||
>>> w.reset_index()
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
Less wieldy column names are also handled
|
||||
|
||||
>>> np.random.seed(0)
|
||||
>>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
|
||||
... 'A(weekly)-2011': np.random.rand(3),
|
||||
... 'B(weekly)-2010': np.random.rand(3),
|
||||
... 'B(weekly)-2011': np.random.rand(3),
|
||||
... 'X' : np.random.randint(3, size=3)})
|
||||
>>> df['id'] = df.index
|
||||
>>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||||
A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
|
||||
0 0.548814 0.544883 0.437587 0.383442 0 0
|
||||
1 0.715189 0.423655 0.891773 0.791725 1 1
|
||||
2 0.602763 0.645894 0.963663 0.528895 1 2
|
||||
|
||||
>>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
|
||||
... j='year', sep='-')
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A(weekly) B(weekly)
|
||||
id year
|
||||
0 2010 0 0.548814 0.437587
|
||||
1 2010 1 0.715189 0.891773
|
||||
2 2010 1 0.602763 0.963663
|
||||
0 2011 0 0.544883 0.383442
|
||||
1 2011 1 0.423655 0.791725
|
||||
2 2011 1 0.645894 0.528895
|
||||
|
||||
If we have many columns, we could also use a regex to find our
|
||||
stubnames and pass that list on to wide_to_long
|
||||
|
||||
>>> stubnames = sorted(
|
||||
... set([match[0] for match in df.columns.str.findall(
|
||||
... r'[A-B]\(.*\)').values if match != []])
|
||||
... )
|
||||
>>> list(stubnames)
|
||||
['A(weekly)', 'B(weekly)']
|
||||
|
||||
All of the above examples have integers as suffixes. It is possible to
|
||||
have non-integers as suffixes.
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht_one ht_two
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
|
||||
... sep='_', suffix=r'\w+')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 one 2.8
|
||||
two 3.4
|
||||
2 one 2.9
|
||||
two 3.8
|
||||
3 one 2.2
|
||||
two 2.9
|
||||
2 1 one 2.0
|
||||
two 3.2
|
||||
2 one 1.8
|
||||
two 2.8
|
||||
3 one 1.9
|
||||
two 2.4
|
||||
3 1 one 2.2
|
||||
two 3.3
|
||||
2 one 2.3
|
||||
two 3.4
|
||||
3 one 2.1
|
||||
two 2.9
|
||||
"""
|
||||
|
||||
def get_var_names(df, stub: str, sep: str, suffix: str):
|
||||
regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
|
||||
return df.columns[df.columns.str.match(regex)]
|
||||
|
||||
def melt_stub(df, stub: str, i, j, value_vars, sep: str):
|
||||
newdf = melt(
|
||||
df,
|
||||
id_vars=i,
|
||||
value_vars=value_vars,
|
||||
value_name=stub.rstrip(sep),
|
||||
var_name=j,
|
||||
)
|
||||
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
|
||||
|
||||
# GH17627 Cast numerics suffixes to int/float
|
||||
try:
|
||||
newdf[j] = to_numeric(newdf[j])
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
# TODO: anything else to catch?
|
||||
pass
|
||||
|
||||
return newdf.set_index(i + [j])
|
||||
|
||||
if not is_list_like(stubnames):
|
||||
stubnames = [stubnames]
|
||||
else:
|
||||
stubnames = list(stubnames)
|
||||
|
||||
if df.columns.isin(stubnames).any():
|
||||
raise ValueError("stubname can't be identical to a column name")
|
||||
|
||||
if not is_list_like(i):
|
||||
i = [i]
|
||||
else:
|
||||
i = list(i)
|
||||
|
||||
if df[i].duplicated().any():
|
||||
raise ValueError("the id variables need to uniquely identify each row")
|
||||
|
||||
_melted = []
|
||||
value_vars_flattened = []
|
||||
for stub in stubnames:
|
||||
value_var = get_var_names(df, stub, sep, suffix)
|
||||
value_vars_flattened.extend(value_var)
|
||||
_melted.append(melt_stub(df, stub, i, j, value_var, sep))
|
||||
|
||||
melted = concat(_melted, axis=1)
|
||||
id_vars = df.columns.difference(value_vars_flattened)
|
||||
new = df[id_vars]
|
||||
|
||||
if len(i) == 1:
|
||||
return new.set_index(i).join(melted)
|
||||
else:
|
||||
return new.merge(melted.reset_index(), on=i).set_index(i + [j])
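A small usage sketch of the function defined above (editor's illustration; the frame and stub names are hypothetical). It exercises the regex-based column selection and the per-stub melt/join performed by get_var_names and melt_stub:

import pandas as pd

df = pd.DataFrame(
    {
        "famid": [1, 1, 2, 2],
        "birth": [1, 2, 1, 2],
        "ht1": [2.8, 2.9, 2.0, 1.8],
        "ht2": [3.4, 3.8, 3.2, 2.8],
    }
)

# Columns matching ^ht\d+$ are melted and indexed by (famid, birth, age).
long = pd.wide_to_long(df, stubnames="ht", i=["famid", "birth"], j="age")
print(long.sort_index())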
|
2762
lib/python3.13/site-packages/pandas/core/reshape/merge.py
Normal file
File diff suppressed because it is too large
899
lib/python3.13/site-packages/pandas/core/reshape/pivot.py
Normal file
@ -0,0 +1,899 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Sequence,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Literal,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.util._decorators import (
|
||||
Appender,
|
||||
Substitution,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like,
|
||||
is_nested_list_like,
|
||||
is_scalar,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
|
||||
import pandas.core.common as com
|
||||
from pandas.core.frame import _shared_docs
|
||||
from pandas.core.groupby import Grouper
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
get_objs_combined_axis,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import cartesian_product
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
AggFuncType,
|
||||
AggFuncTypeBase,
|
||||
AggFuncTypeDict,
|
||||
IndexLabel,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
|
||||
# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
|
||||
@Substitution("\ndata : DataFrame")
|
||||
@Appender(_shared_docs["pivot_table"], indents=1)
|
||||
def pivot_table(
|
||||
data: DataFrame,
|
||||
values=None,
|
||||
index=None,
|
||||
columns=None,
|
||||
aggfunc: AggFuncType = "mean",
|
||||
fill_value=None,
|
||||
margins: bool = False,
|
||||
dropna: bool = True,
|
||||
margins_name: Hashable = "All",
|
||||
observed: bool | lib.NoDefault = lib.no_default,
|
||||
sort: bool = True,
|
||||
) -> DataFrame:
|
||||
index = _convert_by(index)
|
||||
columns = _convert_by(columns)
|
||||
|
||||
if isinstance(aggfunc, list):
|
||||
pieces: list[DataFrame] = []
|
||||
keys = []
|
||||
for func in aggfunc:
|
||||
_table = __internal_pivot_table(
|
||||
data,
|
||||
values=values,
|
||||
index=index,
|
||||
columns=columns,
|
||||
fill_value=fill_value,
|
||||
aggfunc=func,
|
||||
margins=margins,
|
||||
dropna=dropna,
|
||||
margins_name=margins_name,
|
||||
observed=observed,
|
||||
sort=sort,
|
||||
)
|
||||
pieces.append(_table)
|
||||
keys.append(getattr(func, "__name__", func))
|
||||
|
||||
table = concat(pieces, keys=keys, axis=1)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
table = __internal_pivot_table(
|
||||
data,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc,
|
||||
fill_value,
|
||||
margins,
|
||||
dropna,
|
||||
margins_name,
|
||||
observed,
|
||||
sort,
|
||||
)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
|
||||
def __internal_pivot_table(
|
||||
data: DataFrame,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc: AggFuncTypeBase | AggFuncTypeDict,
|
||||
fill_value,
|
||||
margins: bool,
|
||||
dropna: bool,
|
||||
margins_name: Hashable,
|
||||
observed: bool | lib.NoDefault,
|
||||
sort: bool,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
|
||||
"""
|
||||
keys = index + columns
|
||||
|
||||
values_passed = values is not None
|
||||
if values_passed:
|
||||
if is_list_like(values):
|
||||
values_multi = True
|
||||
values = list(values)
|
||||
else:
|
||||
values_multi = False
|
||||
values = [values]
|
||||
|
||||
# GH14938 Make sure value labels are in data
|
||||
for i in values:
|
||||
if i not in data:
|
||||
raise KeyError(i)
|
||||
|
||||
to_filter = []
|
||||
for x in keys + values:
|
||||
if isinstance(x, Grouper):
|
||||
x = x.key
|
||||
try:
|
||||
if x in data:
|
||||
to_filter.append(x)
|
||||
except TypeError:
|
||||
pass
|
||||
if len(to_filter) < len(data.columns):
|
||||
data = data[to_filter]
|
||||
|
||||
else:
|
||||
values = data.columns
|
||||
for key in keys:
|
||||
try:
|
||||
values = values.drop(key)
|
||||
except (TypeError, ValueError, KeyError):
|
||||
pass
|
||||
values = list(values)
|
||||
|
||||
observed_bool = False if observed is lib.no_default else observed
|
||||
grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna)
|
||||
if observed is lib.no_default and any(
|
||||
ping._passed_categorical for ping in grouped._grouper.groupings
|
||||
):
|
||||
warnings.warn(
|
||||
"The default value of observed=False is deprecated and will change "
|
||||
"to observed=True in a future version of pandas. Specify "
|
||||
"observed=False to silence this warning and retain the current behavior",
|
||||
category=FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
agged = grouped.agg(aggfunc)
|
||||
|
||||
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
|
||||
agged = agged.dropna(how="all")
|
||||
|
||||
table = agged
|
||||
|
||||
# GH17038, this check should only happen if index is defined (not None)
|
||||
if table.index.nlevels > 1 and index:
|
||||
# Related GH #17123
|
||||
# If index_names are integers, determine whether the integers refer
|
||||
# to the level position or name.
|
||||
index_names = agged.index.names[: len(index)]
|
||||
to_unstack = []
|
||||
for i in range(len(index), len(keys)):
|
||||
name = agged.index.names[i]
|
||||
if name is None or name in index_names:
|
||||
to_unstack.append(i)
|
||||
else:
|
||||
to_unstack.append(name)
|
||||
table = agged.unstack(to_unstack, fill_value=fill_value)
|
||||
|
||||
if not dropna:
|
||||
if isinstance(table.index, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.index.levels), names=table.index.names
|
||||
)
|
||||
table = table.reindex(m, axis=0, fill_value=fill_value)
|
||||
|
||||
if isinstance(table.columns, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.columns.levels), names=table.columns.names
|
||||
)
|
||||
table = table.reindex(m, axis=1, fill_value=fill_value)
|
||||
|
||||
if sort is True and isinstance(table, ABCDataFrame):
|
||||
table = table.sort_index(axis=1)
|
||||
|
||||
if fill_value is not None:
|
||||
table = table.fillna(fill_value)
|
||||
if aggfunc is len and not observed and lib.is_integer(fill_value):
|
||||
# TODO: can we avoid this? this used to be handled by
|
||||
# downcast="infer" in fillna
|
||||
table = table.astype(np.int64)
|
||||
|
||||
if margins:
|
||||
if dropna:
|
||||
data = data[data.notna().all(axis=1)]
|
||||
table = _add_margins(
|
||||
table,
|
||||
data,
|
||||
values,
|
||||
rows=index,
|
||||
cols=columns,
|
||||
aggfunc=aggfunc,
|
||||
observed=dropna,
|
||||
margins_name=margins_name,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
|
||||
# discard the top level
|
||||
if values_passed and not values_multi and table.columns.nlevels > 1:
|
||||
table.columns = table.columns.droplevel(0)
|
||||
if len(index) == 0 and len(columns) > 0:
|
||||
table = table.T
|
||||
|
||||
# GH 15193 Make sure empty columns are removed if dropna=True
|
||||
if isinstance(table, ABCDataFrame) and dropna:
|
||||
table = table.dropna(how="all", axis=1)
|
||||
|
||||
return table
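For orientation, the core pipeline above (group by the combined index/column keys, aggregate, then unstack the column keys) can be approximated with public APIs. A hedged sketch with hypothetical data:

import pandas as pd

df = pd.DataFrame(
    {
        "A": ["foo", "foo", "bar", "bar"],
        "B": ["one", "two", "one", "two"],
        "C": [1.0, 2.0, 3.0, 4.0],
    }
)

# pivot_table(index="A", columns="B", values="C", aggfunc="mean") reduces to:
approx = df.groupby(["A", "B"])["C"].mean().unstack("B")
exact = df.pivot_table(index="A", columns="B", values="C", aggfunc="mean")
print(approx)
print(exact)  # same values; the helper above adds margins/dropna/fill handling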
|
||||
|
||||
|
||||
def _add_margins(
|
||||
table: DataFrame | Series,
|
||||
data: DataFrame,
|
||||
values,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
fill_value=None,
|
||||
):
|
||||
if not isinstance(margins_name, str):
|
||||
raise ValueError("margins_name argument must be a string")
|
||||
|
||||
msg = f'Conflicting name "{margins_name}" in margins'
|
||||
for level in table.index.names:
|
||||
if margins_name in table.index.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
|
||||
|
||||
if table.ndim == 2:
|
||||
# i.e. DataFrame
|
||||
for level in table.columns.names[1:]:
|
||||
if margins_name in table.columns.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
key: str | tuple[str, ...]
|
||||
if len(rows) > 1:
|
||||
key = (margins_name,) + ("",) * (len(rows) - 1)
|
||||
else:
|
||||
key = margins_name
|
||||
|
||||
if not values and isinstance(table, ABCSeries):
|
||||
# If there are no values and the table is a series, then there is only
|
||||
# one column in the data. Compute grand margin and return it.
|
||||
return table._append(table._constructor({key: grand_margin[margins_name]}))
|
||||
|
||||
elif values:
|
||||
marginal_result_set = _generate_marginal_results(
|
||||
table, data, values, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
else:
|
||||
# no values, and table is a DataFrame
|
||||
assert isinstance(table, ABCDataFrame)
|
||||
marginal_result_set = _generate_marginal_results_without_values(
|
||||
table, data, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
|
||||
row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
|
||||
# populate grand margin
|
||||
for k in margin_keys:
|
||||
if isinstance(k, str):
|
||||
row_margin[k] = grand_margin[k]
|
||||
else:
|
||||
row_margin[k] = grand_margin[k[0]]
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
margin_dummy = DataFrame(row_margin, columns=Index([key])).T
|
||||
|
||||
row_names = result.index.names
|
||||
# check the result column and leave floats
|
||||
|
||||
for dtype in set(result.dtypes):
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
# Can hold NA already
|
||||
continue
|
||||
|
||||
cols = result.select_dtypes([dtype]).columns
|
||||
margin_dummy[cols] = margin_dummy[cols].apply(
|
||||
maybe_downcast_to_dtype, args=(dtype,)
|
||||
)
|
||||
result = result._append(margin_dummy)
|
||||
result.index.names = row_names
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _compute_grand_margin(
|
||||
data: DataFrame, values, aggfunc, margins_name: Hashable = "All"
|
||||
):
|
||||
if values:
|
||||
grand_margin = {}
|
||||
for k, v in data[values].items():
|
||||
try:
|
||||
if isinstance(aggfunc, str):
|
||||
grand_margin[k] = getattr(v, aggfunc)()
|
||||
elif isinstance(aggfunc, dict):
|
||||
if isinstance(aggfunc[k], str):
|
||||
grand_margin[k] = getattr(v, aggfunc[k])()
|
||||
else:
|
||||
grand_margin[k] = aggfunc[k](v)
|
||||
else:
|
||||
grand_margin[k] = aggfunc(v)
|
||||
except TypeError:
|
||||
pass
|
||||
return grand_margin
|
||||
else:
|
||||
return {margins_name: aggfunc(data.index)}
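The grand margin computed above is just the aggregation applied to each selected value column over the whole frame. A minimal sketch with hypothetical columns x and y:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})

# With values=["x", "y"] and aggfunc="mean", the loop above amounts to:
grand_margin = {k: getattr(v, "mean")() for k, v in df[["x", "y"]].items()}
print(grand_margin)  # {'x': 2.0, 'y': 20.0}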
|
||||
|
||||
|
||||
def _generate_marginal_results(
|
||||
table,
|
||||
data: DataFrame,
|
||||
values,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
):
|
||||
margin_keys: list | Index
|
||||
if len(cols) > 0:
|
||||
# need to "interleave" the margins
|
||||
table_pieces = []
|
||||
margin_keys = []
|
||||
|
||||
def _all_key(key):
|
||||
return (key, margins_name) + ("",) * (len(cols) - 1)
|
||||
|
||||
if len(rows) > 0:
|
||||
margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
|
||||
cat_axis = 1
|
||||
|
||||
for key, piece in table.T.groupby(level=0, observed=observed):
|
||||
piece = piece.T
|
||||
all_key = _all_key(key)
|
||||
|
||||
# we are going to mutate this, so need to copy!
|
||||
piece = piece.copy()
|
||||
piece[all_key] = margin[key]
|
||||
|
||||
table_pieces.append(piece)
|
||||
margin_keys.append(all_key)
|
||||
else:
|
||||
from pandas import DataFrame
|
||||
|
||||
cat_axis = 0
|
||||
for key, piece in table.groupby(level=0, observed=observed):
|
||||
if len(cols) > 1:
|
||||
all_key = _all_key(key)
|
||||
else:
|
||||
all_key = margins_name
|
||||
table_pieces.append(piece)
|
||||
# GH31016 this is to calculate the margin for each group, and assign the
# corresponding key as the index
|
||||
transformed_piece = DataFrame(piece.apply(aggfunc)).T
|
||||
if isinstance(piece.index, MultiIndex):
|
||||
# We are adding an empty level
|
||||
transformed_piece.index = MultiIndex.from_tuples(
|
||||
[all_key], names=piece.index.names + [None]
|
||||
)
|
||||
else:
|
||||
transformed_piece.index = Index([all_key], name=piece.index.name)
|
||||
|
||||
# append piece for margin into table_piece
|
||||
table_pieces.append(transformed_piece)
|
||||
margin_keys.append(all_key)
|
||||
|
||||
if not table_pieces:
|
||||
# GH 49240
|
||||
return table
|
||||
else:
|
||||
result = concat(table_pieces, axis=cat_axis)
|
||||
|
||||
if len(rows) == 0:
|
||||
return result
|
||||
else:
|
||||
result = table
|
||||
margin_keys = table.columns
|
||||
|
||||
if len(cols) > 0:
|
||||
row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
|
||||
row_margin = row_margin.stack(future_stack=True)
|
||||
|
||||
# GH#26568. Use names instead of indices in case of numeric names
|
||||
new_order_indices = [len(cols)] + list(range(len(cols)))
|
||||
new_order_names = [row_margin.index.names[i] for i in new_order_indices]
|
||||
row_margin.index = row_margin.index.reorder_levels(new_order_names)
|
||||
else:
|
||||
row_margin = data._constructor_sliced(np.nan, index=result.columns)
|
||||
|
||||
return result, margin_keys, row_margin
|
||||
|
||||
|
||||
def _generate_marginal_results_without_values(
|
||||
table: DataFrame,
|
||||
data: DataFrame,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed: bool,
|
||||
margins_name: Hashable = "All",
|
||||
):
|
||||
margin_keys: list | Index
|
||||
if len(cols) > 0:
|
||||
# need to "interleave" the margins
|
||||
margin_keys = []
|
||||
|
||||
def _all_key():
|
||||
if len(cols) == 1:
|
||||
return margins_name
|
||||
return (margins_name,) + ("",) * (len(cols) - 1)
|
||||
|
||||
if len(rows) > 0:
|
||||
margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc)
|
||||
all_key = _all_key()
|
||||
table[all_key] = margin
|
||||
result = table
|
||||
margin_keys.append(all_key)
|
||||
|
||||
else:
|
||||
margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
|
||||
all_key = _all_key()
|
||||
table[all_key] = margin
|
||||
result = table
|
||||
margin_keys.append(all_key)
|
||||
return result
|
||||
else:
|
||||
result = table
|
||||
margin_keys = table.columns
|
||||
|
||||
if len(cols):
|
||||
row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc)
|
||||
else:
|
||||
row_margin = Series(np.nan, index=result.columns)
|
||||
|
||||
return result, margin_keys, row_margin
|
||||
|
||||
|
||||
def _convert_by(by):
|
||||
if by is None:
|
||||
by = []
|
||||
elif (
|
||||
is_scalar(by)
|
||||
or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
|
||||
or callable(by)
|
||||
):
|
||||
by = [by]
|
||||
else:
|
||||
by = list(by)
|
||||
return by
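A standalone re-implementation of the normalization above, simplified to public types for illustration (the real helper also accepts Grouper objects):

import numpy as np
import pandas as pd

def convert_by_sketch(by):
    # None -> no keys; a scalar, array, Index, Series or callable names a
    # single grouping; anything else is treated as a list of groupings.
    if by is None:
        return []
    if np.isscalar(by) or isinstance(by, (np.ndarray, pd.Index, pd.Series)) or callable(by):
        return [by]
    return list(by)

print(convert_by_sketch(None))        # []
print(convert_by_sketch("A"))         # ['A']
print(convert_by_sketch(["A", "B"]))  # ['A', 'B']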
|
||||
|
||||
|
||||
@Substitution("\ndata : DataFrame")
|
||||
@Appender(_shared_docs["pivot"], indents=1)
|
||||
def pivot(
|
||||
data: DataFrame,
|
||||
*,
|
||||
columns: IndexLabel,
|
||||
index: IndexLabel | lib.NoDefault = lib.no_default,
|
||||
values: IndexLabel | lib.NoDefault = lib.no_default,
|
||||
) -> DataFrame:
|
||||
columns_listlike = com.convert_to_list_like(columns)
|
||||
|
||||
# If columns is None we will create a MultiIndex level with None as name
|
||||
# which might cause duplicated names because None is the default for
|
||||
# level names
|
||||
data = data.copy(deep=False)
|
||||
data.index = data.index.copy()
|
||||
data.index.names = [
|
||||
name if name is not None else lib.no_default for name in data.index.names
|
||||
]
|
||||
|
||||
indexed: DataFrame | Series
|
||||
if values is lib.no_default:
|
||||
if index is not lib.no_default:
|
||||
cols = com.convert_to_list_like(index)
|
||||
else:
|
||||
cols = []
|
||||
|
||||
append = index is lib.no_default
|
||||
# error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
|
||||
# error: Unsupported left operand type for + ("ExtensionArray")
|
||||
indexed = data.set_index(
|
||||
cols + columns_listlike, append=append # type: ignore[operator]
|
||||
)
|
||||
else:
|
||||
index_list: list[Index] | list[Series]
|
||||
if index is lib.no_default:
|
||||
if isinstance(data.index, MultiIndex):
|
||||
# GH 23955
|
||||
index_list = [
|
||||
data.index.get_level_values(i) for i in range(data.index.nlevels)
|
||||
]
|
||||
else:
|
||||
index_list = [
|
||||
data._constructor_sliced(data.index, name=data.index.name)
|
||||
]
|
||||
else:
|
||||
index_list = [data[idx] for idx in com.convert_to_list_like(index)]
|
||||
|
||||
data_columns = [data[col] for col in columns_listlike]
|
||||
index_list.extend(data_columns)
|
||||
multiindex = MultiIndex.from_arrays(index_list)
|
||||
|
||||
if is_list_like(values) and not isinstance(values, tuple):
|
||||
# Exclude tuple because it is seen as a single column name
|
||||
values = cast(Sequence[Hashable], values)
|
||||
indexed = data._constructor(
|
||||
data[values]._values, index=multiindex, columns=values
|
||||
)
|
||||
else:
|
||||
indexed = data._constructor_sliced(data[values]._values, index=multiindex)
|
||||
# error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union
|
||||
# [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected
|
||||
# "Hashable"
|
||||
result = indexed.unstack(columns_listlike) # type: ignore[arg-type]
|
||||
result.index.names = [
|
||||
name if name is not lib.no_default else None for name in result.index.names
|
||||
]
|
||||
|
||||
return result
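The function above boils down to a set_index over the index/column keys followed by an unstack of the column keys. A hedged sketch with hypothetical data:

import pandas as pd

df = pd.DataFrame(
    {
        "foo": ["one", "one", "two", "two"],
        "bar": ["A", "B", "A", "B"],
        "baz": [1, 2, 3, 4],
    }
)

approx = df.set_index(["foo", "bar"])["baz"].unstack("bar")
exact = df.pivot(index="foo", columns="bar", values="baz")
print(approx)
print(exact)  # same layout; pivot adds duplicate checks and name handling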
|
||||
|
||||
|
||||
def crosstab(
|
||||
index,
|
||||
columns,
|
||||
values=None,
|
||||
rownames=None,
|
||||
colnames=None,
|
||||
aggfunc=None,
|
||||
margins: bool = False,
|
||||
margins_name: Hashable = "All",
|
||||
dropna: bool = True,
|
||||
normalize: bool | Literal[0, 1, "all", "index", "columns"] = False,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Compute a simple cross tabulation of two (or more) factors.
|
||||
|
||||
By default, computes a frequency table of the factors unless an
|
||||
array of values and an aggregation function are passed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : array-like, Series, or list of arrays/Series
|
||||
Values to group by in the rows.
|
||||
columns : array-like, Series, or list of arrays/Series
|
||||
Values to group by in the columns.
|
||||
values : array-like, optional
|
||||
Array of values to aggregate according to the factors.
|
||||
Requires `aggfunc` be specified.
|
||||
rownames : sequence, default None
|
||||
If passed, must match number of row arrays passed.
|
||||
colnames : sequence, default None
|
||||
If passed, must match number of column arrays passed.
|
||||
aggfunc : function, optional
|
||||
If specified, requires `values` be specified as well.
|
||||
margins : bool, default False
|
||||
Add row/column margins (subtotals).
|
||||
margins_name : str, default 'All'
|
||||
Name of the row/column that will contain the totals
|
||||
when margins is True.
|
||||
dropna : bool, default True
|
||||
Do not include columns whose entries are all NaN.
|
||||
normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
|
||||
Normalize by dividing all values by the sum of values.
|
||||
|
||||
- If passed 'all' or `True`, will normalize over all values.
|
||||
- If passed 'index' will normalize over each row.
|
||||
- If passed 'columns' will normalize over each column.
|
||||
- If margins is `True`, will also normalize margin values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Cross tabulation of the data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.pivot : Reshape data based on column values.
|
||||
pivot_table : Create a pivot table as a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Any Series passed will have their name attributes used unless row or column
|
||||
names for the cross-tabulation are specified.
|
||||
|
||||
Any input passed containing Categorical data will have **all** of its
|
||||
categories included in the cross-tabulation, even if the actual data does
|
||||
not contain any instances of a particular category.
|
||||
|
||||
In the event that there aren't overlapping indexes, an empty DataFrame will
be returned.
|
||||
|
||||
Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
|
||||
... "bar", "bar", "foo", "foo", "foo"], dtype=object)
|
||||
>>> b = np.array(["one", "one", "one", "two", "one", "one",
|
||||
... "one", "two", "two", "two", "one"], dtype=object)
|
||||
>>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
|
||||
... "shiny", "dull", "shiny", "shiny", "shiny"],
|
||||
... dtype=object)
|
||||
>>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
|
||||
b one two
|
||||
c dull shiny dull shiny
|
||||
a
|
||||
bar 1 2 1 0
|
||||
foo 2 2 1 2
|
||||
|
||||
Here 'c' and 'f' are not represented in the data and will not be
|
||||
shown in the output because dropna is True by default. Set
|
||||
dropna=False to preserve categories with no data.
|
||||
|
||||
>>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
|
||||
>>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
|
||||
>>> pd.crosstab(foo, bar)
|
||||
col_0 d e
|
||||
row_0
|
||||
a 1 0
|
||||
b 0 1
|
||||
>>> pd.crosstab(foo, bar, dropna=False)
|
||||
col_0 d e f
|
||||
row_0
|
||||
a 1 0 0
|
||||
b 0 1 0
|
||||
c 0 0 0
|
||||
"""
|
||||
if values is None and aggfunc is not None:
|
||||
raise ValueError("aggfunc cannot be used without values.")
|
||||
|
||||
if values is not None and aggfunc is None:
|
||||
raise ValueError("values cannot be used without an aggfunc.")
|
||||
|
||||
if not is_nested_list_like(index):
|
||||
index = [index]
|
||||
if not is_nested_list_like(columns):
|
||||
columns = [columns]
|
||||
|
||||
common_idx = None
|
||||
pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
|
||||
if pass_objs:
|
||||
common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
|
||||
|
||||
rownames = _get_names(index, rownames, prefix="row")
|
||||
colnames = _get_names(columns, colnames, prefix="col")
|
||||
|
||||
# duplicate names mapped to unique names for pivot op
|
||||
(
|
||||
rownames_mapper,
|
||||
unique_rownames,
|
||||
colnames_mapper,
|
||||
unique_colnames,
|
||||
) = _build_names_mapper(rownames, colnames)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
data = {
|
||||
**dict(zip(unique_rownames, index)),
|
||||
**dict(zip(unique_colnames, columns)),
|
||||
}
|
||||
df = DataFrame(data, index=common_idx)
|
||||
|
||||
if values is None:
|
||||
df["__dummy__"] = 0
|
||||
kwargs = {"aggfunc": len, "fill_value": 0}
|
||||
else:
|
||||
df["__dummy__"] = values
|
||||
kwargs = {"aggfunc": aggfunc}
|
||||
|
||||
# error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type
|
||||
# "**Dict[str, object]"; expected "Union[...]"
|
||||
table = df.pivot_table(
|
||||
"__dummy__",
|
||||
index=unique_rownames,
|
||||
columns=unique_colnames,
|
||||
margins=margins,
|
||||
margins_name=margins_name,
|
||||
dropna=dropna,
|
||||
observed=False,
|
||||
**kwargs, # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
# Post-process
|
||||
if normalize is not False:
|
||||
table = _normalize(
|
||||
table, normalize=normalize, margins=margins, margins_name=margins_name
|
||||
)
|
||||
|
||||
table = table.rename_axis(index=rownames_mapper, axis=0)
|
||||
table = table.rename_axis(columns=colnames_mapper, axis=1)
|
||||
|
||||
return table
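As the body above shows, crosstab is implemented as a pivot_table over a dummy column. A minimal sketch of that equivalence with hypothetical factors:

import pandas as pd

a = pd.Series(["foo", "foo", "bar", "bar", "foo"], name="a")
b = pd.Series(["one", "two", "one", "two", "one"], name="b")

df = pd.DataFrame({"a": a, "b": b, "__dummy__": 0})
approx = df.pivot_table(
    "__dummy__", index="a", columns="b", aggfunc="count", fill_value=0
)
exact = pd.crosstab(a, b)
print(approx)
print(exact)  # same counts; crosstab also handles margins and normalize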
|
||||
|
||||
|
||||
def _normalize(
|
||||
table: DataFrame, normalize, margins: bool, margins_name: Hashable = "All"
|
||||
) -> DataFrame:
|
||||
if not isinstance(normalize, (bool, str)):
|
||||
axis_subs = {0: "index", 1: "columns"}
|
||||
try:
|
||||
normalize = axis_subs[normalize]
|
||||
except KeyError as err:
|
||||
raise ValueError("Not a valid normalize argument") from err
|
||||
|
||||
if margins is False:
|
||||
# Actual Normalizations
|
||||
normalizers: dict[bool | str, Callable] = {
|
||||
"all": lambda x: x / x.sum(axis=1).sum(axis=0),
|
||||
"columns": lambda x: x / x.sum(),
|
||||
"index": lambda x: x.div(x.sum(axis=1), axis=0),
|
||||
}
|
||||
|
||||
normalizers[True] = normalizers["all"]
|
||||
|
||||
try:
|
||||
f = normalizers[normalize]
|
||||
except KeyError as err:
|
||||
raise ValueError("Not a valid normalize argument") from err
|
||||
|
||||
table = f(table)
|
||||
table = table.fillna(0)
|
||||
|
||||
elif margins is True:
|
||||
# keep index and column of pivoted table
|
||||
table_index = table.index
|
||||
table_columns = table.columns
|
||||
last_ind_or_col = table.iloc[-1, :].name
|
||||
|
||||
# check that the margin name appears in (for MultiIndex cases) or equals the
# last index/column label, then save the column and index margins
|
||||
if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
|
||||
raise ValueError(f"{margins_name} not in pivoted DataFrame")
|
||||
column_margin = table.iloc[:-1, -1]
|
||||
index_margin = table.iloc[-1, :-1]
|
||||
|
||||
# keep the core table
|
||||
table = table.iloc[:-1, :-1]
|
||||
|
||||
# Normalize core
|
||||
table = _normalize(table, normalize=normalize, margins=False)
|
||||
|
||||
# Fix Margins
|
||||
if normalize == "columns":
|
||||
column_margin = column_margin / column_margin.sum()
|
||||
table = concat([table, column_margin], axis=1)
|
||||
table = table.fillna(0)
|
||||
table.columns = table_columns
|
||||
|
||||
elif normalize == "index":
|
||||
index_margin = index_margin / index_margin.sum()
|
||||
table = table._append(index_margin)
|
||||
table = table.fillna(0)
|
||||
table.index = table_index
|
||||
|
||||
elif normalize == "all" or normalize is True:
|
||||
column_margin = column_margin / column_margin.sum()
|
||||
index_margin = index_margin / index_margin.sum()
|
||||
index_margin.loc[margins_name] = 1
|
||||
table = concat([table, column_margin], axis=1)
|
||||
table = table._append(index_margin)
|
||||
|
||||
table = table.fillna(0)
|
||||
table.index = table_index
|
||||
table.columns = table_columns
|
||||
|
||||
else:
|
||||
raise ValueError("Not a valid normalize argument")
|
||||
|
||||
else:
|
||||
raise ValueError("Not a valid margins argument")
|
||||
|
||||
return table
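The three normalizers defined above correspond to simple elementwise divisions. A short sketch on a hypothetical count table:

import pandas as pd

table = pd.DataFrame({"x": [1, 3], "y": [1, 0]}, index=["a", "b"])

print(table.div(table.sum(axis=1), axis=0))   # normalize="index": row sums -> 1
print(table / table.sum())                    # normalize="columns": column sums -> 1
print(table / table.sum(axis=1).sum(axis=0))  # normalize="all"/True: grand total -> 1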
|
||||
|
||||
|
||||
def _get_names(arrs, names, prefix: str = "row"):
|
||||
if names is None:
|
||||
names = []
|
||||
for i, arr in enumerate(arrs):
|
||||
if isinstance(arr, ABCSeries) and arr.name is not None:
|
||||
names.append(arr.name)
|
||||
else:
|
||||
names.append(f"{prefix}_{i}")
|
||||
else:
|
||||
if len(names) != len(arrs):
|
||||
raise AssertionError("arrays and names must have the same length")
|
||||
if not isinstance(names, list):
|
||||
names = list(names)
|
||||
|
||||
return names
|
||||
|
||||
|
||||
def _build_names_mapper(
|
||||
rownames: list[str], colnames: list[str]
|
||||
) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
|
||||
"""
|
||||
Given the names of a DataFrame's rows and columns, returns a set of unique row
|
||||
and column names and mappers that convert to original names.
|
||||
|
||||
A row or column name is replaced if it is a duplicate among the rows of the
inputs, among the columns of the inputs, or shared between the rows and the columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
rownames: list[str]
|
||||
colnames: list[str]
|
||||
|
||||
Returns
|
||||
-------
|
||||
Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
|
||||
|
||||
rownames_mapper: dict[str, str]
|
||||
a dictionary with new row names as keys and original rownames as values
|
||||
unique_rownames: list[str]
|
||||
a list of rownames with duplicate names replaced by dummy names
|
||||
colnames_mapper: dict[str, str]
|
||||
a dictionary with new column names as keys and original column names as values
|
||||
unique_colnames: list[str]
|
||||
a list of column names with duplicate names replaced by dummy names
|
||||
|
||||
"""
|
||||
|
||||
def get_duplicates(names):
    # Return the names that occur more than once in ``names``.
    seen: set = set()
    dups: set = set()
    for name in names:
        if name in seen:
            dups.add(name)
        seen.add(name)
    return dups
|
||||
|
||||
shared_names = set(rownames).intersection(set(colnames))
|
||||
dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
|
||||
|
||||
rownames_mapper = {
|
||||
f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
|
||||
}
|
||||
unique_rownames = [
|
||||
f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
|
||||
]
|
||||
|
||||
colnames_mapper = {
|
||||
f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
|
||||
}
|
||||
unique_colnames = [
|
||||
f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
|
||||
]
|
||||
|
||||
return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
|
989
lib/python3.13/site-packages/pandas/core/reshape/reshape.py
Normal file
@ -0,0 +1,989 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas._libs.reshape as libreshape
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util._decorators import cache_readonly
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.cast import (
|
||||
find_common_type,
|
||||
maybe_promote,
|
||||
)
|
||||
from pandas.core.dtypes.common import (
|
||||
ensure_platform_int,
|
||||
is_1d_only_ea_dtype,
|
||||
is_integer,
|
||||
needs_i8_conversion,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.algorithms import (
|
||||
factorize,
|
||||
unique,
|
||||
)
|
||||
from pandas.core.arrays.categorical import factorize_from_iterable
|
||||
from pandas.core.construction import ensure_wrapped_if_datetimelike
|
||||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
RangeIndex,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.sorting import (
|
||||
compress_group_index,
|
||||
decons_obs_group_ids,
|
||||
get_compressed_ids,
|
||||
get_group_index,
|
||||
get_group_index_sorter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Level,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
from pandas.core.indexes.frozen import FrozenList
|
||||
|
||||
|
||||
class _Unstacker:
|
||||
"""
|
||||
Helper class to unstack data / pivot with multi-level index
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index : MultiIndex
|
||||
level : int or str, default last level
|
||||
Level to "unstack". Accepts a name for the level.
|
||||
fill_value : scalar, optional
|
||||
Default value to fill in missing values if subgroups do not have the
|
||||
same set of labels. By default, missing values will be replaced with
|
||||
the default fill value for that data type, NaN for float, NaT for
|
||||
datetimelike, etc. For integer types, by default data will be converted to
float and missing values will be set to NaN.
|
||||
constructor : object
|
||||
Pandas ``DataFrame`` or subclass used to create unstacked
|
||||
response. If None, DataFrame will be used.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
|
||||
... ('two', 'a'), ('two', 'b')])
|
||||
>>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
|
||||
>>> s
|
||||
one a 1
|
||||
b 2
|
||||
two a 3
|
||||
b 4
|
||||
dtype: int64
|
||||
|
||||
>>> s.unstack(level=-1)
|
||||
a b
|
||||
one 1 2
|
||||
two 3 4
|
||||
|
||||
>>> s.unstack(level=0)
|
||||
one two
|
||||
a 1 3
|
||||
b 2 4
|
||||
|
||||
Returns
|
||||
-------
|
||||
unstacked : DataFrame
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, index: MultiIndex, level: Level, constructor, sort: bool = True
|
||||
) -> None:
|
||||
self.constructor = constructor
|
||||
self.sort = sort
|
||||
|
||||
self.index = index.remove_unused_levels()
|
||||
|
||||
self.level = self.index._get_level_number(level)
|
||||
|
||||
# when index includes `nan`, need to lift levels/strides by 1
|
||||
self.lift = 1 if -1 in self.index.codes[self.level] else 0
|
||||
|
||||
# Note: the "pop" below alters these in-place.
|
||||
self.new_index_levels = list(self.index.levels)
|
||||
self.new_index_names = list(self.index.names)
|
||||
|
||||
self.removed_name = self.new_index_names.pop(self.level)
|
||||
self.removed_level = self.new_index_levels.pop(self.level)
|
||||
self.removed_level_full = index.levels[self.level]
|
||||
if not self.sort:
|
||||
unique_codes = unique(self.index.codes[self.level])
|
||||
self.removed_level = self.removed_level.take(unique_codes)
|
||||
self.removed_level_full = self.removed_level_full.take(unique_codes)
|
||||
|
||||
# Bug fix GH 20601
|
||||
# If the data frame is too big, the number of unique index combinations
# will cause an int32 overflow on Windows environments.
# We want to check and raise a warning before this happens.
|
||||
num_rows = np.max([index_level.size for index_level in self.new_index_levels])
|
||||
num_columns = self.removed_level.size
|
||||
|
||||
# GH20601: This forces an overflow if the number of cells is too high.
|
||||
num_cells = num_rows * num_columns
|
||||
|
||||
# GH 26314: Previous ValueError raised was too restrictive for many users.
|
||||
if num_cells > np.iinfo(np.int32).max:
|
||||
warnings.warn(
|
||||
f"The following operation may generate {num_cells} cells "
|
||||
f"in the resulting pandas object.",
|
||||
PerformanceWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
self._make_selectors()
|
||||
|
||||
@cache_readonly
|
||||
def _indexer_and_to_sort(
|
||||
self,
|
||||
) -> tuple[
|
||||
npt.NDArray[np.intp],
|
||||
list[np.ndarray], # each has _some_ signed integer dtype
|
||||
]:
|
||||
v = self.level
|
||||
|
||||
codes = list(self.index.codes)
|
||||
levs = list(self.index.levels)
|
||||
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
|
||||
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
|
||||
|
||||
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
|
||||
ngroups = len(obs_ids)
|
||||
|
||||
indexer = get_group_index_sorter(comp_index, ngroups)
|
||||
return indexer, to_sort
|
||||
|
||||
@cache_readonly
|
||||
def sorted_labels(self) -> list[np.ndarray]:
|
||||
indexer, to_sort = self._indexer_and_to_sort
|
||||
if self.sort:
|
||||
return [line.take(indexer) for line in to_sort]
|
||||
return to_sort
|
||||
|
||||
def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
|
||||
if self.sort:
|
||||
indexer, _ = self._indexer_and_to_sort
|
||||
|
||||
sorted_values = algos.take_nd(values, indexer, axis=0)
|
||||
return sorted_values
|
||||
return values
|
||||
|
||||
def _make_selectors(self):
|
||||
new_levels = self.new_index_levels
|
||||
|
||||
# make the mask
|
||||
remaining_labels = self.sorted_labels[:-1]
|
||||
level_sizes = tuple(len(x) for x in new_levels)
|
||||
|
||||
comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
|
||||
ngroups = len(obs_ids)
|
||||
|
||||
comp_index = ensure_platform_int(comp_index)
|
||||
stride = self.index.levshape[self.level] + self.lift
|
||||
self.full_shape = ngroups, stride
|
||||
|
||||
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
|
||||
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
|
||||
mask.put(selector, True)
|
||||
|
||||
if mask.sum() < len(self.index):
|
||||
raise ValueError("Index contains duplicate entries, cannot reshape")
|
||||
|
||||
self.group_index = comp_index
|
||||
self.mask = mask
|
||||
if self.sort:
|
||||
self.compressor = comp_index.searchsorted(np.arange(ngroups))
|
||||
else:
|
||||
self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])
|
||||
|
||||
@cache_readonly
|
||||
def mask_all(self) -> bool:
|
||||
return bool(self.mask.all())
|
||||
|
||||
@cache_readonly
|
||||
def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
|
||||
# We cache this for reuse in ExtensionBlock._unstack
|
||||
dummy_arr = np.arange(len(self.index), dtype=np.intp)
|
||||
new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
|
||||
return new_values, mask.any(0)
|
||||
# TODO: in all tests we have mask.any(0).all(); can we rely on that?
|
||||
|
||||
def get_result(self, values, value_columns, fill_value) -> DataFrame:
|
||||
if values.ndim == 1:
|
||||
values = values[:, np.newaxis]
|
||||
|
||||
if value_columns is None and values.shape[1] != 1: # pragma: no cover
|
||||
raise ValueError("must pass column labels for multi-column data")
|
||||
|
||||
values, _ = self.get_new_values(values, fill_value)
|
||||
columns = self.get_new_columns(value_columns)
|
||||
index = self.new_index
|
||||
|
||||
return self.constructor(
|
||||
values, index=index, columns=columns, dtype=values.dtype
|
||||
)
|
||||
|
||||
def get_new_values(self, values, fill_value=None):
|
||||
if values.ndim == 1:
|
||||
values = values[:, np.newaxis]
|
||||
|
||||
sorted_values = self._make_sorted_values(values)
|
||||
|
||||
# place the values
|
||||
length, width = self.full_shape
|
||||
stride = values.shape[1]
|
||||
result_width = width * stride
|
||||
result_shape = (length, result_width)
|
||||
mask = self.mask
|
||||
mask_all = self.mask_all
|
||||
|
||||
# we can simply reshape if we don't have a mask
|
||||
if mask_all and len(values):
|
||||
# TODO: Under what circumstances can we rely on sorted_values
|
||||
# matching values? When that holds, we can slice instead
|
||||
# of take (in particular for EAs)
|
||||
new_values = (
|
||||
sorted_values.reshape(length, width, stride)
|
||||
.swapaxes(1, 2)
|
||||
.reshape(result_shape)
|
||||
)
|
||||
new_mask = np.ones(result_shape, dtype=bool)
|
||||
return new_values, new_mask
|
||||
|
||||
dtype = values.dtype
|
||||
|
||||
# if our mask is all True, then we can use our existing dtype
|
||||
if mask_all:
|
||||
dtype = values.dtype
|
||||
new_values = np.empty(result_shape, dtype=dtype)
|
||||
else:
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
# GH#41875
|
||||
# We are assuming that fill_value can be held by this dtype,
|
||||
# unlike the non-EA case that promotes.
|
||||
cls = dtype.construct_array_type()
|
||||
new_values = cls._empty(result_shape, dtype=dtype)
|
||||
new_values[:] = fill_value
|
||||
else:
|
||||
dtype, fill_value = maybe_promote(dtype, fill_value)
|
||||
new_values = np.empty(result_shape, dtype=dtype)
|
||||
new_values.fill(fill_value)
|
||||
|
||||
name = dtype.name
|
||||
new_mask = np.zeros(result_shape, dtype=bool)
|
||||
|
||||
# we need to convert to a basic dtype
|
||||
# and possibly coerce an input to our output dtype
|
||||
# e.g. ints -> floats
|
||||
if needs_i8_conversion(values.dtype):
|
||||
sorted_values = sorted_values.view("i8")
|
||||
new_values = new_values.view("i8")
|
||||
else:
|
||||
sorted_values = sorted_values.astype(name, copy=False)
|
||||
|
||||
# fill in our values & mask
|
||||
libreshape.unstack(
|
||||
sorted_values,
|
||||
mask.view("u1"),
|
||||
stride,
|
||||
length,
|
||||
width,
|
||||
new_values,
|
||||
new_mask.view("u1"),
|
||||
)
|
||||
|
||||
# reconstruct dtype if needed
|
||||
if needs_i8_conversion(values.dtype):
|
||||
# view as datetime64 so we can wrap in DatetimeArray and use
|
||||
# DTA's view method
|
||||
new_values = new_values.view("M8[ns]")
|
||||
new_values = ensure_wrapped_if_datetimelike(new_values)
|
||||
new_values = new_values.view(values.dtype)
|
||||
|
||||
return new_values, new_mask
|
||||
|
||||
def get_new_columns(self, value_columns: Index | None):
|
||||
if value_columns is None:
|
||||
if self.lift == 0:
|
||||
return self.removed_level._rename(name=self.removed_name)
|
||||
|
||||
lev = self.removed_level.insert(0, item=self.removed_level._na_value)
|
||||
return lev.rename(self.removed_name)
|
||||
|
||||
stride = len(self.removed_level) + self.lift
|
||||
width = len(value_columns)
|
||||
propagator = np.repeat(np.arange(width), stride)
|
||||
|
||||
new_levels: FrozenList | list[Index]
|
||||
|
||||
if isinstance(value_columns, MultiIndex):
|
||||
# error: Cannot determine type of "__add__" [has-type]
|
||||
new_levels = value_columns.levels + ( # type: ignore[has-type]
|
||||
self.removed_level_full,
|
||||
)
|
||||
new_names = value_columns.names + (self.removed_name,)
|
||||
|
||||
new_codes = [lab.take(propagator) for lab in value_columns.codes]
|
||||
else:
|
||||
new_levels = [
|
||||
value_columns,
|
||||
self.removed_level_full,
|
||||
]
|
||||
new_names = [value_columns.name, self.removed_name]
|
||||
new_codes = [propagator]
|
||||
|
||||
repeater = self._repeater
|
||||
|
||||
# The entire level is then just a repetition of the single chunk:
|
||||
new_codes.append(np.tile(repeater, width))
|
||||
return MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def _repeater(self) -> np.ndarray:
|
||||
# The two indices differ only if the unstacked level had unused items:
|
||||
if len(self.removed_level_full) != len(self.removed_level):
|
||||
# In this case, we remap the new codes to the original level:
|
||||
repeater = self.removed_level_full.get_indexer(self.removed_level)
|
||||
if self.lift:
|
||||
repeater = np.insert(repeater, 0, -1)
|
||||
else:
|
||||
# Otherwise, we just use each level item exactly once:
|
||||
stride = len(self.removed_level) + self.lift
|
||||
repeater = np.arange(stride) - self.lift
|
||||
|
||||
return repeater
|
||||
|
||||
@cache_readonly
|
||||
def new_index(self) -> MultiIndex:
|
||||
# Does not depend on values or value_columns
|
||||
result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
|
||||
|
||||
# construct the new index
|
||||
if len(self.new_index_levels) == 1:
|
||||
level, level_codes = self.new_index_levels[0], result_codes[0]
|
||||
if (level_codes == -1).any():
|
||||
level = level.insert(len(level), level._na_value)
|
||||
return level.take(level_codes).rename(self.new_index_names[0])
|
||||
|
||||
return MultiIndex(
|
||||
levels=self.new_index_levels,
|
||||
codes=result_codes,
|
||||
names=self.new_index_names,
|
||||
verify_integrity=False,
|
||||
)
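A compact illustration (editor's sketch) of the reshape the class above performs, expressed via the public Series.unstack API:

import numpy as np
import pandas as pd

index = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
s = pd.Series(np.arange(4), index=index)

# The chosen level becomes the new columns; combinations absent from the
# original index are filled with fill_value (NaN by default).
print(s.unstack(level=-1))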
|
||||
|
||||
|
||||
def _unstack_multiple(
|
||||
data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
|
||||
):
|
||||
if len(clocs) == 0:
|
||||
return data
|
||||
|
||||
# NOTE: This doesn't deal with hierarchical columns yet
|
||||
|
||||
index = data.index
|
||||
index = cast(MultiIndex, index) # caller is responsible for checking
|
||||
|
||||
# GH 19966 Make sure that if the MultiIndexed index has a tuple name, it is
# recognised as a whole
|
||||
if clocs in index.names:
|
||||
clocs = [clocs]
|
||||
clocs = [index._get_level_number(i) for i in clocs]
|
||||
|
||||
rlocs = [i for i in range(index.nlevels) if i not in clocs]
|
||||
|
||||
clevels = [index.levels[i] for i in clocs]
|
||||
ccodes = [index.codes[i] for i in clocs]
|
||||
cnames = [index.names[i] for i in clocs]
|
||||
rlevels = [index.levels[i] for i in rlocs]
|
||||
rcodes = [index.codes[i] for i in rlocs]
|
||||
rnames = [index.names[i] for i in rlocs]
|
||||
|
||||
shape = tuple(len(x) for x in clevels)
|
||||
group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
|
||||
|
||||
comp_ids, obs_ids = compress_group_index(group_index, sort=False)
|
||||
recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
|
||||
|
||||
if not rlocs:
|
||||
# Everything is in clocs, so the dummy df has a regular index
|
||||
dummy_index = Index(obs_ids, name="__placeholder__")
|
||||
else:
|
||||
dummy_index = MultiIndex(
|
||||
levels=rlevels + [obs_ids],
|
||||
codes=rcodes + [comp_ids],
|
||||
names=rnames + ["__placeholder__"],
|
||||
verify_integrity=False,
|
||||
)
|
||||
|
||||
if isinstance(data, Series):
|
||||
dummy = data.copy()
|
||||
dummy.index = dummy_index
|
||||
|
||||
unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
|
||||
new_levels = clevels
|
||||
new_names = cnames
|
||||
new_codes = recons_codes
|
||||
else:
|
||||
if isinstance(data.columns, MultiIndex):
|
||||
result = data
|
||||
while clocs:
|
||||
val = clocs.pop(0)
|
||||
result = result.unstack(val, fill_value=fill_value, sort=sort)
|
||||
clocs = [v if v < val else v - 1 for v in clocs]
|
||||
|
||||
return result
|
||||
|
||||
# GH#42579 deep=False to avoid consolidating
|
||||
dummy_df = data.copy(deep=False)
|
||||
dummy_df.index = dummy_index
|
||||
|
||||
unstacked = dummy_df.unstack(
|
||||
"__placeholder__", fill_value=fill_value, sort=sort
|
||||
)
|
||||
if isinstance(unstacked, Series):
|
||||
unstcols = unstacked.index
|
||||
else:
|
||||
unstcols = unstacked.columns
|
||||
assert isinstance(unstcols, MultiIndex) # for mypy
|
||||
new_levels = [unstcols.levels[0]] + clevels
|
||||
new_names = [data.columns.name] + cnames
|
||||
|
||||
new_codes = [unstcols.codes[0]]
|
||||
new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)
|
||||
|
||||
new_columns = MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
|
||||
if isinstance(unstacked, Series):
|
||||
unstacked.index = new_columns
|
||||
else:
|
||||
unstacked.columns = new_columns
|
||||
|
||||
return unstacked
|
||||
|
||||
|
||||
def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
|
||||
if isinstance(level, (tuple, list)):
|
||||
if len(level) != 1:
|
||||
# _unstack_multiple only handles MultiIndexes,
|
||||
# and isn't needed for a single level
|
||||
return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
|
||||
else:
|
||||
level = level[0]
|
||||
|
||||
if not is_integer(level) and not level == "__placeholder__":
|
||||
# check if level is valid in case of regular index
|
||||
obj.index._get_level_number(level)
|
||||
|
||||
if isinstance(obj, DataFrame):
|
||||
if isinstance(obj.index, MultiIndex):
|
||||
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
|
||||
else:
|
||||
return obj.T.stack(future_stack=True)
|
||||
elif not isinstance(obj.index, MultiIndex):
|
||||
# GH 36113
|
||||
# Give nicer error messages when unstacking a Series whose
|
||||
# Index is not a MultiIndex.
|
||||
raise ValueError(
|
||||
f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
|
||||
)
|
||||
else:
|
||||
if is_1d_only_ea_dtype(obj.dtype):
|
||||
return _unstack_extension_series(obj, level, fill_value, sort=sort)
|
||||
unstacker = _Unstacker(
|
||||
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
|
||||
)
|
||||
return unstacker.get_result(
|
||||
obj._values, value_columns=None, fill_value=fill_value
|
||||
)
|
||||
|
||||
|
||||
def _unstack_frame(
|
||||
obj: DataFrame, level, fill_value=None, sort: bool = True
|
||||
) -> DataFrame:
|
||||
assert isinstance(obj.index, MultiIndex) # checked by caller
|
||||
unstacker = _Unstacker(
|
||||
obj.index, level=level, constructor=obj._constructor, sort=sort
|
||||
)
|
||||
|
||||
if not obj._can_fast_transpose:
|
||||
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
|
||||
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
|
||||
else:
|
||||
return unstacker.get_result(
|
||||
obj._values, value_columns=obj.columns, fill_value=fill_value
|
||||
)
|
||||
|
||||
|
||||
def _unstack_extension_series(
|
||||
series: Series, level, fill_value, sort: bool
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Unstack an ExtensionArray-backed Series.
|
||||
|
||||
The ExtensionDtype is preserved.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
series : Series
|
||||
A Series with an ExtensionArray for values
|
||||
level : Any
|
||||
The level name or number.
|
||||
fill_value : Any
|
||||
The user-level (not physical storage) fill value to use for
|
||||
missing values introduced by the reshape. Passed to
|
||||
``series.values.take``.
|
||||
sort : bool
|
||||
        Whether to sort the resulting MultiIndex levels.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Each column of the DataFrame will have the same dtype as
|
||||
the input Series.
|
||||
"""
|
||||
# Defer to the logic in ExtensionBlock._unstack
|
||||
df = series.to_frame()
|
||||
result = df.unstack(level=level, fill_value=fill_value, sort=sort)
|
||||
|
||||
# equiv: result.droplevel(level=0, axis=1)
|
||||
# but this avoids an extra copy
|
||||
result.columns = result.columns._drop_level_numbers([0])
|
||||
return result
|
||||
|
||||
|
||||
def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
|
||||
"""
|
||||
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stacked : Series or DataFrame
|
||||
"""
|
||||
|
||||
def stack_factorize(index):
|
||||
if index.is_unique:
|
||||
return index, np.arange(len(index))
|
||||
codes, categories = factorize_from_iterable(index)
|
||||
return categories, codes
|
||||
|
||||
N, K = frame.shape
|
||||
|
||||
# Will also convert negative level numbers and check if out of bounds.
|
||||
level_num = frame.columns._get_level_number(level)
|
||||
|
||||
if isinstance(frame.columns, MultiIndex):
|
||||
return _stack_multi_columns(
|
||||
frame, level_num=level_num, dropna=dropna, sort=sort
|
||||
)
|
||||
elif isinstance(frame.index, MultiIndex):
|
||||
new_levels = list(frame.index.levels)
|
||||
new_codes = [lab.repeat(K) for lab in frame.index.codes]
|
||||
|
||||
clev, clab = stack_factorize(frame.columns)
|
||||
new_levels.append(clev)
|
||||
new_codes.append(np.tile(clab, N).ravel())
|
||||
|
||||
new_names = list(frame.index.names)
|
||||
new_names.append(frame.columns.name)
|
||||
new_index = MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
else:
|
||||
levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
|
||||
codes = ilab.repeat(K), np.tile(clab, N).ravel()
|
||||
new_index = MultiIndex(
|
||||
levels=levels,
|
||||
codes=codes,
|
||||
names=[frame.index.name, frame.columns.name],
|
||||
verify_integrity=False,
|
||||
)
|
||||
|
||||
new_values: ArrayLike
|
||||
if not frame.empty and frame._is_homogeneous_type:
|
||||
# For homogeneous EAs, frame._values will coerce to object. So
|
||||
# we concatenate instead.
|
||||
dtypes = list(frame.dtypes._values)
|
||||
dtype = dtypes[0]
|
||||
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
arr = dtype.construct_array_type()
|
||||
new_values = arr._concat_same_type(
|
||||
[col._values for _, col in frame.items()]
|
||||
)
|
||||
new_values = _reorder_for_extension_array_stack(new_values, N, K)
|
||||
else:
|
||||
# homogeneous, non-EA
|
||||
new_values = frame._values.ravel()
|
||||
|
||||
else:
|
||||
# non-homogeneous
|
||||
new_values = frame._values.ravel()
|
||||
|
||||
if dropna:
|
||||
mask = notna(new_values)
|
||||
new_values = new_values[mask]
|
||||
new_index = new_index[mask]
|
||||
|
||||
return frame._constructor_sliced(new_values, index=new_index)
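
# Illustrative sketch (not part of the pandas source): stack() is the inverse of
# unstack(), moving the (innermost) column level into the row index. The small
# DataFrame below is a made-up example; output is from the legacy implementation.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["x", "y"])
#   >>> df.stack()
#   a  x    1
#      y    2
#   b  x    3
#      y    4
#   dtype: int64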
|
||||
|
||||
|
||||
def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
|
||||
# If all passed levels match up to column names, no
|
||||
# ambiguity about what to do
|
||||
if all(lev in frame.columns.names for lev in level):
|
||||
result = frame
|
||||
for lev in level:
|
||||
result = stack(result, lev, dropna=dropna, sort=sort)
|
||||
|
||||
# Otherwise, level numbers may change as each successive level is stacked
|
||||
elif all(isinstance(lev, int) for lev in level):
|
||||
# As each stack is done, the level numbers decrease, so we need
|
||||
# to account for that when level is a sequence of ints
|
||||
result = frame
|
||||
# _get_level_number() checks level numbers are in range and converts
|
||||
# negative numbers to positive
|
||||
level = [frame.columns._get_level_number(lev) for lev in level]
|
||||
|
||||
while level:
|
||||
lev = level.pop(0)
|
||||
result = stack(result, lev, dropna=dropna, sort=sort)
|
||||
# Decrement all level numbers greater than current, as these
|
||||
# have now shifted down by one
|
||||
level = [v if v <= lev else v - 1 for v in level]
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"level should contain all level names or all level "
|
||||
"numbers, not a mixture of the two."
|
||||
)
|
||||
|
||||
return result
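
# Illustrative sketch (not part of the pandas source): stack_multiple stacks the
# requested column levels one after another, renumbering the remaining levels as
# it goes. Made-up example, legacy-implementation output:
#
#   >>> import pandas as pd
#   >>> cols = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
#   >>> df = pd.DataFrame([[1, 2, 3, 4]], columns=cols)
#   >>> df.stack([0, 1])  # both column levels end up in the row index
#   0  A  x    1
#         y    2
#      B  x    3
#         y    4
#   dtype: int64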
|
||||
|
||||
|
||||
def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
|
||||
"""Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
|
||||
if len(columns.levels) <= 2:
|
||||
return columns.levels[0]._rename(name=columns.names[0])
|
||||
|
||||
levs = [
|
||||
[lev[c] if c >= 0 else None for c in codes]
|
||||
for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
|
||||
]
|
||||
|
||||
# Remove duplicate tuples in the MultiIndex.
|
||||
tuples = zip(*levs)
|
||||
unique_tuples = (key for key, _ in itertools.groupby(tuples))
|
||||
new_levs = zip(*unique_tuples)
|
||||
|
||||
# The dtype of each level must be explicitly set to avoid inferring the wrong type.
|
||||
# See GH-36991.
|
||||
return MultiIndex.from_arrays(
|
||||
[
|
||||
# Not all indices can accept None values.
|
||||
Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
|
||||
for new_lev, lev in zip(new_levs, columns.levels)
|
||||
],
|
||||
names=columns.names[:-1],
|
||||
)
|
||||
|
||||
|
||||
def _stack_multi_columns(
|
||||
frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
|
||||
) -> DataFrame:
|
||||
def _convert_level_number(level_num: int, columns: Index):
|
||||
"""
|
||||
Logic for converting the level number to something we can safely pass
|
||||
to swaplevel.
|
||||
|
||||
If `level_num` matches a column name return the name from
|
||||
position `level_num`, otherwise return `level_num`.
|
||||
"""
|
||||
if level_num in columns.names:
|
||||
return columns.names[level_num]
|
||||
|
||||
return level_num
|
||||
|
||||
this = frame.copy(deep=False)
|
||||
mi_cols = this.columns # cast(MultiIndex, this.columns)
|
||||
assert isinstance(mi_cols, MultiIndex) # caller is responsible
|
||||
|
||||
# this makes life much simpler
|
||||
if level_num != mi_cols.nlevels - 1:
|
||||
# roll levels to put selected level at end
|
||||
roll_columns = mi_cols
|
||||
for i in range(level_num, mi_cols.nlevels - 1):
|
||||
# Need to check if the ints conflict with level names
|
||||
lev1 = _convert_level_number(i, roll_columns)
|
||||
lev2 = _convert_level_number(i + 1, roll_columns)
|
||||
roll_columns = roll_columns.swaplevel(lev1, lev2)
|
||||
this.columns = mi_cols = roll_columns
|
||||
|
||||
if not mi_cols._is_lexsorted() and sort:
|
||||
# Workaround the edge case where 0 is one of the column names,
|
||||
# which interferes with trying to sort based on the first
|
||||
# level
|
||||
level_to_sort = _convert_level_number(0, mi_cols)
|
||||
this = this.sort_index(level=level_to_sort, axis=1)
|
||||
mi_cols = this.columns
|
||||
|
||||
mi_cols = cast(MultiIndex, mi_cols)
|
||||
new_columns = _stack_multi_column_index(mi_cols)
|
||||
|
||||
# time to ravel the values
|
||||
new_data = {}
|
||||
level_vals = mi_cols.levels[-1]
|
||||
level_codes = unique(mi_cols.codes[-1])
|
||||
if sort:
|
||||
level_codes = np.sort(level_codes)
|
||||
level_vals_nan = level_vals.insert(len(level_vals), None)
|
||||
|
||||
level_vals_used = np.take(level_vals_nan, level_codes)
|
||||
levsize = len(level_codes)
|
||||
drop_cols = []
|
||||
for key in new_columns:
|
||||
try:
|
||||
loc = this.columns.get_loc(key)
|
||||
except KeyError:
|
||||
drop_cols.append(key)
|
||||
continue
|
||||
|
||||
# can make more efficient?
|
||||
# we almost always return a slice
|
||||
# but if unsorted can get a boolean
|
||||
# indexer
|
||||
if not isinstance(loc, slice):
|
||||
slice_len = len(loc)
|
||||
else:
|
||||
slice_len = loc.stop - loc.start
|
||||
|
||||
if slice_len != levsize:
|
||||
chunk = this.loc[:, this.columns[loc]]
|
||||
chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
|
||||
value_slice = chunk.reindex(columns=level_vals_used).values
|
||||
else:
|
||||
subset = this.iloc[:, loc]
|
||||
dtype = find_common_type(subset.dtypes.tolist())
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
# TODO(EA2D): won't need special case, can go through .values
|
||||
# paths below (might change to ._values)
|
||||
value_slice = dtype.construct_array_type()._concat_same_type(
|
||||
[x._values.astype(dtype, copy=False) for _, x in subset.items()]
|
||||
)
|
||||
N, K = subset.shape
|
||||
idx = np.arange(N * K).reshape(K, N).T.ravel()
|
||||
value_slice = value_slice.take(idx)
|
||||
else:
|
||||
value_slice = subset.values
|
||||
|
||||
if value_slice.ndim > 1:
|
||||
# i.e. not extension
|
||||
value_slice = value_slice.ravel()
|
||||
|
||||
new_data[key] = value_slice
|
||||
|
||||
if len(drop_cols) > 0:
|
||||
new_columns = new_columns.difference(drop_cols)
|
||||
|
||||
N = len(this)
|
||||
|
||||
if isinstance(this.index, MultiIndex):
|
||||
new_levels = list(this.index.levels)
|
||||
new_names = list(this.index.names)
|
||||
new_codes = [lab.repeat(levsize) for lab in this.index.codes]
|
||||
else:
|
||||
old_codes, old_levels = factorize_from_iterable(this.index)
|
||||
new_levels = [old_levels]
|
||||
new_codes = [old_codes.repeat(levsize)]
|
||||
new_names = [this.index.name] # something better?
|
||||
|
||||
new_levels.append(level_vals)
|
||||
new_codes.append(np.tile(level_codes, N))
|
||||
new_names.append(frame.columns.names[level_num])
|
||||
|
||||
new_index = MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
||||
|
||||
result = frame._constructor(new_data, index=new_index, columns=new_columns)
|
||||
|
||||
if frame.columns.nlevels > 1:
|
||||
desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
|
||||
if not result.columns.equals(desired_columns):
|
||||
result = result[desired_columns]
|
||||
|
||||
# more efficient way to go about this? can do the whole masking biz but
|
||||
# will only save a small amount of time...
|
||||
if dropna:
|
||||
result = result.dropna(axis=0, how="all")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _reorder_for_extension_array_stack(
|
||||
arr: ExtensionArray, n_rows: int, n_columns: int
|
||||
) -> ExtensionArray:
|
||||
"""
|
||||
Re-orders the values when stacking multiple extension-arrays.
|
||||
|
||||
The indirect stacking method used for EAs requires a followup
|
||||
take to get the order correct.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : ExtensionArray
|
||||
n_rows, n_columns : int
|
||||
The number of rows and columns in the original DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
taken : ExtensionArray
|
||||
The original `arr` with elements re-ordered appropriately
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
|
||||
>>> _reorder_for_extension_array_stack(arr, 2, 3)
|
||||
array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
|
||||
|
||||
>>> _reorder_for_extension_array_stack(arr, 3, 2)
|
||||
array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
|
||||
"""
|
||||
# final take to get the order correct.
|
||||
# idx is an indexer like
|
||||
# [c0r0, c1r0, c2r0, ...,
|
||||
# c0r1, c1r1, c2r1, ...]
|
||||
idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
|
||||
return arr.take(idx)
|
||||
|
||||
|
||||
def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
|
||||
if frame.columns.nunique() != len(frame.columns):
|
||||
raise ValueError("Columns with duplicate values are not supported in stack")
|
||||
|
||||
# If we need to drop `level` from columns, it needs to be in descending order
|
||||
drop_levnums = sorted(level, reverse=True)
|
||||
stack_cols = frame.columns._drop_level_numbers(
|
||||
[k for k in range(frame.columns.nlevels) if k not in level][::-1]
|
||||
)
|
||||
if len(level) > 1:
|
||||
# Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
|
||||
sorter = np.argsort(level)
|
||||
ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
|
||||
else:
|
||||
ordered_stack_cols = stack_cols
|
||||
|
||||
stack_cols_unique = stack_cols.unique()
|
||||
ordered_stack_cols_unique = ordered_stack_cols.unique()
|
||||
|
||||
# Grab data for each unique index to be stacked
|
||||
buf = []
|
||||
for idx in stack_cols_unique:
|
||||
if len(frame.columns) == 1:
|
||||
data = frame.copy()
|
||||
else:
|
||||
# Take the data from frame corresponding to this idx value
|
||||
if len(level) == 1:
|
||||
idx = (idx,)
|
||||
gen = iter(idx)
|
||||
column_indexer = tuple(
|
||||
next(gen) if k in level else slice(None)
|
||||
for k in range(frame.columns.nlevels)
|
||||
)
|
||||
data = frame.loc[:, column_indexer]
|
||||
|
||||
if len(level) < frame.columns.nlevels:
|
||||
data.columns = data.columns._drop_level_numbers(drop_levnums)
|
||||
elif stack_cols.nlevels == 1:
|
||||
if data.ndim == 1:
|
||||
data.name = 0
|
||||
else:
|
||||
data.columns = RangeIndex(len(data.columns))
|
||||
buf.append(data)
|
||||
|
||||
result: Series | DataFrame
|
||||
if len(buf) > 0 and not frame.empty:
|
||||
result = concat(buf)
|
||||
ratio = len(result) // len(frame)
|
||||
else:
|
||||
# input is empty
|
||||
if len(level) < frame.columns.nlevels:
|
||||
# concat column order may be different from dropping the levels
|
||||
new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
|
||||
else:
|
||||
new_columns = [0]
|
||||
result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
|
||||
ratio = 0
|
||||
|
||||
if len(level) < frame.columns.nlevels:
|
||||
# concat column order may be different from dropping the levels
|
||||
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
|
||||
if not result.columns.equals(desired_columns):
|
||||
result = result[desired_columns]
|
||||
|
||||
# Construct the correct MultiIndex by combining the frame's index and
|
||||
# stacked columns.
|
||||
index_levels: list | FrozenList
|
||||
if isinstance(frame.index, MultiIndex):
|
||||
index_levels = frame.index.levels
|
||||
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
|
||||
else:
|
||||
codes, uniques = factorize(frame.index, use_na_sentinel=False)
|
||||
index_levels = [uniques]
|
||||
index_codes = list(np.tile(codes, (1, ratio)))
|
||||
if isinstance(stack_cols, MultiIndex):
|
||||
column_levels = ordered_stack_cols.levels
|
||||
column_codes = ordered_stack_cols.drop_duplicates().codes
|
||||
else:
|
||||
column_levels = [ordered_stack_cols.unique()]
|
||||
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
|
||||
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
|
||||
result.index = MultiIndex(
|
||||
levels=index_levels + column_levels,
|
||||
codes=index_codes + column_codes,
|
||||
names=frame.index.names + list(ordered_stack_cols.names),
|
||||
verify_integrity=False,
|
||||
)
|
||||
|
||||
# sort result, but faster than calling sort_index since we know the order we need
|
||||
len_df = len(frame)
|
||||
n_uniques = len(ordered_stack_cols_unique)
|
||||
indexer = np.arange(n_uniques)
|
||||
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
|
||||
result = result.take(idxs)
|
||||
|
||||
# Reshape/rename if needed and dropna
|
||||
if result.ndim == 2 and frame.columns.nlevels == len(level):
|
||||
if len(result.columns) == 0:
|
||||
result = Series(index=result.index)
|
||||
else:
|
||||
result = result.iloc[:, 0]
|
||||
if result.ndim == 1:
|
||||
result.name = None
|
||||
|
||||
return result
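
# Illustrative sketch (not part of the pandas source): stack_v3 backs the
# behaviour selected with future_stack=True, which keeps rows that are all-NA
# instead of dropping them. Made-up example:
#
#   >>> import numpy as np, pandas as pd
#   >>> df = pd.DataFrame({"x": [1.0, np.nan]}, index=["a", "b"])
#   >>> df.stack(future_stack=True)
#   a  x    1.0
#   b  x    NaN
#   dtype: float64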
|
638
lib/python3.13/site-packages/pandas/core/reshape/tile.py
Normal file
@ -0,0 +1,638 @@
|
||||
"""
|
||||
Quantilization functions and related stuff
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Literal,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
lib,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
ensure_platform_int,
|
||||
is_bool_dtype,
|
||||
is_integer,
|
||||
is_list_like,
|
||||
is_numeric_dtype,
|
||||
is_scalar,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
ExtensionDtype,
|
||||
)
|
||||
from pandas.core.dtypes.generic import ABCSeries
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
Index,
|
||||
IntervalIndex,
|
||||
)
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.arrays.datetimelike import dtype_to_unit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
DtypeObj,
|
||||
IntervalLeftRight,
|
||||
)
|
||||
|
||||
|
||||
def cut(
|
||||
x,
|
||||
bins,
|
||||
right: bool = True,
|
||||
labels=None,
|
||||
retbins: bool = False,
|
||||
precision: int = 3,
|
||||
include_lowest: bool = False,
|
||||
duplicates: str = "raise",
|
||||
ordered: bool = True,
|
||||
):
|
||||
"""
|
||||
Bin values into discrete intervals.
|
||||
|
||||
Use `cut` when you need to segment and sort data values into bins. This
|
||||
function is also useful for going from a continuous variable to a
|
||||
categorical variable. For example, `cut` could convert ages to groups of
|
||||
age ranges. Supports binning into an equal number of bins, or a
|
||||
pre-specified array of bins.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array-like
|
||||
The input array to be binned. Must be 1-dimensional.
|
||||
bins : int, sequence of scalars, or IntervalIndex
|
||||
The criteria to bin by.
|
||||
|
||||
* int : Defines the number of equal-width bins in the range of `x`. The
|
||||
range of `x` is extended by .1% on each side to include the minimum
|
||||
and maximum values of `x`.
|
||||
* sequence of scalars : Defines the bin edges allowing for non-uniform
|
||||
width. No extension of the range of `x` is done.
|
||||
* IntervalIndex : Defines the exact bins to be used. Note that
|
||||
IntervalIndex for `bins` must be non-overlapping.
|
||||
|
||||
right : bool, default True
|
||||
Indicates whether `bins` includes the rightmost edge or not. If
|
||||
``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
|
||||
indicate (1,2], (2,3], (3,4]. This argument is ignored when
|
||||
`bins` is an IntervalIndex.
|
||||
labels : array or False, default None
|
||||
Specifies the labels for the returned bins. Must be the same length as
|
||||
the resulting bins. If False, returns only integer indicators of the
|
||||
bins. This affects the type of the output container (see below).
|
||||
This argument is ignored when `bins` is an IntervalIndex. If True,
|
||||
raises an error. When `ordered=False`, labels must be provided.
|
||||
retbins : bool, default False
|
||||
Whether to return the bins or not. Useful when bins is provided
|
||||
as a scalar.
|
||||
precision : int, default 3
|
||||
The precision at which to store and display the bins labels.
|
||||
include_lowest : bool, default False
|
||||
Whether the first interval should be left-inclusive or not.
|
||||
duplicates : {default 'raise', 'drop'}, optional
|
||||
If bin edges are not unique, raise ValueError or drop non-uniques.
|
||||
ordered : bool, default True
|
||||
Whether the labels are ordered or not. Applies to returned types
|
||||
Categorical and Series (with Categorical dtype). If True,
|
||||
the resulting categorical will be ordered. If False, the resulting
|
||||
categorical will be unordered (labels must be provided).
|
||||
|
||||
Returns
|
||||
-------
|
||||
out : Categorical, Series, or ndarray
|
||||
An array-like object representing the respective bin for each value
|
||||
of `x`. The type depends on the value of `labels`.
|
||||
|
||||
* None (default) : returns a Series for Series `x` or a
|
||||
Categorical for all other inputs. The values stored within
|
||||
are Interval dtype.
|
||||
|
||||
* sequence of scalars : returns a Series for Series `x` or a
|
||||
Categorical for all other inputs. The values stored within
|
||||
are whatever the type in the sequence is.
|
||||
|
||||
* False : returns an ndarray of integers.
|
||||
|
||||
bins : numpy.ndarray or IntervalIndex.
|
||||
The computed or specified bins. Only returned when `retbins=True`.
|
||||
For scalar or sequence `bins`, this is an ndarray with the computed
|
||||
        bins. If `duplicates='drop'` is set, non-unique bin edges are dropped. For
|
||||
an IntervalIndex `bins`, this is equal to `bins`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
qcut : Discretize variable into equal-sized buckets based on rank
|
||||
or based on sample quantiles.
|
||||
Categorical : Array type for storing data that come from a
|
||||
fixed set of values.
|
||||
Series : One-dimensional array with axis labels (including time series).
|
||||
IntervalIndex : Immutable Index implementing an ordered, sliceable set.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Any NA values will be NA in the result. Out of bounds values will be NA in
|
||||
the resulting Series or Categorical object.
|
||||
|
||||
Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Discretize into three equal-sized bins.
|
||||
|
||||
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
|
||||
... # doctest: +ELLIPSIS
|
||||
[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
|
||||
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
|
||||
|
||||
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
|
||||
... # doctest: +ELLIPSIS
|
||||
([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
|
||||
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
|
||||
array([0.994, 3. , 5. , 7. ]))
|
||||
|
||||
    Discover the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and that it is ordered.
|
||||
|
||||
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
|
||||
... 3, labels=["bad", "medium", "good"])
|
||||
['bad', 'good', 'medium', 'medium', 'good', 'bad']
|
||||
Categories (3, object): ['bad' < 'medium' < 'good']
|
||||
|
||||
``ordered=False`` will result in unordered categories when labels are passed.
|
||||
This parameter can be used to allow non-unique labels:
|
||||
|
||||
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
|
||||
... labels=["B", "A", "B"], ordered=False)
|
||||
['B', 'B', 'A', 'A', 'B', 'B']
|
||||
Categories (2, object): ['A', 'B']
|
||||
|
||||
``labels=False`` implies you just want the bins back.
|
||||
|
||||
>>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
|
||||
array([0, 1, 1, 3])
|
||||
|
||||
Passing a Series as an input returns a Series with categorical dtype:
|
||||
|
||||
>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
|
||||
... index=['a', 'b', 'c', 'd', 'e'])
|
||||
>>> pd.cut(s, 3)
|
||||
... # doctest: +ELLIPSIS
|
||||
a (1.992, 4.667]
|
||||
b (1.992, 4.667]
|
||||
c (4.667, 7.333]
|
||||
d (7.333, 10.0]
|
||||
e (7.333, 10.0]
|
||||
dtype: category
|
||||
Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...
|
||||
|
||||
    Passing a Series as an input returns a Series with the bin indicator for each
    value; this can be used to map values numerically to intervals based on bins.
|
||||
|
||||
>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
|
||||
... index=['a', 'b', 'c', 'd', 'e'])
|
||||
>>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
|
||||
... # doctest: +ELLIPSIS
|
||||
(a 1.0
|
||||
b 2.0
|
||||
c 3.0
|
||||
d 4.0
|
||||
e NaN
|
||||
dtype: float64,
|
||||
array([ 0, 2, 4, 6, 8, 10]))
|
||||
|
||||
    Use the `duplicates='drop'` option when bin edges are not unique.
|
||||
|
||||
>>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
|
||||
... right=False, duplicates='drop')
|
||||
... # doctest: +ELLIPSIS
|
||||
(a 1.0
|
||||
b 2.0
|
||||
c 3.0
|
||||
d 3.0
|
||||
e NaN
|
||||
dtype: float64,
|
||||
array([ 0, 2, 4, 6, 10]))
|
||||
|
||||
Passing an IntervalIndex for `bins` results in those categories exactly.
|
||||
Notice that values not covered by the IntervalIndex are set to NaN. 0
|
||||
is to the left of the first bin (which is closed on the right), and 1.5
|
||||
falls between two bins.
|
||||
|
||||
>>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
|
||||
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
|
||||
[NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
|
||||
Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
|
||||
"""
|
||||
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
|
||||
|
||||
original = x
|
||||
x_idx = _preprocess_for_cut(x)
|
||||
x_idx, _ = _coerce_to_type(x_idx)
|
||||
|
||||
if not np.iterable(bins):
|
||||
bins = _nbins_to_bins(x_idx, bins, right)
|
||||
|
||||
elif isinstance(bins, IntervalIndex):
|
||||
if bins.is_overlapping:
|
||||
raise ValueError("Overlapping IntervalIndex is not accepted.")
|
||||
|
||||
else:
|
||||
bins = Index(bins)
|
||||
if not bins.is_monotonic_increasing:
|
||||
raise ValueError("bins must increase monotonically.")
|
||||
|
||||
fac, bins = _bins_to_cuts(
|
||||
x_idx,
|
||||
bins,
|
||||
right=right,
|
||||
labels=labels,
|
||||
precision=precision,
|
||||
include_lowest=include_lowest,
|
||||
duplicates=duplicates,
|
||||
ordered=ordered,
|
||||
)
|
||||
|
||||
return _postprocess_for_cut(fac, bins, retbins, original)
|
||||
|
||||
|
||||
def qcut(
|
||||
x,
|
||||
q,
|
||||
labels=None,
|
||||
retbins: bool = False,
|
||||
precision: int = 3,
|
||||
duplicates: str = "raise",
|
||||
):
|
||||
"""
|
||||
Quantile-based discretization function.
|
||||
|
||||
Discretize variable into equal-sized buckets based on rank or based
|
||||
    on sample quantiles. For example, 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : 1d ndarray or Series
|
||||
q : int or list-like of float
|
||||
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
|
||||
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
|
||||
labels : array or False, default None
|
||||
Used as labels for the resulting bins. Must be of the same length as
|
||||
the resulting bins. If False, return only integer indicators of the
|
||||
bins. If True, raises an error.
|
||||
retbins : bool, optional
|
||||
Whether to return the (bins, labels) or not. Can be useful if bins
|
||||
is given as a scalar.
|
||||
precision : int, optional
|
||||
The precision at which to store and display the bins labels.
|
||||
duplicates : {default 'raise', 'drop'}, optional
|
||||
If bin edges are not unique, raise ValueError or drop non-uniques.
|
||||
|
||||
Returns
|
||||
-------
|
||||
out : Categorical or Series or array of integers if labels is False
|
||||
The return type (Categorical or Series) depends on the input: a Series
|
||||
of type category if input is a Series else Categorical. Bins are
|
||||
represented as categories when categorical data is returned.
|
||||
bins : ndarray of floats
|
||||
Returned only if `retbins` is True.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Out of bounds values will be NA in the resulting Categorical object
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> pd.qcut(range(5), 4)
|
||||
... # doctest: +ELLIPSIS
|
||||
[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
|
||||
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...
|
||||
|
||||
>>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
|
||||
... # doctest: +SKIP
|
||||
[good, good, medium, bad, bad]
|
||||
Categories (3, object): [good < medium < bad]
|
||||
|
||||
>>> pd.qcut(range(5), 4, labels=False)
|
||||
array([0, 0, 1, 2, 3])
|
||||
"""
|
||||
original = x
|
||||
x_idx = _preprocess_for_cut(x)
|
||||
x_idx, _ = _coerce_to_type(x_idx)
|
||||
|
||||
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
|
||||
|
||||
bins = x_idx.to_series().dropna().quantile(quantiles)
|
||||
|
||||
fac, bins = _bins_to_cuts(
|
||||
x_idx,
|
||||
Index(bins),
|
||||
labels=labels,
|
||||
precision=precision,
|
||||
include_lowest=True,
|
||||
duplicates=duplicates,
|
||||
)
|
||||
|
||||
return _postprocess_for_cut(fac, bins, retbins, original)
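
# Illustrative sketch (not part of the pandas source): qcut derives the bin edges
# from sample quantiles, so each bucket receives (roughly) the same number of
# observations. Made-up example:
#
#   >>> import pandas as pd
#   >>> pd.Series(pd.qcut(range(8), 4)).value_counts(sort=False).tolist()
#   [2, 2, 2, 2]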
|
||||
|
||||
|
||||
def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
|
||||
"""
|
||||
If a user passed an integer N for bins, convert this to a sequence of N
|
||||
equal(ish)-sized bins.
|
||||
"""
|
||||
if is_scalar(nbins) and nbins < 1:
|
||||
raise ValueError("`bins` should be a positive integer.")
|
||||
|
||||
if x_idx.size == 0:
|
||||
raise ValueError("Cannot cut empty array")
|
||||
|
||||
rng = (x_idx.min(), x_idx.max())
|
||||
mn, mx = rng
|
||||
|
||||
if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
|
||||
# GH#24314
|
||||
raise ValueError(
|
||||
"cannot specify integer `bins` when input data contains infinity"
|
||||
)
|
||||
|
||||
if mn == mx: # adjust end points before binning
|
||||
if _is_dt_or_td(x_idx.dtype):
|
||||
# using seconds=1 is pretty arbitrary here
|
||||
# error: Argument 1 to "dtype_to_unit" has incompatible type
|
||||
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
|
||||
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
|
||||
td = Timedelta(seconds=1).as_unit(unit)
|
||||
# Use DatetimeArray/TimedeltaArray method instead of linspace
|
||||
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
|
||||
# has no attribute "_generate_range"
|
||||
bins = x_idx._values._generate_range( # type: ignore[union-attr]
|
||||
start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
|
||||
)
|
||||
else:
|
||||
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
|
||||
mx += 0.001 * abs(mx) if mx != 0 else 0.001
|
||||
|
||||
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
|
||||
else: # adjust end points after binning
|
||||
if _is_dt_or_td(x_idx.dtype):
|
||||
# Use DatetimeArray/TimedeltaArray method instead of linspace
|
||||
|
||||
# error: Argument 1 to "dtype_to_unit" has incompatible type
|
||||
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
|
||||
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
|
||||
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
|
||||
# has no attribute "_generate_range"
|
||||
bins = x_idx._values._generate_range( # type: ignore[union-attr]
|
||||
start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
|
||||
)
|
||||
else:
|
||||
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
|
||||
adj = (mx - mn) * 0.001 # 0.1% of the range
|
||||
if right:
|
||||
bins[0] -= adj
|
||||
else:
|
||||
bins[-1] += adj
|
||||
|
||||
return Index(bins)
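
# Illustrative sketch (not part of the pandas source): for numeric data the
# equal-width path above is essentially a linspace over [min, max], with the
# open edge nudged by 0.1% of the range so the boundary value is included.
#
#   >>> import numpy as np
#   >>> mn, mx, nbins = 1.0, 7.0, 3
#   >>> edges = np.linspace(mn, mx, nbins + 1, endpoint=True)
#   >>> edges
#   array([1., 3., 5., 7.])
#   >>> edges[0] -= (mx - mn) * 0.001  # right=True: extend the first edge downward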
|
||||
|
||||
|
||||
def _bins_to_cuts(
|
||||
x_idx: Index,
|
||||
bins: Index,
|
||||
right: bool = True,
|
||||
labels=None,
|
||||
precision: int = 3,
|
||||
include_lowest: bool = False,
|
||||
duplicates: str = "raise",
|
||||
ordered: bool = True,
|
||||
):
|
||||
if not ordered and labels is None:
|
||||
raise ValueError("'labels' must be provided if 'ordered = False'")
|
||||
|
||||
if duplicates not in ["raise", "drop"]:
|
||||
raise ValueError(
|
||||
"invalid value for 'duplicates' parameter, valid options are: raise, drop"
|
||||
)
|
||||
|
||||
result: Categorical | np.ndarray
|
||||
|
||||
if isinstance(bins, IntervalIndex):
|
||||
# we have a fast-path here
|
||||
ids = bins.get_indexer(x_idx)
|
||||
cat_dtype = CategoricalDtype(bins, ordered=True)
|
||||
result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
|
||||
return result, bins
|
||||
|
||||
unique_bins = algos.unique(bins)
|
||||
if len(unique_bins) < len(bins) and len(bins) != 2:
|
||||
if duplicates == "raise":
|
||||
raise ValueError(
|
||||
f"Bin edges must be unique: {repr(bins)}.\n"
|
||||
f"You can drop duplicate edges by setting the 'duplicates' kwarg"
|
||||
)
|
||||
bins = unique_bins
|
||||
|
||||
side: Literal["left", "right"] = "left" if right else "right"
|
||||
|
||||
try:
|
||||
ids = bins.searchsorted(x_idx, side=side)
|
||||
except TypeError as err:
|
||||
# e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
|
||||
# is integers
|
||||
if x_idx.dtype.kind == "m":
|
||||
raise ValueError("bins must be of timedelta64 dtype") from err
|
||||
elif x_idx.dtype.kind == bins.dtype.kind == "M":
|
||||
raise ValueError(
|
||||
"Cannot use timezone-naive bins with timezone-aware values, "
|
||||
"or vice-versa"
|
||||
) from err
|
||||
elif x_idx.dtype.kind == "M":
|
||||
raise ValueError("bins must be of datetime64 dtype") from err
|
||||
else:
|
||||
raise
|
||||
ids = ensure_platform_int(ids)
|
||||
|
||||
if include_lowest:
|
||||
ids[x_idx == bins[0]] = 1
|
||||
|
||||
na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
|
||||
has_nas = na_mask.any()
|
||||
|
||||
if labels is not False:
|
||||
if not (labels is None or is_list_like(labels)):
|
||||
raise ValueError(
|
||||
"Bin labels must either be False, None or passed in as a "
|
||||
"list-like argument"
|
||||
)
|
||||
|
||||
if labels is None:
|
||||
labels = _format_labels(
|
||||
bins, precision, right=right, include_lowest=include_lowest
|
||||
)
|
||||
elif ordered and len(set(labels)) != len(labels):
|
||||
raise ValueError(
|
||||
"labels must be unique if ordered=True; pass ordered=False "
|
||||
"for duplicate labels"
|
||||
)
|
||||
else:
|
||||
if len(labels) != len(bins) - 1:
|
||||
raise ValueError(
|
||||
"Bin labels must be one fewer than the number of bin edges"
|
||||
)
|
||||
|
||||
if not isinstance(getattr(labels, "dtype", None), CategoricalDtype):
|
||||
labels = Categorical(
|
||||
labels,
|
||||
categories=labels if len(set(labels)) == len(labels) else None,
|
||||
ordered=ordered,
|
||||
)
|
||||
# TODO: handle mismatch between categorical label order and pandas.cut order.
|
||||
np.putmask(ids, na_mask, 0)
|
||||
result = algos.take_nd(labels, ids - 1)
|
||||
|
||||
else:
|
||||
result = ids - 1
|
||||
if has_nas:
|
||||
result = result.astype(np.float64)
|
||||
np.putmask(result, na_mask, np.nan)
|
||||
|
||||
return result, bins
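
# Illustrative sketch (not part of the pandas source): the core of the binning
# above is a searchsorted over the bin edges; ids of 0 or len(bins) mark values
# that fall outside the bins and are later masked as NA.
#
#   >>> import numpy as np
#   >>> bins = np.array([0.0, 2.0, 4.0, 6.0])
#   >>> x = np.array([1.0, 3.5, -1.0, 6.0])
#   >>> bins.searchsorted(x, side="left")  # right=True maps to side="left"
#   array([1, 2, 0, 3])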
|
||||
|
||||
|
||||
def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
|
||||
"""
|
||||
    If the passed data is of datetime/timedelta, bool or nullable int type,
    this method converts it to numeric so that the cut or qcut method can
    handle it.
|
||||
"""
|
||||
dtype: DtypeObj | None = None
|
||||
|
||||
if _is_dt_or_td(x.dtype):
|
||||
dtype = x.dtype
|
||||
elif is_bool_dtype(x.dtype):
|
||||
# GH 20303
|
||||
x = x.astype(np.int64)
|
||||
# To support cut and qcut for IntegerArray we convert to float dtype.
|
||||
# Will properly support in the future.
|
||||
# https://github.com/pandas-dev/pandas/pull/31290
|
||||
# https://github.com/pandas-dev/pandas/issues/31389
|
||||
elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype):
|
||||
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
|
||||
x = Index(x_arr)
|
||||
|
||||
return Index(x), dtype
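
# Illustrative sketch (not part of the pandas source): thanks to the coercion
# above, boolean (and nullable numeric) data can be binned directly. Made-up
# example with labels=False to show the raw bin indices:
#
#   >>> import pandas as pd
#   >>> pd.cut(pd.Series([True, False, True]), bins=2, labels=False).tolist()
#   [1, 0, 1]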
|
||||
|
||||
|
||||
def _is_dt_or_td(dtype: DtypeObj) -> bool:
|
||||
# Note: the dtype here comes from an Index.dtype, so we know that that any
|
||||
# dt64/td64 dtype is of a supported unit.
|
||||
return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")
|
||||
|
||||
|
||||
def _format_labels(
|
||||
bins: Index,
|
||||
precision: int,
|
||||
right: bool = True,
|
||||
include_lowest: bool = False,
|
||||
):
|
||||
"""based on the dtype, return our labels"""
|
||||
closed: IntervalLeftRight = "right" if right else "left"
|
||||
|
||||
formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
|
||||
|
||||
if _is_dt_or_td(bins.dtype):
|
||||
# error: Argument 1 to "dtype_to_unit" has incompatible type
|
||||
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
|
||||
unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type]
|
||||
formatter = lambda x: x
|
||||
adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
|
||||
else:
|
||||
precision = _infer_precision(precision, bins)
|
||||
formatter = lambda x: _round_frac(x, precision)
|
||||
adjust = lambda x: x - 10 ** (-precision)
|
||||
|
||||
breaks = [formatter(b) for b in bins]
|
||||
if right and include_lowest:
|
||||
# adjust lhs of first interval by precision to account for being right closed
|
||||
breaks[0] = adjust(breaks[0])
|
||||
|
||||
if _is_dt_or_td(bins.dtype):
|
||||
# error: "Index" has no attribute "as_unit"
|
||||
breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined]
|
||||
|
||||
return IntervalIndex.from_breaks(breaks, closed=closed)
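
# Illustrative sketch (not part of the pandas source): the formatted breaks are
# turned into interval labels via IntervalIndex.from_breaks. Exact repr may vary
# by pandas version.
#
#   >>> import pandas as pd
#   >>> pd.IntervalIndex.from_breaks([0.994, 3.0, 5.0, 7.0], closed="right")
#   IntervalIndex([(0.994, 3.0], (3.0, 5.0], (5.0, 7.0]], dtype='interval[float64, right]')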
|
||||
|
||||
|
||||
def _preprocess_for_cut(x) -> Index:
|
||||
"""
|
||||
    Handle preprocessing for cut: convert the passed input to an array,
    stripping the index information so it can be stored separately.
|
||||
"""
|
||||
# Check that the passed array is a Pandas or Numpy object
|
||||
# We don't want to strip away a Pandas data-type here (e.g. datetimetz)
|
||||
ndim = getattr(x, "ndim", None)
|
||||
if ndim is None:
|
||||
x = np.asarray(x)
|
||||
if x.ndim != 1:
|
||||
raise ValueError("Input array must be 1 dimensional")
|
||||
|
||||
return Index(x)
|
||||
|
||||
|
||||
def _postprocess_for_cut(fac, bins, retbins: bool, original):
|
||||
"""
|
||||
    Handle post-processing for the cut method: re-attach the index
    information if the originally passed data was a Series.
|
||||
"""
|
||||
if isinstance(original, ABCSeries):
|
||||
fac = original._constructor(fac, index=original.index, name=original.name)
|
||||
|
||||
if not retbins:
|
||||
return fac
|
||||
|
||||
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
|
||||
bins = bins._values
|
||||
|
||||
return fac, bins
|
||||
|
||||
|
||||
def _round_frac(x, precision: int):
|
||||
"""
|
||||
Round the fractional part of the given number
|
||||
"""
|
||||
if not np.isfinite(x) or x == 0:
|
||||
return x
|
||||
else:
|
||||
frac, whole = np.modf(x)
|
||||
if whole == 0:
|
||||
digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
|
||||
else:
|
||||
digits = precision
|
||||
return np.around(x, digits)
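
# Illustrative sketch (not part of the pandas source): _round_frac keeps
# `precision` significant digits of the fractional part when the whole part is
# zero, and uses plain decimal rounding otherwise.
#
#   >>> _round_frac(0.000123456, precision=3)
#   0.000123
#   >>> _round_frac(12.3456, precision=3)
#   12.346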
|
||||
|
||||
|
||||
def _infer_precision(base_precision: int, bins: Index) -> int:
|
||||
"""
|
||||
Infer an appropriate precision for _round_frac
|
||||
"""
|
||||
for precision in range(base_precision, 20):
|
||||
levels = np.asarray([_round_frac(b, precision) for b in bins])
|
||||
if algos.unique(levels).size == bins.size:
|
||||
return precision
|
||||
return base_precision # default
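
# Illustrative sketch (not part of the pandas source): the precision is raised
# until rounding no longer collapses two bin edges into the same value.
#
#   >>> import pandas as pd
#   >>> _infer_precision(2, pd.Index([0.123, 0.124]))  # 2 digits would give 0.12 twice
#   3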
|
85
lib/python3.13/site-packages/pandas/core/reshape/util.py
Normal file
@ -0,0 +1,85 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import NumpyIndexT
|
||||
|
||||
|
||||
def cartesian_product(X) -> list[np.ndarray]:
|
||||
"""
|
||||
Numpy version of itertools.product.
|
||||
Sometimes faster (for large inputs)...
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : list-like of list-likes
|
||||
|
||||
Returns
|
||||
-------
|
||||
product : list of ndarrays
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> cartesian_product([list('ABC'), [1, 2]])
|
||||
[array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]
|
||||
|
||||
See Also
|
||||
--------
|
||||
itertools.product : Cartesian product of input iterables. Equivalent to
|
||||
nested for-loops.
|
||||
"""
|
||||
msg = "Input must be a list-like of list-likes"
|
||||
if not is_list_like(X):
|
||||
raise TypeError(msg)
|
||||
for x in X:
|
||||
if not is_list_like(x):
|
||||
raise TypeError(msg)
|
||||
|
||||
if len(X) == 0:
|
||||
return []
|
||||
|
||||
lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
|
||||
cumprodX = np.cumprod(lenX)
|
||||
|
||||
if np.any(cumprodX < 0):
|
||||
raise ValueError("Product space too large to allocate arrays!")
|
||||
|
||||
a = np.roll(cumprodX, 1)
|
||||
a[0] = 1
|
||||
|
||||
if cumprodX[-1] != 0:
|
||||
b = cumprodX[-1] / cumprodX
|
||||
else:
|
||||
# if any factor is empty, the cartesian product is empty
|
||||
b = np.zeros_like(cumprodX)
|
||||
|
||||
# error: Argument of type "int_" cannot be assigned to parameter "num" of
|
||||
# type "int" in function "tile_compat"
|
||||
return [
|
||||
tile_compat(
|
||||
np.repeat(x, b[i]),
|
||||
np.prod(a[i]),
|
||||
)
|
||||
for i, x in enumerate(X)
|
||||
]
|
||||
|
||||
|
||||
def tile_compat(arr: NumpyIndexT, num: int) -> NumpyIndexT:
|
||||
"""
|
||||
Index compat for np.tile.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Does not support multi-dimensional `num`.
|
||||
"""
|
||||
if isinstance(arr, np.ndarray):
|
||||
return np.tile(arr, num)
|
||||
|
||||
# Otherwise we have an Index
|
||||
taker = np.tile(np.arange(len(arr)), num)
|
||||
return arr.take(taker)
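
# Illustrative sketch (not part of the pandas source): tile_compat behaves like
# np.tile but also supports Index objects by translating the tiling into take().
#
#   >>> import numpy as np, pandas as pd
#   >>> tile_compat(np.array([1, 2]), 3)
#   array([1, 2, 1, 2, 1, 2])
#   >>> tile_compat(pd.Index(["a", "b"]), 2)
#   Index(['a', 'b', 'a', 'b'], dtype='object')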