Updated script that can be controlled by a Node.js web app

mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@@ -0,0 +1,41 @@
from pandas.core.reshape.concat import concat
from pandas.core.reshape.encoding import (
from_dummies,
get_dummies,
)
from pandas.core.reshape.melt import (
lreshape,
melt,
wide_to_long,
)
from pandas.core.reshape.merge import (
merge,
merge_asof,
merge_ordered,
)
from pandas.core.reshape.pivot import (
crosstab,
pivot,
pivot_table,
)
from pandas.core.reshape.tile import (
cut,
qcut,
)
__all__ = [
"concat",
"crosstab",
"cut",
"from_dummies",
"get_dummies",
"lreshape",
"melt",
"merge",
"merge_asof",
"merge_ordered",
"pivot",
"pivot_table",
"qcut",
"wide_to_long",
]
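# A minimal usage sketch, assuming the usual top-level re-exports of the names
# collected in __all__ above; the small frame is illustrative only.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
# >>> pd.get_dummies(df["key"]).columns.tolist()
# ['a', 'b']
# >>> pd.melt(df, id_vars=["key"]).shape
# (3, 3)
# >>> pd.concat([df, df], ignore_index=True).shape
# (6, 2)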

View File

@@ -0,0 +1,894 @@
"""
Concat routines.
"""
from __future__ import annotations
from collections import abc
from typing import (
TYPE_CHECKING,
Callable,
Literal,
cast,
overload,
)
import warnings
import numpy as np
from pandas._config import using_copy_on_write
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
is_bool,
is_iterator,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.missing import isna
from pandas.core.arrays.categorical import (
factorize_from_iterable,
factorize_from_iterables,
)
import pandas.core.common as com
from pandas.core.indexes.api import (
Index,
MultiIndex,
all_indexes_same,
default_index,
ensure_index,
get_objs_combined_axis,
get_unanimous_names,
)
from pandas.core.internals import concatenate_managers
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterable,
Mapping,
)
from pandas._typing import (
Axis,
AxisInt,
HashableT,
)
from pandas import (
DataFrame,
Series,
)
# ---------------------------------------------------------------------
# Concatenate DataFrame objects
@overload
def concat(
objs: Iterable[DataFrame] | Mapping[HashableT, DataFrame],
*,
axis: Literal[0, "index"] = ...,
join: str = ...,
ignore_index: bool = ...,
keys: Iterable[Hashable] | None = ...,
levels=...,
names: list[HashableT] | None = ...,
verify_integrity: bool = ...,
sort: bool = ...,
copy: bool | None = ...,
) -> DataFrame:
...
@overload
def concat(
objs: Iterable[Series] | Mapping[HashableT, Series],
*,
axis: Literal[0, "index"] = ...,
join: str = ...,
ignore_index: bool = ...,
keys: Iterable[Hashable] | None = ...,
levels=...,
names: list[HashableT] | None = ...,
verify_integrity: bool = ...,
sort: bool = ...,
copy: bool | None = ...,
) -> Series:
...
@overload
def concat(
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Literal[0, "index"] = ...,
join: str = ...,
ignore_index: bool = ...,
keys: Iterable[Hashable] | None = ...,
levels=...,
names: list[HashableT] | None = ...,
verify_integrity: bool = ...,
sort: bool = ...,
copy: bool | None = ...,
) -> DataFrame | Series:
...
@overload
def concat(
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Literal[1, "columns"],
join: str = ...,
ignore_index: bool = ...,
keys: Iterable[Hashable] | None = ...,
levels=...,
names: list[HashableT] | None = ...,
verify_integrity: bool = ...,
sort: bool = ...,
copy: bool | None = ...,
) -> DataFrame:
...
@overload
def concat(
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Axis = ...,
join: str = ...,
ignore_index: bool = ...,
keys: Iterable[Hashable] | None = ...,
levels=...,
names: list[HashableT] | None = ...,
verify_integrity: bool = ...,
sort: bool = ...,
copy: bool | None = ...,
) -> DataFrame | Series:
...
def concat(
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Axis = 0,
join: str = "outer",
ignore_index: bool = False,
keys: Iterable[Hashable] | None = None,
levels=None,
names: list[HashableT] | None = None,
verify_integrity: bool = False,
sort: bool = False,
copy: bool | None = None,
) -> DataFrame | Series:
"""
Concatenate pandas objects along a particular axis.
Allows optional set logic along the other axes.
Can also add a layer of hierarchical indexing on the concatenation axis,
which may be useful if the labels are the same (or overlapping) on
the passed axis number.
Parameters
----------
objs : a sequence or mapping of Series or DataFrame objects
If a mapping is passed, its sorted keys will be used as the `keys`
argument, unless `keys` is passed explicitly, in which case only the values
for those keys will be selected (see below). Any None objects will be dropped
silently unless they are all None, in which case a ValueError will be raised.
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
If True, do not use the index values along the concatenation axis. The
resulting axis will be labeled 0, ..., n - 1. This is useful if you are
concatenating objects where the concatenation axis does not have
meaningful indexing information. Note the index values on the other
axes are still respected in the join.
keys : sequence, default None
If multiple levels passed, should contain tuples. Construct
hierarchical index using the passed keys as the outermost level.
levels : list of sequences, default None
Specific levels (unique values) to use for constructing a
MultiIndex. Otherwise they will be inferred from the keys.
names : list, default None
Names for the levels in the resulting hierarchical index.
verify_integrity : bool, default False
Check whether the new concatenated axis contains duplicates. This can
be very expensive relative to the actual data concatenation.
sort : bool, default False
Sort non-concatenation axis if it is not already aligned. One exception to
this is when the non-concatenation axis is a DatetimeIndex and join='outer'
and the axis is not already aligned. In that case, the non-concatenation
axis is always sorted lexicographically.
copy : bool, default True
If False, do not copy data unnecessarily.
Returns
-------
object, type of objs
When concatenating all ``Series`` along the index (axis=0), a
``Series`` is returned. When ``objs`` contains at least one
``DataFrame``, a ``DataFrame`` is returned. When concatenating along
the columns (axis=1), a ``DataFrame`` is returned.
See Also
--------
DataFrame.join : Join DataFrames using indexes.
DataFrame.merge : Merge DataFrames by indexes or columns.
Notes
-----
The keys, levels, and names arguments are all optional.
A walkthrough of how this method fits in with other tools for combining
pandas objects can be found `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
It is not recommended to build DataFrames by adding single rows in a
for loop. Build a list of rows and make a DataFrame in a single concat.
Examples
--------
Combine two ``Series``.
>>> s1 = pd.Series(['a', 'b'])
>>> s2 = pd.Series(['c', 'd'])
>>> pd.concat([s1, s2])
0 a
1 b
0 c
1 d
dtype: object
Clear the existing index and reset it in the result
by setting the ``ignore_index`` option to ``True``.
>>> pd.concat([s1, s2], ignore_index=True)
0 a
1 b
2 c
3 d
dtype: object
Add a hierarchical index at the outermost level of
the data with the ``keys`` option.
>>> pd.concat([s1, s2], keys=['s1', 's2'])
s1 0 a
1 b
s2 0 c
1 d
dtype: object
Label the index keys you create with the ``names`` option.
>>> pd.concat([s1, s2], keys=['s1', 's2'],
... names=['Series name', 'Row ID'])
Series name Row ID
s1 0 a
1 b
s2 0 c
1 d
dtype: object
Combine two ``DataFrame`` objects with identical columns.
>>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
... columns=['letter', 'number'])
>>> df1
letter number
0 a 1
1 b 2
>>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
... columns=['letter', 'number'])
>>> df2
letter number
0 c 3
1 d 4
>>> pd.concat([df1, df2])
letter number
0 a 1
1 b 2
0 c 3
1 d 4
Combine ``DataFrame`` objects with overlapping columns
and return everything. Columns outside the intersection will
be filled with ``NaN`` values.
>>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
... columns=['letter', 'number', 'animal'])
>>> df3
letter number animal
0 c 3 cat
1 d 4 dog
>>> pd.concat([df1, df3], sort=False)
letter number animal
0 a 1 NaN
1 b 2 NaN
0 c 3 cat
1 d 4 dog
Combine ``DataFrame`` objects with overlapping columns
and return only those that are shared by passing ``inner`` to
the ``join`` keyword argument.
>>> pd.concat([df1, df3], join="inner")
letter number
0 a 1
1 b 2
0 c 3
1 d 4
Combine ``DataFrame`` objects horizontally along the x axis by
passing in ``axis=1``.
>>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
... columns=['animal', 'name'])
>>> pd.concat([df1, df4], axis=1)
letter number animal name
0 a 1 bird polly
1 b 2 monkey george
Prevent the result from including duplicate index values with the
``verify_integrity`` option.
>>> df5 = pd.DataFrame([1], index=['a'])
>>> df5
0
a 1
>>> df6 = pd.DataFrame([2], index=['a'])
>>> df6
0
a 2
>>> pd.concat([df5, df6], verify_integrity=True)
Traceback (most recent call last):
...
ValueError: Indexes have overlapping values: ['a']
Append a single row to the end of a ``DataFrame`` object.
>>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
>>> df7
a b
0 1 2
>>> new_row = pd.Series({'a': 3, 'b': 4})
>>> new_row
a 3
b 4
dtype: int64
>>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
a b
0 1 2
1 3 4
"""
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True
elif copy and using_copy_on_write():
copy = False
op = _Concatenator(
objs,
axis=axis,
ignore_index=ignore_index,
join=join,
keys=keys,
levels=levels,
names=names,
verify_integrity=verify_integrity,
copy=copy,
sort=sort,
)
return op.get_result()
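# A minimal sketch of how the overloads above resolve in practice, using the
# public pd.concat entry point with illustrative data:
# >>> s1, s2 = pd.Series([1, 2]), pd.Series([3, 4])
# >>> type(pd.concat([s1, s2])).__name__          # all Series along the index
# 'Series'
# >>> type(pd.concat([s1, s2], axis=1)).__name__  # along the columns
# 'DataFrame'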
class _Concatenator:
"""
Orchestrates a concatenation operation for BlockManagers
"""
sort: bool
def __init__(
self,
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
axis: Axis = 0,
join: str = "outer",
keys: Iterable[Hashable] | None = None,
levels=None,
names: list[HashableT] | None = None,
ignore_index: bool = False,
verify_integrity: bool = False,
copy: bool = True,
sort: bool = False,
) -> None:
if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
raise TypeError(
"first argument must be an iterable of pandas "
f'objects, you passed an object of type "{type(objs).__name__}"'
)
if join == "outer":
self.intersect = False
elif join == "inner":
self.intersect = True
else: # pragma: no cover
raise ValueError(
"Only can inner (intersect) or outer (union) join the other axis"
)
if not is_bool(sort):
raise ValueError(
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
)
# Incompatible types in assignment (expression has type "Union[bool, bool_]",
# variable has type "bool")
self.sort = sort # type: ignore[assignment]
self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
self.copy = copy
objs, keys = self._clean_keys_and_objs(objs, keys)
# figure out what our result ndim is going to be
ndims = self._get_ndims(objs)
sample, objs = self._get_sample_object(objs, ndims, keys, names, levels)
# Standardize axis parameter to int
if sample.ndim == 1:
from pandas import DataFrame
axis = DataFrame._get_axis_number(axis)
self._is_frame = False
self._is_series = True
else:
axis = sample._get_axis_number(axis)
self._is_frame = True
self._is_series = False
# Need to flip BlockManager axis in the DataFrame special case
axis = sample._get_block_manager_axis(axis)
# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
if len(ndims) > 1:
objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis)
self.objs = objs
# note: this is the BlockManager axis (since DataFrame is transposed)
self.bm_axis = axis
self.axis = 1 - self.bm_axis if self._is_frame else 0
self.keys = keys
self.names = names or getattr(keys, "names", None)
self.levels = levels
def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]:
# figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
"only Series and DataFrame objs are valid"
)
raise TypeError(msg)
ndims.add(obj.ndim)
return ndims
def _clean_keys_and_objs(
self,
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
keys,
) -> tuple[list[Series | DataFrame], Index | None]:
if isinstance(objs, abc.Mapping):
if keys is None:
keys = list(objs.keys())
objs_list = [objs[k] for k in keys]
else:
objs_list = list(objs)
if len(objs_list) == 0:
raise ValueError("No objects to concatenate")
if keys is None:
objs_list = list(com.not_none(*objs_list))
else:
# GH#1649
clean_keys = []
clean_objs = []
if is_iterator(keys):
keys = list(keys)
if len(keys) != len(objs_list):
# GH#43485
warnings.warn(
"The behavior of pd.concat with len(keys) != len(objs) is "
"deprecated. In a future version this will raise instead of "
"truncating to the smaller of the two sequences",
FutureWarning,
stacklevel=find_stack_level(),
)
for k, v in zip(keys, objs_list):
if v is None:
continue
clean_keys.append(k)
clean_objs.append(v)
objs_list = clean_objs
if isinstance(keys, MultiIndex):
# TODO: retain levels?
keys = type(keys).from_tuples(clean_keys, names=keys.names)
else:
name = getattr(keys, "name", None)
keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
if len(objs_list) == 0:
raise ValueError("All objects passed were None")
return objs_list, keys
def _get_sample_object(
self,
objs: list[Series | DataFrame],
ndims: set[int],
keys,
names,
levels,
) -> tuple[Series | DataFrame, list[Series | DataFrame]]:
# get the sample
# want the highest ndim that we have, and must be non-empty
# unless all objs are empty
sample: Series | DataFrame | None = None
if len(ndims) > 1:
max_ndim = max(ndims)
for obj in objs:
if obj.ndim == max_ndim and np.sum(obj.shape):
sample = obj
break
else:
# filter out the empties if we have no multi-index possibilities
# note: keep empty Series, as they affect the result columns / name
non_empties = [obj for obj in objs if sum(obj.shape) > 0 or obj.ndim == 1]
if len(non_empties) and (
keys is None and names is None and levels is None and not self.intersect
):
objs = non_empties
sample = objs[0]
if sample is None:
sample = objs[0]
return sample, objs
def _sanitize_mixed_ndim(
self,
objs: list[Series | DataFrame],
sample: Series | DataFrame,
ignore_index: bool,
axis: AxisInt,
) -> list[Series | DataFrame]:
# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
new_objs = []
current_column = 0
max_ndim = sample.ndim
for obj in objs:
ndim = obj.ndim
if ndim == max_ndim:
pass
elif ndim != max_ndim - 1:
raise ValueError(
"cannot concatenate unaligned mixed dimensional NDFrame objects"
)
else:
name = getattr(obj, "name", None)
if ignore_index or name is None:
if axis == 1:
# doing a row-wise concatenation so need everything
# to line up
name = 0
else:
# doing a column-wise concatenation so need series
# to have unique names
name = current_column
current_column += 1
obj = sample._constructor({name: obj}, copy=False)
new_objs.append(obj)
return new_objs
def get_result(self):
cons: Callable[..., DataFrame | Series]
sample: DataFrame | Series
# series only
if self._is_series:
sample = cast("Series", self.objs[0])
# stack blocks
if self.bm_axis == 0:
name = com.consensus_name_attr(self.objs)
cons = sample._constructor
arrs = [ser._values for ser in self.objs]
res = concat_compat(arrs, axis=0)
new_index: Index
if self.ignore_index:
# We can avoid surprisingly-expensive _get_concat_axis
new_index = default_index(len(res))
else:
new_index = self.new_axes[0]
mgr = type(sample._mgr).from_array(res, index=new_index)
result = sample._constructor_from_mgr(mgr, axes=mgr.axes)
result._name = name
return result.__finalize__(self, method="concat")
# combine as columns in a frame
else:
data = dict(zip(range(len(self.objs)), self.objs))
# GH28330 Preserves subclassed objects through concat
cons = sample._constructor_expanddim
index, columns = self.new_axes
df = cons(data, index=index, copy=self.copy)
df.columns = columns
return df.__finalize__(self, method="concat")
# combine block managers
else:
sample = cast("DataFrame", self.objs[0])
mgrs_indexers = []
for obj in self.objs:
indexers = {}
for ax, new_labels in enumerate(self.new_axes):
# ::-1 to convert BlockManager ax to DataFrame ax
if ax == self.bm_axis:
# Suppress reindexing on concat axis
continue
# 1-ax to convert BlockManager axis to DataFrame axis
obj_labels = obj.axes[1 - ax]
if not new_labels.equals(obj_labels):
indexers[ax] = obj_labels.get_indexer(new_labels)
mgrs_indexers.append((obj._mgr, indexers))
new_data = concatenate_managers(
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
)
if not self.copy and not using_copy_on_write():
new_data._consolidate_inplace()
out = sample._constructor_from_mgr(new_data, axes=new_data.axes)
return out.__finalize__(self, method="concat")
def _get_result_dim(self) -> int:
if self._is_series and self.bm_axis == 1:
return 2
else:
return self.objs[0].ndim
@cache_readonly
def new_axes(self) -> list[Index]:
ndim = self._get_result_dim()
return [
self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
for i in range(ndim)
]
def _get_comb_axis(self, i: AxisInt) -> Index:
data_axis = self.objs[0]._get_block_manager_axis(i)
return get_objs_combined_axis(
self.objs,
axis=data_axis,
intersect=self.intersect,
sort=self.sort,
copy=self.copy,
)
@cache_readonly
def _get_concat_axis(self) -> Index:
"""
Return index to be used along concatenation axis.
"""
if self._is_series:
if self.bm_axis == 0:
indexes = [x.index for x in self.objs]
elif self.ignore_index:
idx = default_index(len(self.objs))
return idx
elif self.keys is None:
names: list[Hashable] = [None] * len(self.objs)
num = 0
has_names = False
for i, x in enumerate(self.objs):
if x.ndim != 1:
raise TypeError(
f"Cannot concatenate type 'Series' with "
f"object of type '{type(x).__name__}'"
)
if x.name is not None:
names[i] = x.name
has_names = True
else:
names[i] = num
num += 1
if has_names:
return Index(names)
else:
return default_index(len(self.objs))
else:
return ensure_index(self.keys).set_names(self.names)
else:
indexes = [x.axes[self.axis] for x in self.objs]
if self.ignore_index:
idx = default_index(sum(len(i) for i in indexes))
return idx
if self.keys is None:
if self.levels is not None:
raise ValueError("levels supported only when keys is not None")
concat_axis = _concat_indexes(indexes)
else:
concat_axis = _make_concat_multiindex(
indexes, self.keys, self.levels, self.names
)
self._maybe_check_integrity(concat_axis)
return concat_axis
def _maybe_check_integrity(self, concat_index: Index):
if self.verify_integrity:
if not concat_index.is_unique:
overlap = concat_index[concat_index.duplicated()].unique()
raise ValueError(f"Indexes have overlapping values: {overlap}")
def _concat_indexes(indexes) -> Index:
return indexes[0].append(indexes[1:])
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
if (levels is None and isinstance(keys[0], tuple)) or (
levels is not None and len(levels) > 1
):
zipped = list(zip(*keys))
if names is None:
names = [None] * len(zipped)
if levels is None:
_, levels = factorize_from_iterables(zipped)
else:
levels = [ensure_index(x) for x in levels]
else:
zipped = [keys]
if names is None:
names = [None]
if levels is None:
levels = [ensure_index(keys).unique()]
else:
levels = [ensure_index(x) for x in levels]
for level in levels:
if not level.is_unique:
raise ValueError(f"Level values not unique: {level.tolist()}")
if not all_indexes_same(indexes) or not all(level.is_unique for level in levels):
codes_list = []
# things are potentially different sizes, so compute the exact codes
# for each level and pass those to MultiIndex.from_arrays
for hlevel, level in zip(zipped, levels):
to_concat = []
if isinstance(hlevel, Index) and hlevel.equals(level):
lens = [len(idx) for idx in indexes]
codes_list.append(np.repeat(np.arange(len(hlevel)), lens))
else:
for key, index in zip(hlevel, indexes):
# Find matching codes, include matching nan values as equal.
mask = (isna(level) & isna(key)) | (level == key)
if not mask.any():
raise ValueError(f"Key {key} not in level {level}")
i = np.nonzero(mask)[0][0]
to_concat.append(np.repeat(i, len(index)))
codes_list.append(np.concatenate(to_concat))
concat_index = _concat_indexes(indexes)
# these go at the end
if isinstance(concat_index, MultiIndex):
levels.extend(concat_index.levels)
codes_list.extend(concat_index.codes)
else:
codes, categories = factorize_from_iterable(concat_index)
levels.append(categories)
codes_list.append(codes)
if len(names) == len(levels):
names = list(names)
else:
# make sure that all of the passed indices have the same nlevels
if not len({idx.nlevels for idx in indexes}) == 1:
raise AssertionError(
"Cannot concat indices that do not have the same number of levels"
)
# also copies
names = list(names) + list(get_unanimous_names(*indexes))
return MultiIndex(
levels=levels, codes=codes_list, names=names, verify_integrity=False
)
new_index = indexes[0]
n = len(new_index)
kpieces = len(indexes)
# also copies
new_names = list(names)
new_levels = list(levels)
# construct codes
new_codes = []
# do something a bit more speedy
for hlevel, level in zip(zipped, levels):
hlevel_index = ensure_index(hlevel)
mapped = level.get_indexer(hlevel_index)
mask = mapped == -1
if mask.any():
raise ValueError(
f"Values not found in passed level: {hlevel_index[mask]!s}"
)
new_codes.append(np.repeat(mapped, n))
if isinstance(new_index, MultiIndex):
new_levels.extend(new_index.levels)
new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
else:
new_levels.append(new_index.unique())
single_codes = new_index.unique().get_indexer(new_index)
new_codes.append(np.tile(single_codes, kpieces))
if len(new_names) < len(new_levels):
new_names.extend(new_index.names)
return MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
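# A minimal sketch of the mixed-ndim handling in _sanitize_mixed_ndim above,
# with illustrative data: an unnamed Series is upgraded to a column labeled 0,
# while a named Series keeps its name.
# >>> df = pd.DataFrame({"x": [1, 2]})
# >>> pd.concat([df, pd.Series([3, 4])], axis=1).columns.tolist()
# ['x', 0]
# >>> pd.concat([df, pd.Series([3, 4], name="y")], axis=1).columns.tolist()
# ['x', 'y']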

View File

@@ -0,0 +1,570 @@
from __future__ import annotations
from collections import defaultdict
from collections.abc import (
Hashable,
Iterable,
)
import itertools
from typing import (
TYPE_CHECKING,
cast,
)
import numpy as np
from pandas._libs.sparse import IntIndex
from pandas.core.dtypes.common import (
is_integer_dtype,
is_list_like,
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
)
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.arrays.string_ import StringDtype
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
Index,
default_index,
)
from pandas.core.series import Series
if TYPE_CHECKING:
from pandas._typing import NpDtype
def get_dummies(
data,
prefix=None,
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
dummy_na: bool = False,
columns=None,
sparse: bool = False,
drop_first: bool = False,
dtype: NpDtype | None = None,
) -> DataFrame:
"""
Convert categorical variable into dummy/indicator variables.
Each variable is converted into as many 0/1 variables as there are different
values. Columns in the output are each named after a value; if the input is
a DataFrame, the name of the original variable is prepended to the value.
Parameters
----------
data : array-like, Series, or DataFrame
Data of which to get dummy indicators.
prefix : str, list of str, or dict of str, default None
String to prepend to the generated dummy column names.
Pass a list with length equal to the number of columns
when calling get_dummies on a DataFrame. Alternatively, `prefix`
can be a dictionary mapping column names to prefixes.
prefix_sep : str, default '_'
If appending prefix, separator/delimiter to use. Or pass a
list or dictionary as with `prefix`.
dummy_na : bool, default False
Add a column to indicate NaNs; if False, NaNs are ignored.
columns : list-like, default None
Column names in the DataFrame to be encoded.
If `columns` is None then all the columns with
`object`, `string`, or `category` dtype will be converted.
sparse : bool, default False
Whether the dummy-encoded columns should be backed by
a :class:`SparseArray` (True) or a regular NumPy array (False).
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.
Returns
-------
DataFrame
Dummy-coded data. If `data` contains columns other than the
dummy-coded one(s), these will be prepended, unaltered, to the result.
See Also
--------
Series.str.get_dummies : Convert Series of strings to dummy codes.
:func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
Notes
-----
Reference :ref:`the user guide <reshaping.dummies>` for more examples.
Examples
--------
>>> s = pd.Series(list('abca'))
>>> pd.get_dummies(s)
a b c
0 True False False
1 False True False
2 False False True
3 True False False
>>> s1 = ['a', 'b', np.nan]
>>> pd.get_dummies(s1)
a b
0 True False
1 False True
2 False False
>>> pd.get_dummies(s1, dummy_na=True)
a b NaN
0 True False False
1 False True False
2 False False True
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
... 'C': [1, 2, 3]})
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 True False False True False
1 2 False True True False False
2 3 True False False False True
>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 True False False
1 False True False
2 False False True
3 True False False
4 True False False
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 False False
1 True False
2 False True
3 False False
4 False False
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
a b c
0 1.0 0.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
"""
from pandas.core.reshape.concat import concat
dtypes_to_encode = ["object", "string", "category"]
if isinstance(data, DataFrame):
# determine columns being encoded
if columns is None:
data_to_encode = data.select_dtypes(include=dtypes_to_encode)
elif not is_list_like(columns):
raise TypeError("Input must be a list-like for parameter `columns`")
else:
data_to_encode = data[columns]
# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name: str):
if is_list_like(item):
if not len(item) == data_to_encode.shape[1]:
len_msg = (
f"Length of '{name}' ({len(item)}) did not match the "
"length of the columns being encoded "
f"({data_to_encode.shape[1]})."
)
raise ValueError(len_msg)
check_len(prefix, "prefix")
check_len(prefix_sep, "prefix_sep")
if isinstance(prefix, str):
prefix = itertools.cycle([prefix])
if isinstance(prefix, dict):
prefix = [prefix[col] for col in data_to_encode.columns]
if prefix is None:
prefix = data_to_encode.columns
# validate separators
if isinstance(prefix_sep, str):
prefix_sep = itertools.cycle([prefix_sep])
elif isinstance(prefix_sep, dict):
prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
with_dummies: list[DataFrame]
if data_to_encode.shape == data.shape:
# Encoding the entire df, do not prepend any dropped columns
with_dummies = []
elif columns is not None:
# Encoding only cols specified in columns. Get all cols not in
# columns to prepend to result.
with_dummies = [data.drop(columns, axis=1)]
else:
# Encoding only object and category dtype columns. Get remaining
# columns to prepend to result.
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
# col is (column_name, column), use just column data here
dummy = _get_dummies_1d(
col[1],
prefix=pre,
prefix_sep=sep,
dummy_na=dummy_na,
sparse=sparse,
drop_first=drop_first,
dtype=dtype,
)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(
data,
prefix,
prefix_sep,
dummy_na,
sparse=sparse,
drop_first=drop_first,
dtype=dtype,
)
return result
def _get_dummies_1d(
data,
prefix,
prefix_sep: str | Iterable[str] | dict[str, str] = "_",
dummy_na: bool = False,
sparse: bool = False,
drop_first: bool = False,
dtype: NpDtype | None = None,
) -> DataFrame:
from pandas.core.reshape.concat import concat
# Series avoids inconsistent NaN handling
codes, levels = factorize_from_iterable(Series(data, copy=False))
if dtype is None and hasattr(data, "dtype"):
input_dtype = data.dtype
if isinstance(input_dtype, CategoricalDtype):
input_dtype = input_dtype.categories.dtype
if isinstance(input_dtype, ArrowDtype):
import pyarrow as pa
dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.storage != "pyarrow_numpy"
):
dtype = pandas_dtype("boolean") # type: ignore[assignment]
else:
dtype = np.dtype(bool)
elif dtype is None:
dtype = np.dtype(bool)
_dtype = pandas_dtype(dtype)
if is_object_dtype(_dtype):
raise ValueError("dtype=object is not a valid dtype for get_dummies")
def get_empty_frame(data) -> DataFrame:
index: Index | np.ndarray
if isinstance(data, Series):
index = data.index
else:
index = default_index(len(data))
return DataFrame(index=index)
# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_frame(data)
codes = codes.copy()
if dummy_na:
codes[codes == -1] = len(levels)
levels = levels.insert(len(levels), np.nan)
# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_frame(data)
number_of_cols = len(levels)
if prefix is None:
dummy_cols = levels
else:
dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
index: Index | None
if isinstance(data, Series):
index = data.index
else:
index = None
if sparse:
fill_value: bool | float
if is_integer_dtype(dtype):
fill_value = 0
elif dtype == np.dtype(bool):
fill_value = False
else:
fill_value = 0.0
sparse_series = []
N = len(data)
sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
mask = codes != -1
codes = codes[mask]
n_idx = np.arange(N)[mask]
for ndx, code in zip(n_idx, codes):
sp_indices[code].append(ndx)
if drop_first:
# remove first categorical level to avoid perfect collinearity
# GH12042
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(
np.ones(len(ixs), dtype=dtype),
sparse_index=IntIndex(N, ixs),
fill_value=fill_value,
dtype=dtype,
)
sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))
return concat(sparse_series, axis=1, copy=False)
else:
# ensure ndarray layout is column-major
shape = len(codes), number_of_cols
dummy_dtype: NpDtype
if isinstance(_dtype, np.dtype):
dummy_dtype = _dtype
else:
dummy_dtype = np.bool_
dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
dummy_mat[np.arange(len(codes)), codes] = 1
if not dummy_na:
# reset NaN GH4446
dummy_mat[codes == -1] = 0
if drop_first:
# remove first GH12042
dummy_mat = dummy_mat[:, 1:]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
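# A minimal sketch of the sparse branch above, with illustrative data; the
# dense branch instead returns a regular bool-typed DataFrame.
# >>> s = pd.Series(list("abca"))
# >>> out = pd.get_dummies(s, sparse=True)
# >>> all(isinstance(dt, pd.SparseDtype) for dt in out.dtypes)
# True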
def from_dummies(
data: DataFrame,
sep: None | str = None,
default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
"""
Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
Inverts the operation performed by :func:`~pandas.get_dummies`.
.. versionadded:: 1.5.0
Parameters
----------
data : DataFrame
Data which contains dummy-coded variables in the form of integer columns of
1's and 0's.
sep : str, default None
Separator used in the column names of the dummy categories; it is the
character that separates the category names from the prefixes.
For example, if your column names are 'prefix_A' and 'prefix_B',
you can strip the underscore by specifying sep='_'.
default_category : None, Hashable or dict of Hashables, default None
The default category is the implied category when a value has none of the
listed categories set to one, i.e. when all dummies in a row are
zero. Can be a single value for all variables or a dict mapping each
variable prefix to its default category.
Returns
-------
DataFrame
Categorical data decoded from the dummy input-data.
Raises
------
ValueError
* When the input ``DataFrame`` ``data`` contains NA values.
* When the input ``DataFrame`` ``data`` contains column names with separators
that do not match the separator specified with ``sep``.
* When a ``dict`` passed to ``default_category`` does not include an implied
category for each prefix.
* When a value in ``data`` has more than one category assigned to it.
* When ``default_category=None`` and a value in ``data`` has no category
assigned to it.
TypeError
* When the input ``data`` is not of type ``DataFrame``.
* When the input ``DataFrame`` ``data`` contains non-dummy data.
* When the passed ``sep`` is of a wrong data type.
* When the passed ``default_category`` is of a wrong data type.
See Also
--------
:func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
:class:`~pandas.Categorical` : Represent a categorical variable in classic R / S-plus fashion.
Notes
-----
The columns of the passed dummy data should only include 1's and 0's,
or boolean values.
Examples
--------
>>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
... "c": [0, 0, 1, 0]})
>>> df
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
>>> pd.from_dummies(df)
0 a
1 b
2 c
3 a
>>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
... "col2_c": [0, 0, 1]})
>>> df
col1_a col1_b col2_a col2_b col2_c
0 1 0 0 1 0
1 0 1 1 0 0
2 1 0 0 0 1
>>> pd.from_dummies(df, sep="_")
col1 col2
0 a b
1 b a
2 a c
>>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
... "col2_c": [0, 0, 0]})
>>> df
col1_a col1_b col2_a col2_b col2_c
0 1 0 0 1 0
1 0 1 1 0 0
2 0 0 0 0 0
>>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
col1 col2
0 a b
1 b a
2 d e
"""
from pandas.core.reshape.concat import concat
if not isinstance(data, DataFrame):
raise TypeError(
"Expected 'data' to be a 'DataFrame'; "
f"Received 'data' of type: {type(data).__name__}"
)
col_isna_mask = cast(Series, data.isna().any())
if col_isna_mask.any():
raise ValueError(
"Dummy DataFrame contains NA value in column: "
f"'{col_isna_mask.idxmax()}'"
)
# index data with a list of all columns that are dummies
try:
data_to_decode = data.astype("boolean", copy=False)
except TypeError:
raise TypeError("Passed DataFrame contains non-dummy data")
# collect prefixes and get lists to slice data for each prefix
variables_slice = defaultdict(list)
if sep is None:
variables_slice[""] = list(data.columns)
elif isinstance(sep, str):
for col in data_to_decode.columns:
prefix = col.split(sep)[0]
if len(prefix) == len(col):
raise ValueError(f"Separator not specified for column: {col}")
variables_slice[prefix].append(col)
else:
raise TypeError(
"Expected 'sep' to be of type 'str' or 'None'; "
f"Received 'sep' of type: {type(sep).__name__}"
)
if default_category is not None:
if isinstance(default_category, dict):
if not len(default_category) == len(variables_slice):
len_msg = (
f"Length of 'default_category' ({len(default_category)}) "
f"did not match the length of the columns being encoded "
f"({len(variables_slice)})"
)
raise ValueError(len_msg)
elif isinstance(default_category, Hashable):
default_category = dict(
zip(variables_slice, [default_category] * len(variables_slice))
)
else:
raise TypeError(
"Expected 'default_category' to be of type "
"'None', 'Hashable', or 'dict'; "
"Received 'default_category' of type: "
f"{type(default_category).__name__}"
)
cat_data = {}
for prefix, prefix_slice in variables_slice.items():
if sep is None:
cats = prefix_slice.copy()
else:
cats = [col[len(prefix + sep) :] for col in prefix_slice]
assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
if any(assigned > 1):
raise ValueError(
"Dummy DataFrame contains multi-assignment(s); "
f"First instance in row: {assigned.idxmax()}"
)
if any(assigned == 0):
if isinstance(default_category, dict):
cats.append(default_category[prefix])
else:
raise ValueError(
"Dummy DataFrame contains unassigned value(s); "
f"First instance in row: {assigned.idxmin()}"
)
data_slice = concat(
(data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
)
else:
data_slice = data_to_decode.loc[:, prefix_slice]
cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
# get indices of True entries along axis=1
true_values = data_slice.idxmax(axis=1)
indexer = data_slice.columns.get_indexer_for(true_values)
cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)
result = DataFrame(cat_data)
if sep is not None:
result.columns = result.columns.astype(data.columns.dtype)
return result
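# A minimal round-trip sketch of the two functions above, with illustrative
# data: from_dummies inverts get_dummies for a fully-assigned frame.
# >>> s = pd.Series(list("abca"))
# >>> pd.from_dummies(pd.get_dummies(s)).squeeze().tolist()
# ['a', 'b', 'c', 'a']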

View File

@@ -0,0 +1,512 @@
from __future__ import annotations
import re
from typing import TYPE_CHECKING
import numpy as np
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import notna
import pandas.core.algorithms as algos
from pandas.core.indexes.api import MultiIndex
from pandas.core.reshape.concat import concat
from pandas.core.reshape.util import tile_compat
from pandas.core.shared_docs import _shared_docs
from pandas.core.tools.numeric import to_numeric
if TYPE_CHECKING:
from collections.abc import Hashable
from pandas._typing import AnyArrayLike
from pandas import DataFrame
def ensure_list_vars(arg_vars, variable: str, columns) -> list:
if arg_vars is not None:
if not is_list_like(arg_vars):
return [arg_vars]
elif isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
raise ValueError(
f"{variable} must be a list of tuples when columns are a MultiIndex"
)
else:
return list(arg_vars)
else:
return []
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
frame: DataFrame,
id_vars=None,
value_vars=None,
var_name=None,
value_name: Hashable = "value",
col_level=None,
ignore_index: bool = True,
) -> DataFrame:
if value_name in frame.columns:
raise ValueError(
f"value_name ({value_name}) cannot match an element in "
"the DataFrame columns."
)
id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
value_vars_was_not_none = value_vars is not None
value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)
if id_vars or value_vars:
if col_level is not None:
level = frame.columns.get_level_values(col_level)
else:
level = frame.columns
labels = id_vars + value_vars
idx = level.get_indexer_for(labels)
missing = idx == -1
if missing.any():
missing_labels = [
lab for lab, not_found in zip(labels, missing) if not_found
]
raise KeyError(
"The following id_vars or value_vars are not present in "
f"the DataFrame: {missing_labels}"
)
if value_vars_was_not_none:
frame = frame.iloc[:, algos.unique(idx)]
else:
frame = frame.copy()
else:
frame = frame.copy()
if col_level is not None: # allow list or other?
# frame is a copy
frame.columns = frame.columns.get_level_values(col_level)
if var_name is None:
if isinstance(frame.columns, MultiIndex):
if len(frame.columns.names) == len(set(frame.columns.names)):
var_name = frame.columns.names
else:
var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
else:
var_name = [
frame.columns.name if frame.columns.name is not None else "variable"
]
elif is_list_like(var_name):
raise ValueError(f"{var_name=} must be a scalar.")
else:
var_name = [var_name]
num_rows, K = frame.shape
num_cols_adjusted = K - len(id_vars)
mdata: dict[Hashable, AnyArrayLike] = {}
for col in id_vars:
id_data = frame.pop(col)
if not isinstance(id_data.dtype, np.dtype):
# i.e. ExtensionDtype
if num_cols_adjusted > 0:
mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
else:
# We can't concat empty list. (GH 46044)
mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
else:
mdata[col] = np.tile(id_data._values, num_cols_adjusted)
mcolumns = id_vars + var_name + [value_name]
if frame.shape[1] > 0 and not any(
not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
):
mdata[value_name] = concat(
[frame.iloc[:, i] for i in range(frame.shape[1])]
).values
else:
mdata[value_name] = frame._values.ravel("F")
for i, col in enumerate(var_name):
mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)
result = frame._constructor(mdata, columns=mcolumns)
if not ignore_index:
result.index = tile_compat(frame.index, num_cols_adjusted)
return result
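# A minimal sketch of the melt behavior implemented above, via the public
# pd.melt entry point with illustrative data:
# >>> df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2], "C": [3, 4]})
# >>> long = pd.melt(df, id_vars=["A"], value_vars=["B", "C"])
# >>> long.columns.tolist()
# ['A', 'variable', 'value']
# >>> long.shape
# (4, 3)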
def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
"""
Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
Accepts a dictionary, ``groups``, in which each key is a new column name
and each value is a list of old column names that will be "melted" under
the new column name as part of the reshape.
Parameters
----------
data : DataFrame
The wide-format DataFrame.
groups : dict
{new_name : list_of_columns}.
dropna : bool, default True
Do not include columns whose entries are all NaN.
Returns
-------
DataFrame
Reshaped DataFrame.
See Also
--------
melt : Unpivot a DataFrame from wide to long format, optionally leaving
identifiers set.
pivot : Create a spreadsheet-style pivot table as a DataFrame.
DataFrame.pivot : Pivot without aggregation that can handle
non-numeric data.
DataFrame.pivot_table : Generalization of pivot that can handle
duplicate values for one index/column pair.
DataFrame.unstack : Pivot based on the index values instead of a
column.
wide_to_long : Wide panel to long format. Less flexible but more
user-friendly than melt.
Examples
--------
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
... 'team': ['Red Sox', 'Yankees'],
... 'year1': [2007, 2007], 'year2': [2008, 2008]})
>>> data
hr1 hr2 team year1 year2
0 514 545 Red Sox 2007 2008
1 573 526 Yankees 2007 2008
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
team year hr
0 Red Sox 2007 514
1 Yankees 2007 573
2 Red Sox 2008 545
3 Yankees 2008 526
"""
mdata = {}
pivot_cols = []
all_cols: set[Hashable] = set()
K = len(next(iter(groups.values())))
for target, names in groups.items():
if len(names) != K:
raise ValueError("All column lists must be same length")
to_concat = [data[col]._values for col in names]
mdata[target] = concat_compat(to_concat)
pivot_cols.append(target)
all_cols = all_cols.union(names)
id_cols = list(data.columns.difference(all_cols))
for col in id_cols:
mdata[col] = np.tile(data[col]._values, K)
if dropna:
mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
for c in pivot_cols:
mask &= notna(mdata[c])
if not mask.all():
mdata = {k: v[mask] for k, v in mdata.items()}
return data._constructor(mdata, columns=id_cols + pivot_cols)
def wide_to_long(
df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
r"""
Unpivot a DataFrame from wide to long format.
Less flexible but more user-friendly than melt.
With stubnames ['A', 'B'], this function expects to find one or more
groups of columns with the format
A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
You specify what you want to call this suffix in the resulting long format
with `j` (for example `j='year'`)
Each row of these wide variables is assumed to be uniquely identified by
`i` (can be a single column name or a list of column names)
All remaining variables in the data frame are left intact.
Parameters
----------
df : DataFrame
The wide-format DataFrame.
stubnames : str or list-like
The stub name(s). The wide format variables are assumed to
start with the stub names.
i : str or list-like
Column(s) to use as id variable(s).
j : str
The name of the sub-observation variable. What you wish to name your
suffix in the long format.
sep : str, default ""
A character indicating the separation of the variable names
in the wide format, to be stripped from the names in the long format.
For example, if your column names are A-suffix1, A-suffix2, you
can strip the hyphen by specifying `sep='-'`.
suffix : str, default '\\d+'
A regular expression capturing the wanted suffixes. '\\d+' captures
numeric suffixes. Suffixes with no numbers could be specified with the
negated character class '\\D+'. You can also further disambiguate
suffixes, for example, if your wide variables are of the form A-one,
B-two,.., and you have an unrelated column A-rating, you can ignore the
last one by specifying `suffix='(!?one|two)'`. When all suffixes are
numeric, they are cast to int64/float64.
Returns
-------
DataFrame
A DataFrame that contains each stub name as a variable, with new index
(i, j).
See Also
--------
melt : Unpivot a DataFrame from wide to long format, optionally leaving
identifiers set.
pivot : Create a spreadsheet-style pivot table as a DataFrame.
DataFrame.pivot : Pivot without aggregation that can handle
non-numeric data.
DataFrame.pivot_table : Generalization of pivot that can handle
duplicate values for one index/column pair.
DataFrame.unstack : Pivot based on the index values instead of a
column.
Notes
-----
All extra variables are left untouched. This simply uses
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
in a typical case.
Examples
--------
>>> np.random.seed(123)
>>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
... "X" : dict(zip(range(3), np.random.randn(3)))
... })
>>> df["id"] = df.index
>>> df
A1970 A1980 B1970 B1980 X id
0 a d 2.5 3.2 -1.085631 0
1 b e 1.2 1.3 0.997345 1
2 c f 0.7 0.1 0.282978 2
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
... # doctest: +NORMALIZE_WHITESPACE
X A B
id year
0 1970 -1.085631 a 2.5
1 1970 0.997345 b 1.2
2 1970 0.282978 c 0.7
0 1980 -1.085631 d 3.2
1 1980 0.997345 e 1.3
2 1980 0.282978 f 0.1
With multiple id columns
>>> df = pd.DataFrame({
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
... })
>>> df
famid birth ht1 ht2
0 1 1 2.8 3.4
1 1 2 2.9 3.8
2 1 3 2.2 2.9
3 2 1 2.0 3.2
4 2 2 1.8 2.8
5 2 3 1.9 2.4
6 3 1 2.2 3.3
7 3 2 2.3 3.4
8 3 3 2.1 2.9
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
>>> l
... # doctest: +NORMALIZE_WHITESPACE
ht
famid birth age
1 1 1 2.8
2 3.4
2 1 2.9
2 3.8
3 1 2.2
2 2.9
2 1 1 2.0
2 3.2
2 1 1.8
2 2.8
3 1 1.9
2 2.4
3 1 1 2.2
2 3.3
2 1 2.3
2 3.4
3 1 2.1
2 2.9
Going from long back to wide just takes some creative use of `unstack`
>>> w = l.unstack()
>>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
>>> w.reset_index()
famid birth ht1 ht2
0 1 1 2.8 3.4
1 1 2 2.9 3.8
2 1 3 2.2 2.9
3 2 1 2.0 3.2
4 2 2 1.8 2.8
5 2 3 1.9 2.4
6 3 1 2.2 3.3
7 3 2 2.3 3.4
8 3 3 2.1 2.9
Less wieldy column names are also handled
>>> np.random.seed(0)
>>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
... 'A(weekly)-2011': np.random.rand(3),
... 'B(weekly)-2010': np.random.rand(3),
... 'B(weekly)-2011': np.random.rand(3),
... 'X' : np.random.randint(3, size=3)})
>>> df['id'] = df.index
>>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
0 0.548814 0.544883 0.437587 0.383442 0 0
1 0.715189 0.423655 0.891773 0.791725 1 1
2 0.602763 0.645894 0.963663 0.528895 1 2
>>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
... j='year', sep='-')
... # doctest: +NORMALIZE_WHITESPACE
X A(weekly) B(weekly)
id year
0 2010 0 0.548814 0.437587
1 2010 1 0.715189 0.891773
2 2010 1 0.602763 0.963663
0 2011 0 0.544883 0.383442
1 2011 1 0.423655 0.791725
2 2011 1 0.645894 0.528895
If we have many columns, we could also use a regex to find our
stubnames and pass that list on to wide_to_long
>>> stubnames = sorted(
... set([match[0] for match in df.columns.str.findall(
... r'[A-B]\(.*\)').values if match != []])
... )
>>> list(stubnames)
['A(weekly)', 'B(weekly)']
All of the above examples have integers as suffixes. It is possible to
have non-integers as suffixes.
>>> df = pd.DataFrame({
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
... })
>>> df
famid birth ht_one ht_two
0 1 1 2.8 3.4
1 1 2 2.9 3.8
2 1 3 2.2 2.9
3 2 1 2.0 3.2
4 2 2 1.8 2.8
5 2 3 1.9 2.4
6 3 1 2.2 3.3
7 3 2 2.3 3.4
8 3 3 2.1 2.9
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
... sep='_', suffix=r'\w+')
>>> l
... # doctest: +NORMALIZE_WHITESPACE
ht
famid birth age
1 1 one 2.8
two 3.4
2 one 2.9
two 3.8
3 one 2.2
two 2.9
2 1 one 2.0
two 3.2
2 one 1.8
two 2.8
3 one 1.9
two 2.4
3 1 one 2.2
two 3.3
2 one 2.3
two 3.4
3 one 2.1
two 2.9
"""
def get_var_names(df, stub: str, sep: str, suffix: str):
regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
return df.columns[df.columns.str.match(regex)]
def melt_stub(df, stub: str, i, j, value_vars, sep: str):
newdf = melt(
df,
id_vars=i,
value_vars=value_vars,
value_name=stub.rstrip(sep),
var_name=j,
)
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
# GH17627 Cast numerics suffixes to int/float
try:
newdf[j] = to_numeric(newdf[j])
except (TypeError, ValueError, OverflowError):
# TODO: anything else to catch?
pass
return newdf.set_index(i + [j])
if not is_list_like(stubnames):
stubnames = [stubnames]
else:
stubnames = list(stubnames)
if df.columns.isin(stubnames).any():
raise ValueError("stubname can't be identical to a column name")
if not is_list_like(i):
i = [i]
else:
i = list(i)
if df[i].duplicated().any():
raise ValueError("the id variables need to uniquely identify each row")
_melted = []
value_vars_flattened = []
for stub in stubnames:
value_var = get_var_names(df, stub, sep, suffix)
value_vars_flattened.extend(value_var)
_melted.append(melt_stub(df, stub, i, j, value_var, sep))
melted = concat(_melted, axis=1)
id_vars = df.columns.difference(value_vars_flattened)
new = df[id_vars]
if len(i) == 1:
return new.set_index(i).join(melted)
else:
return new.merge(melted.reset_index(), on=i).set_index(i + [j])

File diff suppressed because it is too large
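The suppressed diff is not reproduced here; for reference, a minimal sketch of the ``merge`` entry point listed in the API module at the top, with illustrative data (the frames and keys are assumptions for illustration only):
>>> left = pd.DataFrame({"k": ["a", "b"], "v1": [1, 2]})
>>> right = pd.DataFrame({"k": ["a", "c"], "v2": [3, 4]})
>>> pd.merge(left, right, on="k", how="inner")["k"].tolist()
['a']
>>> pd.merge(left, right, on="k", how="outer")["k"].tolist()
['a', 'b', 'c']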

View File

@@ -0,0 +1,899 @@
from __future__ import annotations
from collections.abc import (
Hashable,
Sequence,
)
from typing import (
TYPE_CHECKING,
Callable,
Literal,
cast,
)
import warnings
import numpy as np
from pandas._libs import lib
from pandas.util._decorators import (
Appender,
Substitution,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
is_list_like,
is_nested_list_like,
is_scalar,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
import pandas.core.common as com
from pandas.core.frame import _shared_docs
from pandas.core.groupby import Grouper
from pandas.core.indexes.api import (
Index,
MultiIndex,
get_objs_combined_axis,
)
from pandas.core.reshape.concat import concat
from pandas.core.reshape.util import cartesian_product
from pandas.core.series import Series
if TYPE_CHECKING:
from pandas._typing import (
AggFuncType,
AggFuncTypeBase,
AggFuncTypeDict,
IndexLabel,
)
from pandas import DataFrame
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
@Substitution("\ndata : DataFrame")
@Appender(_shared_docs["pivot_table"], indents=1)
def pivot_table(
data: DataFrame,
values=None,
index=None,
columns=None,
aggfunc: AggFuncType = "mean",
fill_value=None,
margins: bool = False,
dropna: bool = True,
margins_name: Hashable = "All",
observed: bool | lib.NoDefault = lib.no_default,
sort: bool = True,
) -> DataFrame:
index = _convert_by(index)
columns = _convert_by(columns)
if isinstance(aggfunc, list):
pieces: list[DataFrame] = []
keys = []
for func in aggfunc:
_table = __internal_pivot_table(
data,
values=values,
index=index,
columns=columns,
fill_value=fill_value,
aggfunc=func,
margins=margins,
dropna=dropna,
margins_name=margins_name,
observed=observed,
sort=sort,
)
pieces.append(_table)
keys.append(getattr(func, "__name__", func))
table = concat(pieces, keys=keys, axis=1)
return table.__finalize__(data, method="pivot_table")
table = __internal_pivot_table(
data,
values,
index,
columns,
aggfunc,
fill_value,
margins,
dropna,
margins_name,
observed,
sort,
)
return table.__finalize__(data, method="pivot_table")
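# A minimal sketch of the list-aggfunc branch above: each function becomes the
# outer level of a hierarchical column index. Illustrative data, via the public
# pd.pivot_table entry point.
# >>> df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
# >>> t = pd.pivot_table(df, values="B", index="A", aggfunc=["mean", "sum"])
# >>> t.columns.tolist()
# [('mean', 'B'), ('sum', 'B')]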
def __internal_pivot_table(
data: DataFrame,
values,
index,
columns,
aggfunc: AggFuncTypeBase | AggFuncTypeDict,
fill_value,
margins: bool,
dropna: bool,
margins_name: Hashable,
observed: bool | lib.NoDefault,
sort: bool,
) -> DataFrame:
"""
Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
"""
keys = index + columns
values_passed = values is not None
if values_passed:
if is_list_like(values):
values_multi = True
values = list(values)
else:
values_multi = False
values = [values]
# GH14938 Make sure value labels are in data
for i in values:
if i not in data:
raise KeyError(i)
to_filter = []
for x in keys + values:
if isinstance(x, Grouper):
x = x.key
try:
if x in data:
to_filter.append(x)
except TypeError:
pass
if len(to_filter) < len(data.columns):
data = data[to_filter]
else:
values = data.columns
for key in keys:
try:
values = values.drop(key)
except (TypeError, ValueError, KeyError):
pass
values = list(values)
observed_bool = False if observed is lib.no_default else observed
grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna)
if observed is lib.no_default and any(
ping._passed_categorical for ping in grouped._grouper.groupings
):
warnings.warn(
"The default value of observed=False is deprecated and will change "
"to observed=True in a future version of pandas. Specify "
"observed=False to silence this warning and retain the current behavior",
category=FutureWarning,
stacklevel=find_stack_level(),
)
agged = grouped.agg(aggfunc)
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
agged = agged.dropna(how="all")
table = agged
# GH17038, this check should only happen if index is defined (not None)
if table.index.nlevels > 1 and index:
# Related GH #17123
# If index_names are integers, determine whether the integers refer
# to the level position or name.
index_names = agged.index.names[: len(index)]
to_unstack = []
for i in range(len(index), len(keys)):
name = agged.index.names[i]
if name is None or name in index_names:
to_unstack.append(i)
else:
to_unstack.append(name)
table = agged.unstack(to_unstack, fill_value=fill_value)
if not dropna:
if isinstance(table.index, MultiIndex):
m = MultiIndex.from_arrays(
cartesian_product(table.index.levels), names=table.index.names
)
table = table.reindex(m, axis=0, fill_value=fill_value)
if isinstance(table.columns, MultiIndex):
m = MultiIndex.from_arrays(
cartesian_product(table.columns.levels), names=table.columns.names
)
table = table.reindex(m, axis=1, fill_value=fill_value)
if sort is True and isinstance(table, ABCDataFrame):
table = table.sort_index(axis=1)
if fill_value is not None:
table = table.fillna(fill_value)
if aggfunc is len and not observed and lib.is_integer(fill_value):
# TODO: can we avoid this? this used to be handled by
# downcast="infer" in fillna
table = table.astype(np.int64)
if margins:
if dropna:
data = data[data.notna().all(axis=1)]
table = _add_margins(
table,
data,
values,
rows=index,
cols=columns,
aggfunc=aggfunc,
observed=dropna,
margins_name=margins_name,
fill_value=fill_value,
)
# discard the top level
if values_passed and not values_multi and table.columns.nlevels > 1:
table.columns = table.columns.droplevel(0)
if len(index) == 0 and len(columns) > 0:
table = table.T
# GH 15193 Make sure empty columns are removed if dropna=True
if isinstance(table, ABCDataFrame) and dropna:
table = table.dropna(how="all", axis=1)
return table
def _add_margins(
table: DataFrame | Series,
data: DataFrame,
values,
rows,
cols,
aggfunc,
observed: bool,
margins_name: Hashable = "All",
fill_value=None,
):
if not isinstance(margins_name, str):
raise ValueError("margins_name argument must be a string")
msg = f'Conflicting name "{margins_name}" in margins'
for level in table.index.names:
if margins_name in table.index.get_level_values(level):
raise ValueError(msg)
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
if table.ndim == 2:
# i.e. DataFrame
for level in table.columns.names[1:]:
if margins_name in table.columns.get_level_values(level):
raise ValueError(msg)
key: str | tuple[str, ...]
if len(rows) > 1:
key = (margins_name,) + ("",) * (len(rows) - 1)
else:
key = margins_name
if not values and isinstance(table, ABCSeries):
# If there are no values and the table is a series, then there is only
# one column in the data. Compute grand margin and return it.
return table._append(table._constructor({key: grand_margin[margins_name]}))
elif values:
marginal_result_set = _generate_marginal_results(
table, data, values, rows, cols, aggfunc, observed, margins_name
)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
else:
# no values, and table is a DataFrame
assert isinstance(table, ABCDataFrame)
marginal_result_set = _generate_marginal_results_without_values(
table, data, rows, cols, aggfunc, observed, margins_name
)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
# populate grand margin
for k in margin_keys:
if isinstance(k, str):
row_margin[k] = grand_margin[k]
else:
row_margin[k] = grand_margin[k[0]]
from pandas import DataFrame
margin_dummy = DataFrame(row_margin, columns=Index([key])).T
row_names = result.index.names
# check the result column and leave floats
for dtype in set(result.dtypes):
if isinstance(dtype, ExtensionDtype):
# Can hold NA already
continue
cols = result.select_dtypes([dtype]).columns
margin_dummy[cols] = margin_dummy[cols].apply(
maybe_downcast_to_dtype, args=(dtype,)
)
result = result._append(margin_dummy)
result.index.names = row_names
return result
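# Illustrative sketch (not part of pandas): margins=True routes through
# _add_margins above, appending an "All" row and column computed with the same
# aggfunc. The data below is made up.
def _example_pivot_table_margins():  # hypothetical helper, never called by pandas
    import pandas as pd

    df = pd.DataFrame(
        {
            "city": ["NY", "NY", "SF"],
            "year": [2020, 2021, 2020],
            "sales": [1.0, 2.0, 3.0],
        }
    )
    # The extra row/column labelled "All" holds the per-axis totals.
    return df.pivot_table(
        values="sales",
        index="city",
        columns="year",
        aggfunc="sum",
        margins=True,
        margins_name="All",
    )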
def _compute_grand_margin(
data: DataFrame, values, aggfunc, margins_name: Hashable = "All"
):
if values:
grand_margin = {}
for k, v in data[values].items():
try:
if isinstance(aggfunc, str):
grand_margin[k] = getattr(v, aggfunc)()
elif isinstance(aggfunc, dict):
if isinstance(aggfunc[k], str):
grand_margin[k] = getattr(v, aggfunc[k])()
else:
grand_margin[k] = aggfunc[k](v)
else:
grand_margin[k] = aggfunc(v)
except TypeError:
pass
return grand_margin
else:
return {margins_name: aggfunc(data.index)}
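# Illustrative sketch (not part of pandas): what _compute_grand_margin does for a
# string aggfunc, expressed with public API only. Column names are invented.
def _example_grand_margin():  # hypothetical helper, never called by pandas
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    # For aggfunc="sum" the grand margin is each value column reduced with that
    # method, keyed by column name, i.e. {"a": 3, "b": 7.0}.
    return {col: getattr(ser, "sum")() for col, ser in df[["a", "b"]].items()}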
def _generate_marginal_results(
table,
data: DataFrame,
values,
rows,
cols,
aggfunc,
observed: bool,
margins_name: Hashable = "All",
):
margin_keys: list | Index
if len(cols) > 0:
# need to "interleave" the margins
table_pieces = []
margin_keys = []
def _all_key(key):
return (key, margins_name) + ("",) * (len(cols) - 1)
if len(rows) > 0:
margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
cat_axis = 1
for key, piece in table.T.groupby(level=0, observed=observed):
piece = piece.T
all_key = _all_key(key)
# we are going to mutate this, so need to copy!
piece = piece.copy()
piece[all_key] = margin[key]
table_pieces.append(piece)
margin_keys.append(all_key)
else:
from pandas import DataFrame
cat_axis = 0
for key, piece in table.groupby(level=0, observed=observed):
if len(cols) > 1:
all_key = _all_key(key)
else:
all_key = margins_name
table_pieces.append(piece)
# GH31016 this is to calculate the margin for each group, and assign
# the corresponding key as the index
transformed_piece = DataFrame(piece.apply(aggfunc)).T
if isinstance(piece.index, MultiIndex):
# We are adding an empty level
transformed_piece.index = MultiIndex.from_tuples(
[all_key], names=piece.index.names + [None]
)
else:
transformed_piece.index = Index([all_key], name=piece.index.name)
# append piece for margin into table_piece
table_pieces.append(transformed_piece)
margin_keys.append(all_key)
if not table_pieces:
# GH 49240
return table
else:
result = concat(table_pieces, axis=cat_axis)
if len(rows) == 0:
return result
else:
result = table
margin_keys = table.columns
if len(cols) > 0:
row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
row_margin = row_margin.stack(future_stack=True)
# GH#26568. Use names instead of indices in case of numeric names
new_order_indices = [len(cols)] + list(range(len(cols)))
new_order_names = [row_margin.index.names[i] for i in new_order_indices]
row_margin.index = row_margin.index.reorder_levels(new_order_names)
else:
row_margin = data._constructor_sliced(np.nan, index=result.columns)
return result, margin_keys, row_margin
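# Illustrative sketch (not part of pandas): the "interleaving" above means that,
# with several value columns, each value block in the result receives its own
# "All" margin column. Data and names are invented; the exact column order noted
# in the comment is an assumption about the default output.
def _example_interleaved_margins():  # hypothetical helper, never called by pandas
    import pandas as pd

    df = pd.DataFrame(
        {
            "city": ["NY", "NY", "SF", "SF"],
            "year": [2020, 2021, 2020, 2021],
            "q1": [1, 2, 3, 4],
            "q2": [5, 6, 7, 8],
        }
    )
    # Expected columns, roughly: (q1, 2020), (q1, 2021), (q1, All), (q2, 2020), ...
    return df.pivot_table(
        values=["q1", "q2"], index="city", columns="year", aggfunc="sum", margins=True
    )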
def _generate_marginal_results_without_values(
table: DataFrame,
data: DataFrame,
rows,
cols,
aggfunc,
observed: bool,
margins_name: Hashable = "All",
):
margin_keys: list | Index
if len(cols) > 0:
# need to "interleave" the margins
margin_keys = []
def _all_key():
if len(cols) == 1:
return margins_name
return (margins_name,) + ("",) * (len(cols) - 1)
if len(rows) > 0:
margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
margin_keys.append(all_key)
else:
margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
margin_keys.append(all_key)
return result
else:
result = table
margin_keys = table.columns
if len(cols):
row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc)
else:
row_margin = Series(np.nan, index=result.columns)
return result, margin_keys, row_margin
def _convert_by(by):
if by is None:
by = []
elif (
is_scalar(by)
or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
or callable(by)
):
by = [by]
else:
by = list(by)
return by
@Substitution("\ndata : DataFrame")
@Appender(_shared_docs["pivot"], indents=1)
def pivot(
data: DataFrame,
*,
columns: IndexLabel,
index: IndexLabel | lib.NoDefault = lib.no_default,
values: IndexLabel | lib.NoDefault = lib.no_default,
) -> DataFrame:
columns_listlike = com.convert_to_list_like(columns)
# If columns is None we will create a MultiIndex level with None as name
# which might cause duplicated names because None is the default for
# level names
data = data.copy(deep=False)
data.index = data.index.copy()
data.index.names = [
name if name is not None else lib.no_default for name in data.index.names
]
indexed: DataFrame | Series
if values is lib.no_default:
if index is not lib.no_default:
cols = com.convert_to_list_like(index)
else:
cols = []
append = index is lib.no_default
# error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
# error: Unsupported left operand type for + ("ExtensionArray")
indexed = data.set_index(
cols + columns_listlike, append=append # type: ignore[operator]
)
else:
index_list: list[Index] | list[Series]
if index is lib.no_default:
if isinstance(data.index, MultiIndex):
# GH 23955
index_list = [
data.index.get_level_values(i) for i in range(data.index.nlevels)
]
else:
index_list = [
data._constructor_sliced(data.index, name=data.index.name)
]
else:
index_list = [data[idx] for idx in com.convert_to_list_like(index)]
data_columns = [data[col] for col in columns_listlike]
index_list.extend(data_columns)
multiindex = MultiIndex.from_arrays(index_list)
if is_list_like(values) and not isinstance(values, tuple):
# Exclude tuple because it is seen as a single column name
values = cast(Sequence[Hashable], values)
indexed = data._constructor(
data[values]._values, index=multiindex, columns=values
)
else:
indexed = data._constructor_sliced(data[values]._values, index=multiindex)
# error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union
# [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected
# "Hashable"
result = indexed.unstack(columns_listlike) # type: ignore[arg-type]
result.index.names = [
name if name is not lib.no_default else None for name in result.index.names
]
return result
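# Illustrative sketch (not part of pandas): pivot() above reshapes without
# aggregating, so each index/columns pair must be unique. Example data invented.
def _example_pivot():  # hypothetical helper, never called by pandas
    import pandas as pd

    df = pd.DataFrame(
        {
            "date": ["d1", "d1", "d2", "d2"],
            "ticker": ["A", "B", "A", "B"],
            "price": [1.0, 2.0, 3.0, 4.0],
        }
    )
    # Each (date, ticker) pair occurs once, so no aggregation is needed;
    # duplicated pairs would raise "Index contains duplicate entries".
    return df.pivot(index="date", columns="ticker", values="price")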
def crosstab(
index,
columns,
values=None,
rownames=None,
colnames=None,
aggfunc=None,
margins: bool = False,
margins_name: Hashable = "All",
dropna: bool = True,
normalize: bool | Literal[0, 1, "all", "index", "columns"] = False,
) -> DataFrame:
"""
Compute a simple cross tabulation of two (or more) factors.
By default, computes a frequency table of the factors unless an
array of values and an aggregation function are passed.
Parameters
----------
index : array-like, Series, or list of arrays/Series
Values to group by in the rows.
columns : array-like, Series, or list of arrays/Series
Values to group by in the columns.
values : array-like, optional
Array of values to aggregate according to the factors.
Requires `aggfunc` be specified.
rownames : sequence, default None
If passed, must match number of row arrays passed.
colnames : sequence, default None
If passed, must match number of column arrays passed.
aggfunc : function, optional
If specified, requires `values` be specified as well.
margins : bool, default False
Add row/column margins (subtotals).
margins_name : str, default 'All'
Name of the row/column that will contain the totals
when margins is True.
dropna : bool, default True
Do not include columns whose entries are all NaN.
normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
Normalize by dividing all values by the sum of values.
- If passed 'all' or `True`, will normalize over all values.
- If passed 'index' will normalize over each row.
- If passed 'columns' will normalize over each column.
- If margins is `True`, will also normalize margin values.
Returns
-------
DataFrame
Cross tabulation of the data.
See Also
--------
DataFrame.pivot : Reshape data based on column values.
pivot_table : Create a pivot table as a DataFrame.
Notes
-----
Any Series passed will have their name attributes used unless row or column
names for the cross-tabulation are specified.
Any input passed containing Categorical data will have **all** of its
categories included in the cross-tabulation, even if the actual data does
not contain any instances of a particular category.
In the event that there aren't overlapping indexes an empty DataFrame will
be returned.
Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.
Examples
--------
>>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
... "bar", "bar", "foo", "foo", "foo"], dtype=object)
>>> b = np.array(["one", "one", "one", "two", "one", "one",
... "one", "two", "two", "two", "one"], dtype=object)
>>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
... "shiny", "dull", "shiny", "shiny", "shiny"],
... dtype=object)
>>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
b one two
c dull shiny dull shiny
a
bar 1 2 1 0
foo 2 2 1 2
Here 'c' and 'f' are not represented in the data and will not be
shown in the output because dropna is True by default. Set
dropna=False to preserve categories with no data.
>>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
>>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
>>> pd.crosstab(foo, bar)
col_0 d e
row_0
a 1 0
b 0 1
>>> pd.crosstab(foo, bar, dropna=False)
col_0 d e f
row_0
a 1 0 0
b 0 1 0
c 0 0 0
"""
if values is None and aggfunc is not None:
raise ValueError("aggfunc cannot be used without values.")
if values is not None and aggfunc is None:
raise ValueError("values cannot be used without an aggfunc.")
if not is_nested_list_like(index):
index = [index]
if not is_nested_list_like(columns):
columns = [columns]
common_idx = None
pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
if pass_objs:
common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
rownames = _get_names(index, rownames, prefix="row")
colnames = _get_names(columns, colnames, prefix="col")
# duplicate names mapped to unique names for pivot op
(
rownames_mapper,
unique_rownames,
colnames_mapper,
unique_colnames,
) = _build_names_mapper(rownames, colnames)
from pandas import DataFrame
data = {
**dict(zip(unique_rownames, index)),
**dict(zip(unique_colnames, columns)),
}
df = DataFrame(data, index=common_idx)
if values is None:
df["__dummy__"] = 0
kwargs = {"aggfunc": len, "fill_value": 0}
else:
df["__dummy__"] = values
kwargs = {"aggfunc": aggfunc}
# error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type
# "**Dict[str, object]"; expected "Union[...]"
table = df.pivot_table(
"__dummy__",
index=unique_rownames,
columns=unique_colnames,
margins=margins,
margins_name=margins_name,
dropna=dropna,
observed=False,
**kwargs, # type: ignore[arg-type]
)
# Post-process
if normalize is not False:
table = _normalize(
table, normalize=normalize, margins=margins, margins_name=margins_name
)
table = table.rename_axis(index=rownames_mapper, axis=0)
table = table.rename_axis(columns=colnames_mapper, axis=1)
return table
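# Illustrative sketch (not part of pandas): crosstab with values/aggfunc, i.e.
# the branch above that fills the "__dummy__" column with the supplied values
# instead of counting. Data is made up.
def _example_crosstab_aggfunc():  # hypothetical helper, never called by pandas
    import numpy as np
    import pandas as pd

    day = pd.Series(["mon", "mon", "tue", "tue"], name="day")
    shop = pd.Series(["a", "b", "a", "b"], name="shop")
    amount = np.array([1.0, 2.0, 3.0, 4.0])
    # Aggregates `amount` by (day, shop) rather than tabulating frequencies.
    return pd.crosstab(day, shop, values=amount, aggfunc="sum")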
def _normalize(
table: DataFrame, normalize, margins: bool, margins_name: Hashable = "All"
) -> DataFrame:
if not isinstance(normalize, (bool, str)):
axis_subs = {0: "index", 1: "columns"}
try:
normalize = axis_subs[normalize]
except KeyError as err:
raise ValueError("Not a valid normalize argument") from err
if margins is False:
# Actual Normalizations
normalizers: dict[bool | str, Callable] = {
"all": lambda x: x / x.sum(axis=1).sum(axis=0),
"columns": lambda x: x / x.sum(),
"index": lambda x: x.div(x.sum(axis=1), axis=0),
}
normalizers[True] = normalizers["all"]
try:
f = normalizers[normalize]
except KeyError as err:
raise ValueError("Not a valid normalize argument") from err
table = f(table)
table = table.fillna(0)
elif margins is True:
# keep index and column of pivoted table
table_index = table.index
table_columns = table.columns
last_ind_or_col = table.iloc[-1, :].name
# check that the margin name is contained in (for MI cases) or equal to the
# last index/column label, and save the column and index margins
if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
raise ValueError(f"{margins_name} not in pivoted DataFrame")
column_margin = table.iloc[:-1, -1]
index_margin = table.iloc[-1, :-1]
# keep the core table
table = table.iloc[:-1, :-1]
# Normalize core
table = _normalize(table, normalize=normalize, margins=False)
# Fix Margins
if normalize == "columns":
column_margin = column_margin / column_margin.sum()
table = concat([table, column_margin], axis=1)
table = table.fillna(0)
table.columns = table_columns
elif normalize == "index":
index_margin = index_margin / index_margin.sum()
table = table._append(index_margin)
table = table.fillna(0)
table.index = table_index
elif normalize == "all" or normalize is True:
column_margin = column_margin / column_margin.sum()
index_margin = index_margin / index_margin.sum()
index_margin.loc[margins_name] = 1
table = concat([table, column_margin], axis=1)
table = table._append(index_margin)
table = table.fillna(0)
table.index = table_index
table.columns = table_columns
else:
raise ValueError("Not a valid normalize argument")
else:
raise ValueError("Not a valid margins argument")
return table
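# Illustrative sketch (not part of pandas): the normalize paths above, driven
# through the public crosstab API. Data is made up.
def _example_crosstab_normalize():  # hypothetical helper, never called by pandas
    import pandas as pd

    a = pd.Series(["x", "x", "y"], name="a")
    b = pd.Series(["u", "v", "u"], name="b")
    # "index" divides each row by its row sum; "columns" and "all"/True work
    # analogously, and the margins are renormalized when margins=True.
    return pd.crosstab(a, b, normalize="index")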
def _get_names(arrs, names, prefix: str = "row"):
if names is None:
names = []
for i, arr in enumerate(arrs):
if isinstance(arr, ABCSeries) and arr.name is not None:
names.append(arr.name)
else:
names.append(f"{prefix}_{i}")
else:
if len(names) != len(arrs):
raise AssertionError("arrays and names must have the same length")
if not isinstance(names, list):
names = list(names)
return names
def _build_names_mapper(
rownames: list[str], colnames: list[str]
) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
"""
Given the names of a DataFrame's rows and columns, returns a set of unique row
and column names and mappers that convert to original names.
A row or column name is replaced if it is duplicated among the rows of the inputs,
among the columns of the inputs, or between the rows and the columns.
Parameters
----------
rownames: list[str]
colnames: list[str]
Returns
-------
Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
rownames_mapper: dict[str, str]
a dictionary with new row names as keys and original rownames as values
unique_rownames: list[str]
a list of rownames with duplicate names replaced by dummy names
colnames_mapper: dict[str, str]
a dictionary with new column names as keys and original column names as values
unique_colnames: list[str]
a list of column names with duplicate names replaced by dummy names
"""
def get_duplicates(names):
seen: set = set()
return {name for name in names if name not in seen}
shared_names = set(rownames).intersection(set(colnames))
dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
rownames_mapper = {
f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
}
unique_rownames = [
f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
]
colnames_mapper = {
f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
}
unique_colnames = [
f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
]
return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
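# Illustrative sketch (not part of pandas): why the mapper above exists. When the
# row and column groupers share a name, crosstab pivots on temporary unique names
# and then maps them back. Data is invented.
def _example_duplicate_names():  # hypothetical helper, never called by pandas
    import pandas as pd

    left = pd.Series(["x", "x", "y"], name="key")
    right = pd.Series(["u", "v", "u"], name="key")  # same name on both axes
    # Without the renaming step the pivot would see two columns named "key";
    # with it, the result still shows "key" on both the index and the columns.
    return pd.crosstab(left, right)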

View File

@ -0,0 +1,989 @@
from __future__ import annotations
import itertools
from typing import (
TYPE_CHECKING,
cast,
)
import warnings
import numpy as np
import pandas._libs.reshape as libreshape
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.cast import (
find_common_type,
maybe_promote,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_integer,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna
import pandas.core.algorithms as algos
from pandas.core.algorithms import (
factorize,
unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
Index,
MultiIndex,
RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
compress_group_index,
decons_obs_group_ids,
get_compressed_ids,
get_group_index,
get_group_index_sorter,
)
if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
Level,
npt,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.indexes.frozen import FrozenList
class _Unstacker:
"""
Helper class to unstack data / pivot with multi-level index
Parameters
----------
index : MultiIndex
level : int or str, default last level
Level to "unstack". Accepts a name for the level.
fill_value : scalar, optional
Default value to fill in missing values if subgroups do not have the
same set of labels. By default, missing values will be replaced with
the default fill value for that data type, NaN for float, NaT for
datetimelike, etc. For integer types, by default data will be converted
to float and missing values will be set to NaN.
constructor : object
Pandas ``DataFrame`` or subclass used to create unstacked
response. If None, DataFrame will be used.
Examples
--------
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
... ('two', 'a'), ('two', 'b')])
>>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
>>> s
one a 1
b 2
two a 3
b 4
dtype: int64
>>> s.unstack(level=-1)
a b
one 1 2
two 3 4
>>> s.unstack(level=0)
one two
a 1 3
b 2 4
Returns
-------
unstacked : DataFrame
"""
def __init__(
self, index: MultiIndex, level: Level, constructor, sort: bool = True
) -> None:
self.constructor = constructor
self.sort = sort
self.index = index.remove_unused_levels()
self.level = self.index._get_level_number(level)
# when index includes `nan`, need to lift levels/strides by 1
self.lift = 1 if -1 in self.index.codes[self.level] else 0
# Note: the "pop" below alters these in-place.
self.new_index_levels = list(self.index.levels)
self.new_index_names = list(self.index.names)
self.removed_name = self.new_index_names.pop(self.level)
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)
# Bug fix GH 20601
# If the data frame is too big, the number of unique index combinations
# will cause an int32 overflow on Windows environments.
# We want to check and raise a warning before this happens
num_rows = np.max([index_level.size for index_level in self.new_index_levels])
num_columns = self.removed_level.size
# GH20601: This forces an overflow if the number of cells is too high.
num_cells = num_rows * num_columns
# GH 26314: Previous ValueError raised was too restrictive for many users.
if num_cells > np.iinfo(np.int32).max:
warnings.warn(
f"The following operation may generate {num_cells} cells "
f"in the resulting pandas object.",
PerformanceWarning,
stacklevel=find_stack_level(),
)
self._make_selectors()
@cache_readonly
def _indexer_and_to_sort(
self,
) -> tuple[
npt.NDArray[np.intp],
list[np.ndarray], # each has _some_ signed integer dtype
]:
v = self.level
codes = list(self.index.codes)
levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
ngroups = len(obs_ids)
indexer = get_group_index_sorter(comp_index, ngroups)
return indexer, to_sort
@cache_readonly
def sorted_labels(self) -> list[np.ndarray]:
indexer, to_sort = self._indexer_and_to_sort
if self.sort:
return [line.take(indexer) for line in to_sort]
return to_sort
def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
if self.sort:
indexer, _ = self._indexer_and_to_sort
sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values
return values
def _make_selectors(self):
new_levels = self.new_index_levels
# make the mask
remaining_labels = self.sorted_labels[:-1]
level_sizes = tuple(len(x) for x in new_levels)
comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
ngroups = len(obs_ids)
comp_index = ensure_platform_int(comp_index)
stride = self.index.levshape[self.level] + self.lift
self.full_shape = ngroups, stride
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
mask.put(selector, True)
if mask.sum() < len(self.index):
raise ValueError("Index contains duplicate entries, cannot reshape")
self.group_index = comp_index
self.mask = mask
if self.sort:
self.compressor = comp_index.searchsorted(np.arange(ngroups))
else:
self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])
@cache_readonly
def mask_all(self) -> bool:
return bool(self.mask.all())
@cache_readonly
def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
# We cache this for reuse in ExtensionBlock._unstack
dummy_arr = np.arange(len(self.index), dtype=np.intp)
new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
return new_values, mask.any(0)
# TODO: in all tests we have mask.any(0).all(); can we rely on that?
def get_result(self, values, value_columns, fill_value) -> DataFrame:
if values.ndim == 1:
values = values[:, np.newaxis]
if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError("must pass column labels for multi-column data")
values, _ = self.get_new_values(values, fill_value)
columns = self.get_new_columns(value_columns)
index = self.new_index
return self.constructor(
values, index=index, columns=columns, dtype=values.dtype
)
def get_new_values(self, values, fill_value=None):
if values.ndim == 1:
values = values[:, np.newaxis]
sorted_values = self._make_sorted_values(values)
# place the values
length, width = self.full_shape
stride = values.shape[1]
result_width = width * stride
result_shape = (length, result_width)
mask = self.mask
mask_all = self.mask_all
# we can simply reshape if we don't have a mask
if mask_all and len(values):
# TODO: Under what circumstances can we rely on sorted_values
# matching values? When that holds, we can slice instead
# of take (in particular for EAs)
new_values = (
sorted_values.reshape(length, width, stride)
.swapaxes(1, 2)
.reshape(result_shape)
)
new_mask = np.ones(result_shape, dtype=bool)
return new_values, new_mask
dtype = values.dtype
# if our mask is all True, then we can use our existing dtype
if mask_all:
dtype = values.dtype
new_values = np.empty(result_shape, dtype=dtype)
else:
if isinstance(dtype, ExtensionDtype):
# GH#41875
# We are assuming that fill_value can be held by this dtype,
# unlike the non-EA case that promotes.
cls = dtype.construct_array_type()
new_values = cls._empty(result_shape, dtype=dtype)
new_values[:] = fill_value
else:
dtype, fill_value = maybe_promote(dtype, fill_value)
new_values = np.empty(result_shape, dtype=dtype)
new_values.fill(fill_value)
name = dtype.name
new_mask = np.zeros(result_shape, dtype=bool)
# we need to convert to a basic dtype
# and possibly coerce an input to our output dtype
# e.g. ints -> floats
if needs_i8_conversion(values.dtype):
sorted_values = sorted_values.view("i8")
new_values = new_values.view("i8")
else:
sorted_values = sorted_values.astype(name, copy=False)
# fill in our values & mask
libreshape.unstack(
sorted_values,
mask.view("u1"),
stride,
length,
width,
new_values,
new_mask.view("u1"),
)
# reconstruct dtype if needed
if needs_i8_conversion(values.dtype):
# view as datetime64 so we can wrap in DatetimeArray and use
# DTA's view method
new_values = new_values.view("M8[ns]")
new_values = ensure_wrapped_if_datetimelike(new_values)
new_values = new_values.view(values.dtype)
return new_values, new_mask
def get_new_columns(self, value_columns: Index | None):
if value_columns is None:
if self.lift == 0:
return self.removed_level._rename(name=self.removed_name)
lev = self.removed_level.insert(0, item=self.removed_level._na_value)
return lev.rename(self.removed_name)
stride = len(self.removed_level) + self.lift
width = len(value_columns)
propagator = np.repeat(np.arange(width), stride)
new_levels: FrozenList | list[Index]
if isinstance(value_columns, MultiIndex):
# error: Cannot determine type of "__add__" [has-type]
new_levels = value_columns.levels + ( # type: ignore[has-type]
self.removed_level_full,
)
new_names = value_columns.names + (self.removed_name,)
new_codes = [lab.take(propagator) for lab in value_columns.codes]
else:
new_levels = [
value_columns,
self.removed_level_full,
]
new_names = [value_columns.name, self.removed_name]
new_codes = [propagator]
repeater = self._repeater
# The entire level is then just a repetition of the single chunk:
new_codes.append(np.tile(repeater, width))
return MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
@cache_readonly
def _repeater(self) -> np.ndarray:
# The two indices differ only if the unstacked level had unused items:
if len(self.removed_level_full) != len(self.removed_level):
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
repeater = np.arange(stride) - self.lift
return repeater
@cache_readonly
def new_index(self) -> MultiIndex:
# Does not depend on values or value_columns
result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
# construct the new index
if len(self.new_index_levels) == 1:
level, level_codes = self.new_index_levels[0], result_codes[0]
if (level_codes == -1).any():
level = level.insert(len(level), level._na_value)
return level.take(level_codes).rename(self.new_index_names[0])
return MultiIndex(
levels=self.new_index_levels,
codes=result_codes,
names=self.new_index_names,
verify_integrity=False,
)
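# Illustrative sketch (not part of pandas): the public entry point that ends up
# in the _Unstacker helper above. Values are invented.
def _example_series_unstack():  # hypothetical helper, never called by pandas
    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
    s = pd.Series(np.arange(4), index=index)
    # Moves the innermost index level into columns, yielding a 2x2 DataFrame
    # with index ["one", "two"] and columns ["a", "b"].
    return s.unstack(level=-1)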
def _unstack_multiple(
data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
):
if len(clocs) == 0:
return data
# NOTE: This doesn't deal with hierarchical columns yet
index = data.index
index = cast(MultiIndex, index) # caller is responsible for checking
# GH 19966 Make sure if MultiIndexed index has tuple name, they will be
# recognised as a whole
if clocs in index.names:
clocs = [clocs]
clocs = [index._get_level_number(i) for i in clocs]
rlocs = [i for i in range(index.nlevels) if i not in clocs]
clevels = [index.levels[i] for i in clocs]
ccodes = [index.codes[i] for i in clocs]
cnames = [index.names[i] for i in clocs]
rlevels = [index.levels[i] for i in rlocs]
rcodes = [index.codes[i] for i in rlocs]
rnames = [index.names[i] for i in rlocs]
shape = tuple(len(x) for x in clevels)
group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
comp_ids, obs_ids = compress_group_index(group_index, sort=False)
recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
if not rlocs:
# Everything is in clocs, so the dummy df has a regular index
dummy_index = Index(obs_ids, name="__placeholder__")
else:
dummy_index = MultiIndex(
levels=rlevels + [obs_ids],
codes=rcodes + [comp_ids],
names=rnames + ["__placeholder__"],
verify_integrity=False,
)
if isinstance(data, Series):
dummy = data.copy()
dummy.index = dummy_index
unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
new_levels = clevels
new_names = cnames
new_codes = recons_codes
else:
if isinstance(data.columns, MultiIndex):
result = data
while clocs:
val = clocs.pop(0)
result = result.unstack(val, fill_value=fill_value, sort=sort)
clocs = [v if v < val else v - 1 for v in clocs]
return result
# GH#42579 deep=False to avoid consolidating
dummy_df = data.copy(deep=False)
dummy_df.index = dummy_index
unstacked = dummy_df.unstack(
"__placeholder__", fill_value=fill_value, sort=sort
)
if isinstance(unstacked, Series):
unstcols = unstacked.index
else:
unstcols = unstacked.columns
assert isinstance(unstcols, MultiIndex) # for mypy
new_levels = [unstcols.levels[0]] + clevels
new_names = [data.columns.name] + cnames
new_codes = [unstcols.codes[0]]
new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)
new_columns = MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
if isinstance(unstacked, Series):
unstacked.index = new_columns
else:
unstacked.columns = new_columns
return unstacked
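# Illustrative sketch (not part of pandas): passing a list of levels reaches
# _unstack_multiple above. Values are invented.
def _example_multi_level_unstack():  # hypothetical helper, never called by pandas
    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_product(
        [["one", "two"], ["a", "b"], [1, 2]], names=["first", "second", "third"]
    )
    s = pd.Series(np.arange(8), index=index)
    # Both named levels move into the columns in a single call.
    return s.unstack(["second", "third"])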
def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
if isinstance(level, (tuple, list)):
if len(level) != 1:
# _unstack_multiple only handles MultiIndexes,
# and isn't needed for a single level
return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
else:
level = level[0]
if not is_integer(level) and not level == "__placeholder__":
# check if level is valid in case of regular index
obj.index._get_level_number(level)
if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
else:
return obj.T.stack(future_stack=True)
elif not isinstance(obj.index, MultiIndex):
# GH 36113
# Give a nicer error message when unstacking a Series whose
# Index is not a MultiIndex.
raise ValueError(
f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
)
else:
if is_1d_only_ea_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value, sort=sort)
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
)
return unstacker.get_result(
obj._values, value_columns=None, fill_value=fill_value
)
def _unstack_frame(
obj: DataFrame, level, fill_value=None, sort: bool = True
) -> DataFrame:
assert isinstance(obj.index, MultiIndex) # checked by caller
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor, sort=sort
)
if not obj._can_fast_transpose:
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
else:
return unstacker.get_result(
obj._values, value_columns=obj.columns, fill_value=fill_value
)
def _unstack_extension_series(
series: Series, level, fill_value, sort: bool
) -> DataFrame:
"""
Unstack an ExtensionArray-backed Series.
The ExtensionDtype is preserved.
Parameters
----------
series : Series
A Series with an ExtensionArray for values
level : Any
The level name or number.
fill_value : Any
The user-level (not physical storage) fill value to use for
missing values introduced by the reshape. Passed to
``series.values.take``.
sort : bool
Whether to sort the resulting MultiIndex levels
Returns
-------
DataFrame
Each column of the DataFrame will have the same dtype as
the input Series.
"""
# Defer to the logic in ExtensionBlock._unstack
df = series.to_frame()
result = df.unstack(level=level, fill_value=fill_value, sort=sort)
# equiv: result.droplevel(level=0, axis=1)
# but this avoids an extra copy
result.columns = result.columns._drop_level_numbers([0])
return result
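# Illustrative sketch (not part of pandas): unstacking a Series backed by an
# extension dtype keeps that dtype in every result column, which is what the
# helper above guarantees. Values are invented.
def _example_ea_unstack():  # hypothetical helper, never called by pandas
    import pandas as pd

    index = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
    s = pd.Series([1, None, 3, 4], index=index, dtype="Int64")
    result = s.unstack()
    # Each column stays nullable Int64 rather than being cast to float64.
    return result.dtypes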
def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
second level of the resulting hierarchical index
Returns
-------
stacked : Series or DataFrame
"""
def stack_factorize(index):
if index.is_unique:
return index, np.arange(len(index))
codes, categories = factorize_from_iterable(index)
return categories, codes
N, K = frame.shape
# Will also convert negative level numbers and check if out of bounds.
level_num = frame.columns._get_level_number(level)
if isinstance(frame.columns, MultiIndex):
return _stack_multi_columns(
frame, level_num=level_num, dropna=dropna, sort=sort
)
elif isinstance(frame.index, MultiIndex):
new_levels = list(frame.index.levels)
new_codes = [lab.repeat(K) for lab in frame.index.codes]
clev, clab = stack_factorize(frame.columns)
new_levels.append(clev)
new_codes.append(np.tile(clab, N).ravel())
new_names = list(frame.index.names)
new_names.append(frame.columns.name)
new_index = MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
else:
levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
codes = ilab.repeat(K), np.tile(clab, N).ravel()
new_index = MultiIndex(
levels=levels,
codes=codes,
names=[frame.index.name, frame.columns.name],
verify_integrity=False,
)
new_values: ArrayLike
if not frame.empty and frame._is_homogeneous_type:
# For homogeneous EAs, frame._values will coerce to object. So
# we concatenate instead.
dtypes = list(frame.dtypes._values)
dtype = dtypes[0]
if isinstance(dtype, ExtensionDtype):
arr = dtype.construct_array_type()
new_values = arr._concat_same_type(
[col._values for _, col in frame.items()]
)
new_values = _reorder_for_extension_array_stack(new_values, N, K)
else:
# homogeneous, non-EA
new_values = frame._values.ravel()
else:
# non-homogeneous
new_values = frame._values.ravel()
if dropna:
mask = notna(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return frame._constructor_sliced(new_values, index=new_index)
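# Illustrative sketch (not part of pandas): the basic stack() path above, moving
# the columns into a new innermost index level. Data is made up.
def _example_stack():  # hypothetical helper, never called by pandas
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
    # Returns a Series indexed by (row label, column label):
    # ("x", "a") -> 1, ("x", "b") -> 3, ("y", "a") -> 2, ("y", "b") -> 4.
    return df.stack()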
def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
# If all passed levels match up to column names, no
# ambiguity about what to do
if all(lev in frame.columns.names for lev in level):
result = frame
for lev in level:
result = stack(result, lev, dropna=dropna, sort=sort)
# Otherwise, level numbers may change as each successive level is stacked
elif all(isinstance(lev, int) for lev in level):
# As each stack is done, the level numbers decrease, so we need
# to account for that when level is a sequence of ints
result = frame
# _get_level_number() checks level numbers are in range and converts
# negative numbers to positive
level = [frame.columns._get_level_number(lev) for lev in level]
while level:
lev = level.pop(0)
result = stack(result, lev, dropna=dropna, sort=sort)
# Decrement all level numbers greater than current, as these
# have now shifted down by one
level = [v if v <= lev else v - 1 for v in level]
else:
raise ValueError(
"level should contain all level names or all level "
"numbers, not a mixture of the two."
)
return result
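# Illustrative sketch (not part of pandas): stacking several column levels at
# once, which dispatches to stack_multiple above. Data is invented.
def _example_stack_multiple():  # hypothetical helper, never called by pandas
    import pandas as pd

    columns = pd.MultiIndex.from_product(
        [["A", "B"], ["x", "y"]], names=["grp", "sub"]
    )
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=columns)
    # Both column levels become index levels, leaving a Series.
    return df.stack(["grp", "sub"])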
def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
"""Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
if len(columns.levels) <= 2:
return columns.levels[0]._rename(name=columns.names[0])
levs = [
[lev[c] if c >= 0 else None for c in codes]
for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
]
# Remove duplicate tuples in the MultiIndex.
tuples = zip(*levs)
unique_tuples = (key for key, _ in itertools.groupby(tuples))
new_levs = zip(*unique_tuples)
# The dtype of each level must be explicitly set to avoid inferring the wrong type.
# See GH-36991.
return MultiIndex.from_arrays(
[
# Not all indices can accept None values.
Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
for new_lev, lev in zip(new_levs, columns.levels)
],
names=columns.names[:-1],
)
def _stack_multi_columns(
frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
) -> DataFrame:
def _convert_level_number(level_num: int, columns: Index):
"""
Logic for converting the level number to something we can safely pass
to swaplevel.
If `level_num` matches a column name return the name from
position `level_num`, otherwise return `level_num`.
"""
if level_num in columns.names:
return columns.names[level_num]
return level_num
this = frame.copy(deep=False)
mi_cols = this.columns # cast(MultiIndex, this.columns)
assert isinstance(mi_cols, MultiIndex) # caller is responsible
# this makes life much simpler
if level_num != mi_cols.nlevels - 1:
# roll levels to put selected level at end
roll_columns = mi_cols
for i in range(level_num, mi_cols.nlevels - 1):
# Need to check if the ints conflict with level names
lev1 = _convert_level_number(i, roll_columns)
lev2 = _convert_level_number(i + 1, roll_columns)
roll_columns = roll_columns.swaplevel(lev1, lev2)
this.columns = mi_cols = roll_columns
if not mi_cols._is_lexsorted() and sort:
# Workaround the edge case where 0 is one of the column names,
# which interferes with trying to sort based on the first
# level
level_to_sort = _convert_level_number(0, mi_cols)
this = this.sort_index(level=level_to_sort, axis=1)
mi_cols = this.columns
mi_cols = cast(MultiIndex, mi_cols)
new_columns = _stack_multi_column_index(mi_cols)
# time to ravel the values
new_data = {}
level_vals = mi_cols.levels[-1]
level_codes = unique(mi_cols.codes[-1])
if sort:
level_codes = np.sort(level_codes)
level_vals_nan = level_vals.insert(len(level_vals), None)
level_vals_used = np.take(level_vals_nan, level_codes)
levsize = len(level_codes)
drop_cols = []
for key in new_columns:
try:
loc = this.columns.get_loc(key)
except KeyError:
drop_cols.append(key)
continue
# can make more efficient?
# we almost always return a slice
# but if unsorted can get a boolean
# indexer
if not isinstance(loc, slice):
slice_len = len(loc)
else:
slice_len = loc.stop - loc.start
if slice_len != levsize:
chunk = this.loc[:, this.columns[loc]]
chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
value_slice = chunk.reindex(columns=level_vals_used).values
else:
subset = this.iloc[:, loc]
dtype = find_common_type(subset.dtypes.tolist())
if isinstance(dtype, ExtensionDtype):
# TODO(EA2D): won't need special case, can go through .values
# paths below (might change to ._values)
value_slice = dtype.construct_array_type()._concat_same_type(
[x._values.astype(dtype, copy=False) for _, x in subset.items()]
)
N, K = subset.shape
idx = np.arange(N * K).reshape(K, N).T.ravel()
value_slice = value_slice.take(idx)
else:
value_slice = subset.values
if value_slice.ndim > 1:
# i.e. not extension
value_slice = value_slice.ravel()
new_data[key] = value_slice
if len(drop_cols) > 0:
new_columns = new_columns.difference(drop_cols)
N = len(this)
if isinstance(this.index, MultiIndex):
new_levels = list(this.index.levels)
new_names = list(this.index.names)
new_codes = [lab.repeat(levsize) for lab in this.index.codes]
else:
old_codes, old_levels = factorize_from_iterable(this.index)
new_levels = [old_levels]
new_codes = [old_codes.repeat(levsize)]
new_names = [this.index.name] # something better?
new_levels.append(level_vals)
new_codes.append(np.tile(level_codes, N))
new_names.append(frame.columns.names[level_num])
new_index = MultiIndex(
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)
result = frame._constructor(new_data, index=new_index, columns=new_columns)
if frame.columns.nlevels > 1:
desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]
# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
if dropna:
result = result.dropna(axis=0, how="all")
return result
def _reorder_for_extension_array_stack(
arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
"""
Re-orders the values when stacking multiple extension-arrays.
The indirect stacking method used for EAs requires a followup
take to get the order correct.
Parameters
----------
arr : ExtensionArray
n_rows, n_columns : int
The number of rows and columns in the original DataFrame.
Returns
-------
taken : ExtensionArray
The original `arr` with elements re-ordered appropriately
Examples
--------
>>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
>>> _reorder_for_extension_array_stack(arr, 2, 3)
array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
>>> _reorder_for_extension_array_stack(arr, 3, 2)
array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
"""
# final take to get the order correct.
# idx is an indexer like
# [c0r0, c1r0, c2r0, ...,
# c0r1, c1r1, c2r1, ...]
idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
return arr.take(idx)
def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
if frame.columns.nunique() != len(frame.columns):
raise ValueError("Columns with duplicate values are not supported in stack")
# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level, reverse=True)
stack_cols = frame.columns._drop_level_numbers(
[k for k in range(frame.columns.nlevels) if k not in level][::-1]
)
if len(level) > 1:
# Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
sorter = np.argsort(level)
ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
else:
ordered_stack_cols = stack_cols
stack_cols_unique = stack_cols.unique()
ordered_stack_cols_unique = ordered_stack_cols.unique()
# Grab data for each unique index to be stacked
buf = []
for idx in stack_cols_unique:
if len(frame.columns) == 1:
data = frame.copy()
else:
# Take the data from frame corresponding to this idx value
if len(level) == 1:
idx = (idx,)
gen = iter(idx)
column_indexer = tuple(
next(gen) if k in level else slice(None)
for k in range(frame.columns.nlevels)
)
data = frame.loc[:, column_indexer]
if len(level) < frame.columns.nlevels:
data.columns = data.columns._drop_level_numbers(drop_levnums)
elif stack_cols.nlevels == 1:
if data.ndim == 1:
data.name = 0
else:
data.columns = RangeIndex(len(data.columns))
buf.append(data)
result: Series | DataFrame
if len(buf) > 0 and not frame.empty:
result = concat(buf)
ratio = len(result) // len(frame)
else:
# input is empty
if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
else:
new_columns = [0]
result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
ratio = 0
if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]
# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
index_levels: list | FrozenList
if isinstance(frame.index, MultiIndex):
index_levels = frame.index.levels
index_codes = list(np.tile(frame.index.codes, (1, ratio)))
else:
codes, uniques = factorize(frame.index, use_na_sentinel=False)
index_levels = [uniques]
index_codes = list(np.tile(codes, (1, ratio)))
if isinstance(stack_cols, MultiIndex):
column_levels = ordered_stack_cols.levels
column_codes = ordered_stack_cols.drop_duplicates().codes
else:
column_levels = [ordered_stack_cols.unique()]
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
result.index = MultiIndex(
levels=index_levels + column_levels,
codes=index_codes + column_codes,
names=frame.index.names + list(ordered_stack_cols.names),
verify_integrity=False,
)
# sort result, but faster than calling sort_index since we know the order we need
len_df = len(frame)
n_uniques = len(ordered_stack_cols_unique)
indexer = np.arange(n_uniques)
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
result = result.take(idxs)
# Reshape/rename if needed and dropna
if result.ndim == 2 and frame.columns.nlevels == len(level):
if len(result.columns) == 0:
result = Series(index=result.index)
else:
result = result.iloc[:, 0]
if result.ndim == 1:
result.name = None
return result
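# Illustrative sketch (not part of pandas): stack_v3 above backs the
# future_stack=True behaviour introduced in pandas 2.1, which keeps NA slots
# instead of dropping them. Data is made up.
def _example_future_stack():  # hypothetical helper, never called by pandas
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [3.0, 4.0]}, index=["x", "y"])
    # Unlike the legacy implementation with dropna=True, the ("y", "a") slot
    # is kept as NaN rather than being dropped from the result.
    return df.stack(future_stack=True)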

View File

@ -0,0 +1,638 @@
"""
Quantilization functions and related stuff
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
)
import numpy as np
from pandas._libs import (
Timedelta,
Timestamp,
lib,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_bool_dtype,
is_integer,
is_list_like,
is_numeric_dtype,
is_scalar,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.dtypes.missing import isna
from pandas import (
Categorical,
Index,
IntervalIndex,
)
import pandas.core.algorithms as algos
from pandas.core.arrays.datetimelike import dtype_to_unit
if TYPE_CHECKING:
from pandas._typing import (
DtypeObj,
IntervalLeftRight,
)
def cut(
x,
bins,
right: bool = True,
labels=None,
retbins: bool = False,
precision: int = 3,
include_lowest: bool = False,
duplicates: str = "raise",
ordered: bool = True,
):
"""
Bin values into discrete intervals.
Use `cut` when you need to segment and sort data values into bins. This
function is also useful for going from a continuous variable to a
categorical variable. For example, `cut` could convert ages to groups of
age ranges. Supports binning into an equal number of bins, or a
pre-specified array of bins.
Parameters
----------
x : array-like
The input array to be binned. Must be 1-dimensional.
bins : int, sequence of scalars, or IntervalIndex
The criteria to bin by.
* int : Defines the number of equal-width bins in the range of `x`. The
range of `x` is extended by .1% on each side to include the minimum
and maximum values of `x`.
* sequence of scalars : Defines the bin edges allowing for non-uniform
width. No extension of the range of `x` is done.
* IntervalIndex : Defines the exact bins to be used. Note that
IntervalIndex for `bins` must be non-overlapping.
right : bool, default True
Indicates whether `bins` includes the rightmost edge or not. If
``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
indicate (1,2], (2,3], (3,4]. This argument is ignored when
`bins` is an IntervalIndex.
labels : array or False, default None
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
bins. This affects the type of the output container (see below).
This argument is ignored when `bins` is an IntervalIndex. If True,
raises an error. When `ordered=False`, labels must be provided.
retbins : bool, default False
Whether to return the bins or not. Useful when bins is provided
as a scalar.
precision : int, default 3
The precision at which to store and display the bins labels.
include_lowest : bool, default False
Whether the first interval should be left-inclusive or not.
duplicates : {default 'raise', 'drop'}, optional
If bin edges are not unique, raise ValueError or drop non-uniques.
ordered : bool, default True
Whether the labels are ordered or not. Applies to returned types
Categorical and Series (with Categorical dtype). If True,
the resulting categorical will be ordered. If False, the resulting
categorical will be unordered (labels must be provided).
Returns
-------
out : Categorical, Series, or ndarray
An array-like object representing the respective bin for each value
of `x`. The type depends on the value of `labels`.
* None (default) : returns a Series for Series `x` or a
Categorical for all other inputs. The values stored within
are Interval dtype.
* sequence of scalars : returns a Series for Series `x` or a
Categorical for all other inputs. The values stored within
are whatever the type in the sequence is.
* False : returns an ndarray of integers.
bins : numpy.ndarray or IntervalIndex.
The computed or specified bins. Only returned when `retbins=True`.
For scalar or sequence `bins`, this is an ndarray with the computed
bins. If `duplicates=drop` is set, `bins` will drop non-unique bins. For
an IntervalIndex `bins`, this is equal to `bins`.
See Also
--------
qcut : Discretize variable into equal-sized buckets based on rank
or based on sample quantiles.
Categorical : Array type for storing data that come from a
fixed set of values.
Series : One-dimensional array with axis labels (including time series).
IntervalIndex : Immutable Index implementing an ordered, sliceable set.
Notes
-----
Any NA values will be NA in the result. Out of bounds values will be NA in
the resulting Series or Categorical object.
Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.
Examples
--------
Discretize into three equal-sized bins.
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
... # doctest: +ELLIPSIS
[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
... # doctest: +ELLIPSIS
([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
array([0.994, 3. , 5. , 7. ]))
Discovers the same bins, but assigns them specific labels. Notice that
the returned Categorical's categories are `labels` and it is ordered.
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
... 3, labels=["bad", "medium", "good"])
['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']
``ordered=False`` will result in unordered categories when labels are passed.
This parameter can be used to allow non-unique labels:
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
... labels=["B", "A", "B"], ordered=False)
['B', 'B', 'A', 'A', 'B', 'B']
Categories (2, object): ['A', 'B']
``labels=False`` implies you just want the bins back.
>>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
array([0, 1, 1, 3])
Passing a Series as an input returns a Series with categorical dtype:
>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
... index=['a', 'b', 'c', 'd', 'e'])
>>> pd.cut(s, 3)
... # doctest: +ELLIPSIS
a (1.992, 4.667]
b (1.992, 4.667]
c (4.667, 7.333]
d (7.333, 10.0]
e (7.333, 10.0]
dtype: category
Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...
Passing a Series as an input returns a Series with the mapped values.
It can be used to map values numerically to intervals based on bins.
>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
... index=['a', 'b', 'c', 'd', 'e'])
>>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
... # doctest: +ELLIPSIS
(a 1.0
b 2.0
c 3.0
d 4.0
e NaN
dtype: float64,
array([ 0, 2, 4, 6, 8, 10]))
Use the `drop` option when the bin edges are not unique.
>>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
... right=False, duplicates='drop')
... # doctest: +ELLIPSIS
(a 1.0
b 2.0
c 3.0
d 3.0
e NaN
dtype: float64,
array([ 0, 2, 4, 6, 10]))
Passing an IntervalIndex for `bins` results in those categories exactly.
Notice that values not covered by the IntervalIndex are set to NaN. 0
is to the left of the first bin (which is closed on the right), and 1.5
falls between two bins.
>>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
[NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
original = x
x_idx = _preprocess_for_cut(x)
x_idx, _ = _coerce_to_type(x_idx)
if not np.iterable(bins):
bins = _nbins_to_bins(x_idx, bins, right)
elif isinstance(bins, IntervalIndex):
if bins.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")
else:
bins = Index(bins)
if not bins.is_monotonic_increasing:
raise ValueError("bins must increase monotonically.")
fac, bins = _bins_to_cuts(
x_idx,
bins,
right=right,
labels=labels,
precision=precision,
include_lowest=include_lowest,
duplicates=duplicates,
ordered=ordered,
)
return _postprocess_for_cut(fac, bins, retbins, original)
def qcut(
x,
q,
labels=None,
retbins: bool = False,
precision: int = 3,
duplicates: str = "raise",
):
"""
Quantile-based discretization function.
Discretize variable into equal-sized buckets based on rank or based
on sample quantiles. For example 1000 values for 10 quantiles would
produce a Categorical object indicating quantile membership for each data point.
Parameters
----------
x : 1d ndarray or Series
q : int or list-like of float
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
labels : array or False, default None
Used as labels for the resulting bins. Must be of the same length as
the resulting bins. If False, return only integer indicators of the
bins. If True, raises an error.
retbins : bool, optional
Whether to return the (bins, labels) or not. Can be useful if bins
is given as a scalar.
precision : int, optional
The precision at which to store and display the bins labels.
duplicates : {default 'raise', 'drop'}, optional
If bin edges are not unique, raise ValueError or drop non-uniques.
Returns
-------
out : Categorical or Series or array of integers if labels is False
The return type (Categorical or Series) depends on the input: a Series
of type category if input is a Series else Categorical. Bins are
represented as categories when categorical data is returned.
bins : ndarray of floats
Returned only if `retbins` is True.
Notes
-----
Out of bounds values will be NA in the resulting Categorical object
Examples
--------
>>> pd.qcut(range(5), 4)
... # doctest: +ELLIPSIS
[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...
>>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
... # doctest: +SKIP
[good, good, medium, bad, bad]
Categories (3, object): [good < medium < bad]
>>> pd.qcut(range(5), 4, labels=False)
array([0, 0, 1, 2, 3])
"""
original = x
x_idx = _preprocess_for_cut(x)
x_idx, _ = _coerce_to_type(x_idx)
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
bins = x_idx.to_series().dropna().quantile(quantiles)
fac, bins = _bins_to_cuts(
x_idx,
Index(bins),
labels=labels,
precision=precision,
include_lowest=True,
duplicates=duplicates,
)
return _postprocess_for_cut(fac, bins, retbins, original)
def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
"""
If a user passed an integer N for bins, convert this to a sequence of N
equal(ish)-sized bins.
"""
if is_scalar(nbins) and nbins < 1:
raise ValueError("`bins` should be a positive integer.")
if x_idx.size == 0:
raise ValueError("Cannot cut empty array")
rng = (x_idx.min(), x_idx.max())
mn, mx = rng
if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)
if mn == mx: # adjust end points before binning
if _is_dt_or_td(x_idx.dtype):
# using seconds=1 is pretty arbitrary here
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
td = Timedelta(seconds=1).as_unit(unit)
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
)
else:
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
if _is_dt_or_td(x_idx.dtype):
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
)
else:
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
else:
bins[-1] += adj
return Index(bins)
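# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It shows the edge adjustment above through the public ``pd.cut`` API
# (assuming a recent pandas release): with an integer ``bins`` and the default
# ``right=True``, the lowest edge is pushed 0.1% of the range below the data
# minimum, so the smallest value still lands in the first bin.
def _example_integer_bins() -> None:
    import pandas as pd

    codes = pd.cut(range(5), bins=2, labels=False)
    # Values 0, 1 and 2 fall in the first bin; 3 and 4 in the second.
    assert list(codes) == [0, 0, 0, 1, 1]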
def _bins_to_cuts(
x_idx: Index,
bins: Index,
right: bool = True,
labels=None,
precision: int = 3,
include_lowest: bool = False,
duplicates: str = "raise",
ordered: bool = True,
):
if not ordered and labels is None:
raise ValueError("'labels' must be provided if 'ordered = False'")
if duplicates not in ["raise", "drop"]:
raise ValueError(
"invalid value for 'duplicates' parameter, valid options are: raise, drop"
)
result: Categorical | np.ndarray
if isinstance(bins, IntervalIndex):
# we have a fast-path here
ids = bins.get_indexer(x_idx)
cat_dtype = CategoricalDtype(bins, ordered=True)
result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
return result, bins
unique_bins = algos.unique(bins)
if len(unique_bins) < len(bins) and len(bins) != 2:
if duplicates == "raise":
raise ValueError(
f"Bin edges must be unique: {repr(bins)}.\n"
f"You can drop duplicate edges by setting the 'duplicates' kwarg"
)
bins = unique_bins
side: Literal["left", "right"] = "left" if right else "right"
try:
ids = bins.searchsorted(x_idx, side=side)
except TypeError as err:
# e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
# is integers
if x_idx.dtype.kind == "m":
raise ValueError("bins must be of timedelta64 dtype") from err
elif x_idx.dtype.kind == bins.dtype.kind == "M":
raise ValueError(
"Cannot use timezone-naive bins with timezone-aware values, "
"or vice-versa"
) from err
elif x_idx.dtype.kind == "M":
raise ValueError("bins must be of datetime64 dtype") from err
else:
raise
ids = ensure_platform_int(ids)
if include_lowest:
ids[x_idx == bins[0]] = 1
na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()
if labels is not False:
if not (labels is None or is_list_like(labels)):
raise ValueError(
"Bin labels must either be False, None or passed in as a "
"list-like argument"
)
if labels is None:
labels = _format_labels(
bins, precision, right=right, include_lowest=include_lowest
)
elif ordered and len(set(labels)) != len(labels):
raise ValueError(
"labels must be unique if ordered=True; pass ordered=False "
"for duplicate labels"
)
else:
if len(labels) != len(bins) - 1:
raise ValueError(
"Bin labels must be one fewer than the number of bin edges"
)
if not isinstance(getattr(labels, "dtype", None), CategoricalDtype):
labels = Categorical(
labels,
categories=labels if len(set(labels)) == len(labels) else None,
ordered=ordered,
)
# TODO: handle mismatch between categorical label order and pandas.cut order.
np.putmask(ids, na_mask, 0)
result = algos.take_nd(labels, ids - 1)
else:
result = ids - 1
if has_nas:
result = result.astype(np.float64)
np.putmask(result, na_mask, np.nan)
return result, bins
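# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It demonstrates the ``duplicates`` handling implemented above via the
# public ``pd.cut`` API (assuming a recent pandas release): repeated explicit
# edges raise by default and are collapsed with ``duplicates="drop"``.
def _example_duplicate_edges() -> None:
    import pandas as pd

    try:
        pd.cut([1, 2, 3], bins=[0, 2, 2, 3])
    except ValueError:
        pass  # the default duplicates="raise" rejects the repeated edge 2
    else:
        raise AssertionError("expected a ValueError for duplicate bin edges")
    result = pd.cut([1, 2, 3], bins=[0, 2, 2, 3], duplicates="drop")
    # After dropping the duplicate edge, only two intervals remain.
    assert len(result.categories) == 2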
def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
"""
if the passed data is of datetime/timedelta, bool or nullable int type,
this method converts it to numeric so that cut or qcut method can
handle it
"""
dtype: DtypeObj | None = None
if _is_dt_or_td(x.dtype):
dtype = x.dtype
elif is_bool_dtype(x.dtype):
# GH 20303
x = x.astype(np.int64)
# To support cut and qcut for IntegerArray we convert to float dtype.
# Will properly support in the future.
# https://github.com/pandas-dev/pandas/pull/31290
# https://github.com/pandas-dev/pandas/issues/31389
elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype):
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
x = Index(x_arr)
return Index(x), dtype
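# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It shows the coercion above at work through ``pd.cut`` (assuming a
# recent pandas release): nullable-integer data is converted to float64 first,
# so missing values simply come back as missing categories.
def _example_nullable_integer_input() -> None:
    import pandas as pd

    data = pd.array([1, 2, None, 4], dtype="Int64")
    result = pd.cut(data, bins=3)
    # The missing entry stays missing in the resulting Categorical.
    assert list(result.isna()) == [False, False, True, False]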
def _is_dt_or_td(dtype: DtypeObj) -> bool:
# Note: the dtype here comes from an Index.dtype, so we know that any
# dt64/td64 dtype is of a supported unit.
return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")
def _format_labels(
bins: Index,
precision: int,
right: bool = True,
include_lowest: bool = False,
):
"""based on the dtype, return our labels"""
closed: IntervalLeftRight = "right" if right else "left"
formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
if _is_dt_or_td(bins.dtype):
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type]
formatter = lambda x: x
adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
else:
precision = _infer_precision(precision, bins)
formatter = lambda x: _round_frac(x, precision)
adjust = lambda x: x - 10 ** (-precision)
breaks = [formatter(b) for b in bins]
if right and include_lowest:
# adjust lhs of first interval by precision to account for being right closed
breaks[0] = adjust(breaks[0])
if _is_dt_or_td(bins.dtype):
# error: "Index" has no attribute "as_unit"
breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined]
return IntervalIndex.from_breaks(breaks, closed=closed)
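# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It shows the ``include_lowest`` adjustment performed above through
# ``pd.cut`` (assuming a recent pandas release): the left edge of the first
# label is nudged down by ``10 ** -precision`` so the minimum value is covered.
def _example_include_lowest() -> None:
    import pandas as pd

    result = pd.cut([1, 2, 3], bins=[1, 2, 3], include_lowest=True)
    # The first displayed interval starts just below 1, and the value 1 is
    # assigned to it rather than becoming missing.
    assert result.categories[0].left < 1
    assert result[0] == result.categories[0]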
def _preprocess_for_cut(x) -> Index:
"""
handles preprocessing for cut where we convert passed
input to array, strip the index information and store it
separately
"""
# Check that the passed array is a Pandas or Numpy object
# We don't want to strip away a Pandas data-type here (e.g. datetimetz)
ndim = getattr(x, "ndim", None)
if ndim is None:
x = np.asarray(x)
if x.ndim != 1:
raise ValueError("Input array must be 1 dimensional")
return Index(x)
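# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It confirms the 1-D requirement enforced above via ``pd.cut``
# (assuming a recent pandas release): multi-dimensional input is rejected.
def _example_requires_1d_input() -> None:
    import numpy as np
    import pandas as pd

    try:
        pd.cut(np.arange(6).reshape(2, 3), bins=3)
    except ValueError as err:
        assert "1 dimensional" in str(err)
    else:
        raise AssertionError("expected a ValueError for 2-D input")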
def _postprocess_for_cut(fac, bins, retbins: bool, original):
"""
handles post processing for the cut method where
we combine the index information if the originally passed
datatype was a series
"""
if isinstance(original, ABCSeries):
fac = original._constructor(fac, index=original.index, name=original.name)
if not retbins:
return fac
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
bins = bins._values
return fac, bins
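# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It shows the Series round-trip handled above (assuming a recent
# pandas release): the original index and name are re-attached to the result.
def _example_series_roundtrip() -> None:
    import pandas as pd

    ser = pd.Series([1, 2, 3], index=["a", "b", "c"], name="vals")
    result = pd.cut(ser, bins=3)
    assert isinstance(result, pd.Series)
    assert list(result.index) == ["a", "b", "c"]
    assert result.name == "vals"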
def _round_frac(x, precision: int):
"""
Round the fractional part of the given number
"""
if not np.isfinite(x) or x == 0:
return x
else:
frac, whole = np.modf(x)
if whole == 0:
digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
else:
digits = precision
return np.around(x, digits)
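# --- Illustrative sketch added by the editor; not part of the original pandas
# module. A small worked example of the rounding rule above: values with a
# nonzero whole part are rounded at ``precision`` decimals, while purely
# fractional values keep roughly ``precision`` significant digits instead of
# being rounded away to zero.
def _example_round_frac() -> None:
    assert abs(_round_frac(1.23456, 3) - 1.235) < 1e-9
    assert abs(_round_frac(0.000987, 3) - 0.000987) < 1e-9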
def _infer_precision(base_precision: int, bins: Index) -> int:
"""
Infer an appropriate precision for _round_frac
"""
for precision in range(base_precision, 20):
levels = np.asarray([_round_frac(b, precision) for b in bins])
if algos.unique(levels).size == bins.size:
return precision
return base_precision # default
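# --- Illustrative sketch added by the editor; not part of the original pandas
# module. A small worked example of the precision search above: edges that
# collide when rounded at 3 decimals force the precision up until the rounded
# edges become distinct again.
def _example_infer_precision() -> None:
    from pandas import Index

    assert _infer_precision(3, Index([1.0001, 1.0002])) == 4
    assert _infer_precision(3, Index([1.25, 2.5])) == 3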

View File

@ -0,0 +1,85 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.common import is_list_like
if TYPE_CHECKING:
from pandas._typing import NumpyIndexT
def cartesian_product(X) -> list[np.ndarray]:
"""
Numpy version of itertools.product.
Sometimes faster (for large inputs)...

Parameters
----------
X : list-like of list-likes

Returns
-------
product : list of ndarrays

Examples
--------
>>> cartesian_product([list('ABC'), [1, 2]])
[array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]

See Also
--------
itertools.product : Cartesian product of input iterables. Equivalent to
nested for-loops.
"""
msg = "Input must be a list-like of list-likes"
if not is_list_like(X):
raise TypeError(msg)
for x in X:
if not is_list_like(x):
raise TypeError(msg)
if len(X) == 0:
return []
lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
cumprodX = np.cumprod(lenX)
if np.any(cumprodX < 0):
raise ValueError("Product space too large to allocate arrays!")
a = np.roll(cumprodX, 1)
a[0] = 1
if cumprodX[-1] != 0:
b = cumprodX[-1] / cumprodX
else:
# if any factor is empty, the cartesian product is empty
b = np.zeros_like(cumprodX)
# error: Argument of type "int_" cannot be assigned to parameter "num" of
# type "int" in function "tile_compat"
return [
tile_compat(
np.repeat(x, b[i]),
np.prod(a[i]),
)
for i, x in enumerate(X)
]
def tile_compat(arr: NumpyIndexT, num: int) -> NumpyIndexT:
"""
Index compat for np.tile.

Notes
-----
Does not support multi-dimensional `num`.
"""
if isinstance(arr, np.ndarray):
return np.tile(arr, num)
# Otherwise we have an Index
taker = np.tile(np.arange(len(arr)), num)
return arr.take(taker)
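# --- Illustrative sketch added by the editor; not part of the original pandas
# module. It exercises the two helpers above (assuming a recent pandas
# release): ``cartesian_product`` repeats the first factor and tiles the
# second, and ``tile_compat`` keeps a pandas Index as an Index instead of
# degrading it to an ndarray the way ``np.tile`` would.
def _example_cartesian_product() -> None:
    import pandas as pd

    first, second = cartesian_product([["a", "b"], [1, 2, 3]])
    assert list(first) == ["a", "a", "a", "b", "b", "b"]
    assert list(second) == [1, 2, 3, 1, 2, 3]
    idx = tile_compat(pd.Index(["x", "y"]), 2)
    assert isinstance(idx, pd.Index)
    assert list(idx) == ["x", "y", "x", "y"]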