-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Categorical.(get|from)_dummies #34426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b5ab7f2
f937c96
dd14132
9dc9da5
ac9cec2
0459cb1
65e68c2
1334026
c2240b6
66771bf
4e769da
fe002af
097f2c6
afe8eda
e78158e
4fb1e5e
9fa5494
61567fd
5d724cc
1182ce5
04ca72a
6e4f71a
a761baf
ed58c77
741cf8f
6f199b6
034f8e1
b80f089
0eb936f
8f212e1
bda5265
6e6ddda
9fcebf0
b9908c4
faeec41
e11f28e
742c940
722137d
4945ba8
1f98233
ff01048
604b839
c71e807
6f9272a
8fd4b72
534bc33
0facec6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
from functools import partial | ||
import operator | ||
from shutil import get_terminal_size | ||
from typing import Dict, Hashable, List, Type, Union, cast | ||
from typing import TYPE_CHECKING, Any, Dict, Hashable, List, Optional, Type, Union, cast | ||
from warnings import warn | ||
|
||
import numpy as np | ||
|
@@ -55,6 +55,9 @@ | |
|
||
from pandas.io.formats import console | ||
|
||
if TYPE_CHECKING: | ||
from pandas._typing import DataFrame # noqa: F401 | ||
|
||
|
||
def _cat_compare_op(op): | ||
opname = f"__{op.__name__}__" | ||
|
@@ -370,6 +373,221 @@ def __init__( | |
self._dtype = self._dtype.update_dtype(dtype) | ||
self._codes = coerce_indexer_dtype(codes, dtype.categories) | ||
|
||
@classmethod
def from_dummies(
    cls,
    dummies: "DataFrame",
    ordered: Optional[bool] = None,
    prefix: Optional[str] = None,
    prefix_sep: str = "_",
    fillna: Optional[bool] = None,
) -> "Categorical":
    """
    Create a `Categorical` using a ``DataFrame`` of dummy variables.

    Can use a subset of columns based on the ``prefix``
    and ``prefix_sep`` parameters.

    The ``DataFrame`` must have no more than one truthy value per row.
    The columns of the ``DataFrame`` become the categories of the `Categorical`.
    A column whose header is NA will be dropped, so a row whose only truthy
    value sits in such a column ends up uncategorised.
    NA *cells* are handled according to ``fillna``.

    Parameters
    ----------
    dummies : DataFrame
        dtypes of columns with non-NA headers must be coercible to bool.
        Sparse dataframes are not supported.
    ordered : bool, optional
        Whether or not this Categorical is ordered.
    prefix : str, optional
        Only take columns whose names are strings starting
        with this prefix and ``prefix_sep``,
        stripping those elements from the resulting category names.
    prefix_sep : str, default "_"
        If ``prefix`` is not ``None``, use as the separator
        between the prefix and the final name of the category.
    fillna : bool, optional
        How to handle NA values.
        If ``True`` or ``False``, NA is filled with that value.
        If ``None``, raise a ValueError if there are any NA values.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If a sample belongs to >1 category, or if NA values remain in the
        selected columns and ``fillna`` is ``None``.

    See Also
    --------
    Categorical.get_dummies : The inverse operation.
    pandas.get_dummies : Convert categorical variable to dummy/indicator
        variables.

    Examples
    --------
    >>> simple = pd.DataFrame(np.eye(3), columns=["a", "b", "c"])
    >>> Categorical.from_dummies(simple)
    [a, b, c]
    Categories (3, object): [a, b, c]

    >>> nan_col = pd.DataFrame(np.eye(4), columns=["a", "b", np.nan, None])
    >>> Categorical.from_dummies(nan_col)
    [a, b, NaN, NaN]
    Categories (2, object): [a, b]

    >>> nan_cell = pd.DataFrame(
    ...     [[1, 0, np.nan], [0, 1, 0], [0, 0, 1]],
    ...     columns=["a", "b", "c"],
    ... )
    >>> Categorical.from_dummies(nan_cell)
    Traceback (most recent call last):
        ...
    ValueError: Unhandled NA values in dummy array

    >>> Categorical.from_dummies(nan_cell, fillna=False)
    [a, b, c]
    Categories (3, object): [a, b, c]

    >>> multi = pd.DataFrame(
    ...     [[1, 0, 1], [0, 1, 0], [0, 0, 1]],
    ...     columns=["a", "b", "c"],
    ... )
    >>> Categorical.from_dummies(multi)
    Traceback (most recent call last):
        ...
    ValueError: 1 record(s) belongs to multiple categories: [0]
    """
    # NOTE(review): reviewers asked that this constructor either be removed
    # or made to mirror ``get_dummies``; the visible thread does not settle it.

    # Columns with an NA header can never name a category, so discard them
    # before anything else looks at the frame.
    na_headers = dummies.columns[isna(dummies.columns)]
    if len(na_headers):
        dummies = dummies.drop(columns=na_headers)

    cats: List[Any]
    if prefix is None:
        cats = list(dummies.columns)
    else:
        pref = prefix + (prefix_sep or "")
        # Non-string headers cannot carry the prefix and are ignored.
        to_keep: List[str] = [
            col
            for col in dummies.columns
            if isinstance(col, str) and col.startswith(pref)
        ]
        cats = [col[len(pref) :] for col in to_keep]
        dummies = dummies[to_keep]

    # The nullable "boolean" dtype keeps NA cells distinguishable from False.
    df = dummies.astype("boolean")
    if fillna is not None:
        df = df.fillna(fillna)

    # skipna=False lets any NA cell that survived ``fillna`` propagate into
    # its row total, so it is reported below instead of being silently
    # counted as False.
    row_totals = df.sum(axis=1, skipna=False)
    if row_totals.isna().any():
        # TODO: reviewers flagged this branch as not yet covered by tests.
        raise ValueError("Unhandled NA values in dummy array")

    multicat_rows = row_totals > 1
    if multicat_rows.any():
        raise ValueError(
            f"{multicat_rows.sum()} record(s) belongs to multiple categories: "
            f"{list(df.index[multicat_rows])}"
        )

    # ``np.nonzero`` yields *positional* row/column numbers, so the codes are
    # filled in a plain ndarray. Writing them through a Series indexed like
    # ``df`` would treat the positions as labels and mis-assign rows whenever
    # the index is not the default 0..n-1 range.
    # Rows with no truthy cell keep the -1 "missing" code.
    codes = np.full(len(df), -1, dtype=np.intp)
    row_idx, col_idx = np.nonzero(df.to_numpy(dtype=bool))
    codes[row_idx] = col_idx

    return cls.from_codes(codes, cats, ordered=ordered)
|
||
def get_dummies(
    self,
    prefix: Optional[str] = None,
    prefix_sep: str = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Optional[Dtype] = None,
) -> "DataFrame":
    """
    Convert into dummy/indicator variables.

    This is a thin wrapper that forwards every argument unchanged to
    :func:`pandas.get_dummies`, with this Categorical as the data.

    Parameters
    ----------
    prefix : str, default None
        String to prepend to each category label when naming the
        resulting columns.
    prefix_sep : str, default '_'
        If a prefix is given, separator/delimiter to place between the
        prefix and the category label.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.
    pandas.get_dummies : Convert categorical variable to dummy/indicator variables.

    Examples
    --------
    >>> s = pd.Categorical(list('abca'))

    >>> s.get_dummies()
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = pd.Categorical(['a', 'b', np.nan])

    >>> s1.get_dummies()
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> s1.get_dummies(dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> pd.Categorical(list('abcaa')).get_dummies()
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.Categorical(list('abcaa')).get_dummies(drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.Categorical(list('abc')).get_dummies(dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    # Imported inside the method, as in the original; presumably to avoid a
    # circular import at module load time (TODO confirm).
    from pandas import get_dummies

    return get_dummies(
        self,
        prefix=prefix,
        prefix_sep=prefix_sep,
        dummy_na=dummy_na,
        sparse=sparse,
        drop_first=drop_first,
        dtype=dtype,
    )
|
||
@property | ||
def dtype(self) -> CategoricalDtype: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need good cross-links to the current `get_dummies` section; otherwise this is very confusing.
I would prefer that these actually live in the `get_dummies` section, with just a small note here.