diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index b7475ae7bb132..c8bdc2394ddf9 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -127,6 +127,45 @@ This conversion is likewise done column by column: df_cat['A'] df_cat['B'] +Dummy / indicator / one-hot encoded variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some operations, like regression and classification, +encode a single categorical variable as a column for each category, +with each row having False in all but one column (True). +These are called `dummy variables <https://en.wikipedia.org/wiki/Dummy_variable_(statistics)>`_, or one-hot encoding. +:class:`pandas.Categorical` objects can easily be converted to and from such an encoding. + +:meth:`pandas.Categorical.get_dummies` produces a dataframe of dummy variables. +It works in the same way and supports most of the same arguments as :func:`pandas.get_dummies`. + +.. ipython:: python + + cat = pd.Categorical(["a", "b", "b", "c"]) + cat + + cat.get_dummies() + +The :meth:`pandas.Categorical.from_dummies` class method accepts a dataframe +whose dtypes are coercible to boolean, and an ``ordered`` argument +for whether the resulting ``Categorical`` should be considered ordered +(like the ``Categorical`` constructor). +A column whose label is NA will be ignored. +Any row which is entirely falsey will be uncategorised; +missing values raise a ``ValueError`` unless the ``fillna`` argument is given. +In the same way that :func:`pandas.get_dummies` can add a prefix to string category names, +:meth:`~pandas.Categorical.from_dummies` can filter a dataframe for columns with a prefix: +the resulting ``Categorical`` will have the prefix stripped from its categories. + +.. ipython:: python + + dummies = pd.get_dummies(["a", "b", "b", "c"], prefix="cat") + dummies + + pd.Categorical.from_dummies(dummies, prefix="cat") + + +.. 
versionadded:: 1.2.0 Controlling behavior ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 1b90aeb00cf9c..a666bbd885baf 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -606,7 +606,7 @@ This function is often used along with discretization functions like ``cut``: pd.get_dummies(pd.cut(values, bins)) -See also :func:`Series.str.get_dummies <pandas.Series.str.get_dummies>`. +See also :func:`Series.str.get_dummies <pandas.Series.str.get_dummies>` and :meth:`Categorical.get_dummies <pandas.Categorical.get_dummies>`. :func:`get_dummies` also accepts a ``DataFrame``. By default all categorical variables (categorical in the statistical sense, those with `object` or @@ -679,6 +679,15 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes +A :class:`~pandas.Categorical` can be recovered from a :class:`~pandas.DataFrame` of such dummy variables using :meth:`~pandas.Categorical.from_dummies`. +Use the ``prefix`` and ``prefix_sep`` arguments to select and rename columns which have had a prefix applied in the same way as :func:`~pandas.get_dummies` does. + +.. ipython:: python + + df = pd.get_dummies(list("abca")) + + pd.Categorical.from_dummies(df) + .. 
_reshaping.factorize: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6a5b4b3b9ff16..c3ac951eb51b1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -120,6 +120,7 @@ Other enhancements - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- :meth:`Categorical.from_dummies` and :meth:`Categorical.get_dummies` convert between :class:`Categorical` and :class:`DataFrame` objects of dummy variables. .. _whatsnew_120.api_breaking.python: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ef69d6565cfeb..224e336fae9dd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Type, Union, cast +from typing import TYPE_CHECKING, Any, Dict, Hashable, List, Optional, Type, Union, cast from warnings import warn import numpy as np @@ -55,6 +55,9 @@ from pandas.io.formats import console +if TYPE_CHECKING: + from pandas._typing import DataFrame # noqa: F401 + def _cat_compare_op(op): opname = f"__{op.__name__}__" @@ -370,6 +373,221 @@ def __init__( self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) + @classmethod + def from_dummies( + cls, + dummies: "DataFrame", + ordered: Optional[bool] = None, + prefix: Optional[str] = None, + prefix_sep: 
str = "_", + fillna: Optional[bool] = None, + ) -> "Categorical": + """Create a `Categorical` using a ``DataFrame`` of dummy variables. + + Can use a subset of columns based on the ``prefix`` + and ``prefix_sep`` parameters. + + The ``DataFrame`` must have no more than one truthy value per row. + The columns of the ``DataFrame`` become the categories of the `Categorical`. + A column whose header is NA will be dropped: + any row containing a NA value will be uncategorised. + + Parameters + ---------- + dummies : DataFrame + dtypes of columns with non-NA headers must be coercible to bool. + Sparse dataframes are not supported. + ordered : bool + Whether or not this Categorical is ordered. + prefix : optional str + Only take columns whose names are strings starting + with this prefix and ``prefix_sep``, + stripping those elements from the resulting category names. + prefix_sep : str, default "_" + If ``prefix`` is not ``None``, use as the separator + between the prefix and the final name of the category. + fillna : optional bool, default None + How to handle NA values. + If ``True`` or ``False``, NA is filled with that value. + If ``None``, raise a ValueError if there are any NA values. + + Raises + ------ + ValueError + If a sample belongs to >1 category + + Returns + ------- + Categorical + + Examples + -------- + >>> simple = pd.DataFrame(np.eye(3), columns=["a", "b", "c"]) + >>> Categorical.from_dummies(simple) + [a, b, c] + Categories (3, object): [a, b, c] + + >>> nan_col = pd.DataFrame(np.eye(4), columns=["a", "b", np.nan, None]) + >>> Categorical.from_dummies(nan_col) + [a, b, NaN, NaN] + Categories (2, object): [a, b] + + >>> nan_cell = pd.DataFrame( + ... [[1, 0, np.nan], [0, 1, 0], [0, 0, 1]], + ... columns=["a", "b", "c"], + ... ) + >>> Categorical.from_dummies(nan_cell) + [NaN, b, c] + Categories (3, object): [a, b, c] + + >>> multi = pd.DataFrame( + ... [[1, 0, 1], [0, 1, 0], [0, 0, 1]], + ... columns=["a", "b", "c"], + ... 
) + >>> Categorical.from_dummies(multi) + Traceback (most recent call last): + ... + ValueError: 1 record(s) belongs to multiple categories: [0] + """ + from pandas import Series + + to_drop = dummies.columns[isna(dummies.columns)] + if len(to_drop): + dummies = dummies.drop(columns=to_drop) + + cats: List[Any] + if prefix is None: + cats = list(dummies.columns) + else: + pref = prefix + (prefix_sep or "") + cats = [] + to_keep: List[str] = [] + for c in dummies.columns: + if isinstance(c, str) and c.startswith(pref): + to_keep.append(c) + cats.append(c[len(pref) :]) + dummies = dummies[to_keep] + + df = dummies.astype("boolean") + if fillna is not None: + df = df.fillna(fillna) + + row_totals = df.sum(axis=1, skipna=False) + if row_totals.isna().any(): + raise ValueError("Unhandled NA values in dummy array") + + multicat_rows = row_totals > 1 + if multicat_rows.any(): + raise ValueError( + f"{multicat_rows.sum()} record(s) belongs to multiple categories: " + f"{list(df.index[multicat_rows])}" + ) + + codes = Series(np.full(len(row_totals), np.nan), index=df.index, dtype="Int64") + codes[row_totals == 0] = -1 + row_idx, code = np.nonzero(df) + codes[row_idx] = code + + return cls.from_codes(codes.fillna(-1), cats, ordered=ordered) + + def get_dummies( + self, + prefix: Optional[str] = None, + prefix_sep: str = "_", + dummy_na: bool = False, + sparse: bool = False, + drop_first: bool = False, + dtype: Dtype = None, + ) -> "DataFrame": + """ + Convert into dummy/indicator variables. + + Parameters + ---------- + prefix : str, default None + String to append DataFrame column names. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). 
+ drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. + + Returns + ------- + DataFrame + Dummy-coded data. + + See Also + -------- + Series.str.get_dummies : Convert Series to dummy codes. + pandas.get_dummies : Convert categorical variable to dummy/indicator variables. + + Examples + -------- + >>> s = pd.Categorical(list('abca')) + + >>> s.get_dummies() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = pd.Categorical(['a', 'b', np.nan]) + + >>> s1.get_dummies() + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> s1.get_dummies(dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> pd.Categorical(list('abcaa')).get_dummies() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> pd.Categorical(list('abcaa')).get_dummies(drop_first=True) + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 + + >>> pd.Categorical(list('abc')).get_dummies(dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import get_dummies + + return get_dummies( + self, + prefix=prefix, + prefix_sep=prefix_sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) + @property def dtype(self) -> CategoricalDtype: """ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 18ebe14763797..be5a39a9f90d4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -768,6 +768,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. + Categorical.get_dummies : Convert a Categorical array to dummy codes. 
Examples -------- diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 6fce4b4145ff2..d47841618d6f0 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series, get_dummies import pandas._testing as tm from pandas.core.arrays.categorical import recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical @@ -399,6 +399,36 @@ def test_remove_unused_categories(self): out = cat.remove_unused_categories() assert out.tolist() == val.tolist() + @pytest.mark.parametrize( + "vals", + [ + ["a", "b", "b", "a"], + ["a", "b", "b", "a", np.nan], + [1, 1.5, "a", (1, "b")], + [1, 1.5, "a", (1, "b"), np.nan], + ], + ) + def test_get_dummies(self, vals): + # GH 8745 + cats = Categorical(Series(vals)) + tm.assert_equal(cats.get_dummies(), get_dummies(cats)) + + @pytest.mark.parametrize( + "vals", + [ + ["a", "b", "b", "a"], + ["a", "b", "b", "a", np.nan], + [1, 1.5, "a", (1, "b")], + [1, 1.5, "a", (1, "b"), np.nan], + ], + ) + def test_dummies_roundtrip(self, vals): + # GH 8745 + cats = Categorical(Series(vals)) + dummies = cats.get_dummies() + cats2 = Categorical.from_dummies(dummies) + tm.assert_equal(cats, cats2) + class TestCategoricalAPIWithFactor(TestCategorical): def test_describe(self): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e200f13652a84..b4c3fe55133ae 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -10,6 +10,7 @@ from pandas import ( Categorical, CategoricalIndex, + DataFrame, DatetimeIndex, Index, Interval, @@ -19,6 +20,7 @@ Series, Timestamp, date_range, + get_dummies, 
period_range, timedelta_range, ) @@ -682,3 +684,59 @@ def test_interval(self): expected_codes = np.array([0, 1], dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) + + @pytest.mark.parametrize("sparse", [True, False]) + def test_from_dummies(self, sparse): + if sparse: + pytest.xfail("from sparse is not supported") + # GH 8745 + raw = ["a", "a", "b", "c", "c", "a"] + dummies = get_dummies(raw, sparse=sparse) + cats = Categorical.from_dummies(dummies) + assert list(cats) == raw + + @pytest.mark.parametrize("na_val", [np.nan, pd.NA, None, pd.NaT]) + def test_from_dummies_nan(self, na_val): + # GH 8745 + raw = ["a", "a", "b", "c", "c", "a", na_val] + dummies = get_dummies(raw) + cats = Categorical.from_dummies(dummies) + assert list(cats)[:-1] == raw[:-1] + assert pd.isna(list(cats)[-1]) + + def test_from_dummies_multiple(self): + # GH 8745 + dummies = DataFrame([[1, 0, 1], [0, 1, 0], [0, 0, 1]], columns=["a", "b", "c"]) + with pytest.raises(ValueError, match="multiple categories"): + Categorical.from_dummies(dummies) + + @pytest.mark.parametrize("ordered", [None, False, True]) + def test_from_dummies_ordered(self, ordered): + # GH 8745 + raw = ["a", "a", "b", "c", "c", "a"] + dummies = get_dummies(raw) + cats = Categorical.from_dummies(dummies, ordered) + assert cats.ordered == bool(ordered) + + def test_from_dummies_types(self): + # GH 8745 + cols = ["a", 1, 1.5, ("a", "b"), (1, "c")] + dummies = DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats) == cols + + def test_from_dummies_drops_na(self): + # GH 8745 + cols = ["a", "b", np.nan] + dummies = DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats.categories) == cols[:-1] + assert pd.isna(cats[-1]) + + def test_from_dummies_multiindex(self): + # GH 8745 + tups = [("a", 1), ("a", 2), ("b", 1), ("b", 2)] + cols = MultiIndex.from_tuples(tups) + dummies 
= DataFrame(np.eye(len(cols)), columns=cols) + cats = Categorical.from_dummies(dummies) + assert list(cats.categories) == tups