diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f046d3a9379d..61f8b7c6bcc6a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -254,6 +254,7 @@ Other enhancements - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) +- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories``; when set to ``False``, the result keeps the dtype of the original categorical, including unused categories (:issue:`38135`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62e508c491740..487a12853f61a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2035,16 +2035,24 @@ def mode(self, dropna=True): # ------------------------------------------------------------------ # ExtensionArray Interface - def unique(self): + def unique(self, remove_unused_categories: bool = True) -> "Categorical": """ Return the ``Categorical`` which ``categories`` and ``codes`` are - unique. Unused categories are NOT returned. + unique. By default, unused categories are NOT returned. - unordered category: values and categories are sorted by appearance order. - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + remove_unused_categories : bool, default True + If True, unused categories are not returned. + If False, the returned ``Categorical`` keeps the input dtype unchanged. + + .. 
versionadded:: 1.2.0 + Returns ------- unique values : ``Categorical`` @@ -2075,13 +2083,24 @@ def unique(self): ... ).unique() ['b', 'a', 'c'] Categories (3, object): ['a' < 'b' < 'c'] + + By default, unused categories are removed, but this can be changed: + + >>> cat = pd.Categorical(list("baab"), categories=list("abc"), ordered=True) + >>> cat.unique() + ['b', 'a'] + Categories (2, object): ['a' < 'b'] + >>> cat.unique(remove_unused_categories=False) + ['b', 'a'] + Categories (3, object): ['a' < 'b' < 'c'] """ # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() - # keep nan in codes - cat._codes = unique_codes + cat = self._constructor(unique_codes, dtype=self.dtype, fastpath=True) + + if not remove_unused_categories: + return cat # exclude nan from indexer for categories take_codes = unique_codes[unique_codes != -1] diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 7bd7d29ec9703..91559d92dcd92 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,7 +6,7 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, NaT, Series, date_range +from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range import pandas._testing as tm from pandas.api.types import is_scalar @@ -242,6 +242,28 @@ def test_unique_ordered(self): exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) + @pytest.mark.parametrize( + "values, expected", + [ + [list("abc"), list("abc")], + [list("bac"), list("bac")], + [list("ab"), list("ab")], + [list("bc"), list("bc")], + [list("aabbcc"), list("abc")], + [list("aabb"), list("ab")], + [[np.nan, "a", "b"], [np.nan, "a", "b"]], + [["a", "b", np.nan], ["a", "b", np.nan]], + [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]], + ], + ) + def 
test_unique_keep_unused_categories(self, values, expected, ordered): + # GH38135 + dtype = CategoricalDtype(list("abc"), ordered=ordered) + result = Categorical(values, dtype=dtype).unique(remove_unused_categories=False) + expected = Categorical(expected, dtype=dtype) + + tm.assert_categorical_equal(result, expected) + def test_unique_index_series(self): c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) # Categorical.unique sorts categories by appearance order