BUG: Categorical.unique should keep dtype unchanged (#38140)

topper-123 · web-flow · commit ab622f2a624f · 2021-04-16T13:43:32.000-04:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -230,6 +230,38 @@ Notable bug fixes
 
 These are bug fixes that might have notable behavior changes.
 
+``Categorical.unique`` now always maintains the same dtype as the original
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, when calling :meth:`~Categorical.unique` with categorical data, unused categories in the new array
+would be removed, meaning that the dtype of the new array would be different from the
+original if some categories were not present in the unique array (:issue:`18291`).
+
+As an example of this, given:
+
+.. ipython:: python
+
+        dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True)
+        cat = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype)
+        original = pd.Series(cat)
+        unique = original.unique()
+
+*pandas < 1.3.0*:
+
+.. code-block:: ipython
+
+    In [1]: unique
+    ['good', 'bad']
+    Categories (2, object): ['bad' < 'good']
+    In [2]: original.dtype == unique.dtype
+    False
+
+*pandas >= 1.3.0*:
+
+.. ipython:: python
+
+        unique
+        original.dtype == unique.dtype
 
 Preserve dtypes in  :meth:`~pandas.DataFrame.combine_first`
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2127,16 +2127,15 @@ def mode(self, dropna=True):
     def unique(self):
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
-        unique. Unused categories are NOT returned.
+        unique.
 
-        - unordered category: values and categories are sorted by appearance
-          order.
-        - ordered category: values are sorted by appearance order, categories
-          keeps existing order.
+        .. versionchanged:: 1.3.0
+
+            Previously, unused categories were dropped from the new categories.
 
         Returns
         -------
-        unique values : ``Categorical``
+        Categorical
 
         See Also
         --------
@@ -2146,37 +2145,15 @@ def unique(self):
 
         Examples
         --------
-        An unordered Categorical will return categories in the
-        order of appearance.
-
         >>> pd.Categorical(list("baabc")).unique()
         ['b', 'a', 'c']
-        Categories (3, object): ['b', 'a', 'c']
-
-        >>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
-        ['b', 'a', 'c']
-        Categories (3, object): ['b', 'a', 'c']
-
-        An ordered Categorical preserves the category ordering.
-
-        >>> pd.Categorical(
-        ...     list("baabc"), categories=list("abc"), ordered=True
-        ... ).unique()
-        ['b', 'a', 'c']
+        Categories (3, object): ['a', 'b', 'c']
+        >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
+        ['b', 'a']
         Categories (3, object): ['a' < 'b' < 'c']
         """
-        # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
-        cat = self.copy()
-
-        # keep nan in codes
-        cat._ndarray = unique_codes
-
-        # exclude nan from indexer for categories
-        take_codes = unique_codes[unique_codes != -1]
-        if self.ordered:
-            take_codes = np.sort(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
+        return self._from_backing_data(unique_codes)
 
     def _values_for_factorize(self):
         return self._ndarray, -1
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
@@ -76,6 +76,13 @@ def recode_for_groupby(
     # sort=False should order groups in as-encountered order (GH-8868)
     cat = c.unique()
 
+    # See GH-38140 for block below
+    # exclude nan from indexer for categories
+    take_codes = cat.codes[cat.codes != -1]
+    if cat.ordered:
+        take_codes = np.sort(take_codes)
+    cat = cat.set_categories(cat.categories.take(take_codes))
+
     # But for groupby to work, all categories should be present,
     # including those missing from the data (GH-13179), which .unique()
     # above dropped
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1993,15 +1993,12 @@ def unique(self) -> ArrayLike:
         ['2016-01-01 00:00:00-05:00']
         Length: 1, dtype: datetime64[ns, US/Eastern]
 
-        An unordered Categorical will return categories in the order of
-        appearance.
+        A Categorical will return its unique values in the order of
+        appearance and with the same dtype.
 
         >>> pd.Series(pd.Categorical(list('baabc'))).unique()
         ['b', 'a', 'c']
-        Categories (3, object): ['b', 'a', 'c']
-
-        An ordered Categorical preserves the category ordering.
-
+        Categories (3, object): ['a', 'b', 'c']
         >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
         ...                          ordered=True)).unique()
         ['b', 'a', 'c']
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
@@ -8,6 +8,7 @@
 
 from pandas import (
     Categorical,
+    CategoricalDtype,
     Index,
     NaT,
     Series,
@@ -196,84 +197,49 @@ def test_searchsorted(self, ordered):
         with pytest.raises(KeyError, match="cucumber"):
             ser.searchsorted(["bread", "cucumber"])
 
-    def test_unique(self):
+    def test_unique(self, ordered):
+        # GH38140
+        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
+
         # categories are reordered based on value when ordered=False
-        cat = Categorical(["a", "b"])
-        exp = Index(["a", "b"])
+        cat = Categorical(["a", "b", "c"], dtype=dtype)
         res = cat.unique()
-        tm.assert_index_equal(res.categories, exp)
         tm.assert_categorical_equal(res, cat)
 
-        cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
+        cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
         res = cat.unique()
-        tm.assert_index_equal(res.categories, exp)
-        tm.assert_categorical_equal(res, Categorical(exp))
+        tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
 
-        cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
-        exp = Index(["c", "a", "b"])
+        cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
         res = cat.unique()
-        tm.assert_index_equal(res.categories, exp)
-        exp_cat = Categorical(exp, categories=["c", "a", "b"])
+        exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
         tm.assert_categorical_equal(res, exp_cat)
 
         # nan must be removed
-        cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
-        res = cat.unique()
-        exp = Index(["b", "a"])
-        tm.assert_index_equal(res.categories, exp)
-        exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
-        tm.assert_categorical_equal(res, exp_cat)
-
-    def test_unique_ordered(self):
-        # keep categories order when ordered=True
-        cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True)
+        cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
         res = cat.unique()
-        exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
+        exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
         tm.assert_categorical_equal(res, exp_cat)
 
-        cat = Categorical(
-            ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True
-        )
-        res = cat.unique()
-        exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True)
-        tm.assert_categorical_equal(res, exp_cat)
-
-        cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True)
-        res = cat.unique()
-        exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
-        tm.assert_categorical_equal(res, exp_cat)
+    def test_unique_index_series(self, ordered):
+        # GH38140
+        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
 
-        cat = Categorical(
-            ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True
-        )
-        res = cat.unique()
-        exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
-        tm.assert_categorical_equal(res, exp_cat)
-
-    def test_unique_index_series(self):
-        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
+        c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
         # Categorical.unique sorts categories by appearance order
         # if ordered=False
-        exp = Categorical([3, 1, 2], categories=[3, 1, 2])
+        exp = Categorical([3, 1, 2], dtype=dtype)
         tm.assert_categorical_equal(c.unique(), exp)
 
         tm.assert_index_equal(Index(c).unique(), Index(exp))
         tm.assert_categorical_equal(Series(c).unique(), exp)
 
-        c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
-        exp = Categorical([1, 2], categories=[1, 2])
+        c = Categorical([1, 1, 2, 2], dtype=dtype)
+        exp = Categorical([1, 2], dtype=dtype)
         tm.assert_categorical_equal(c.unique(), exp)
         tm.assert_index_equal(Index(c).unique(), Index(exp))
         tm.assert_categorical_equal(Series(c).unique(), exp)
 
-        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
-        # Categorical.unique keeps categories order if ordered=True
-        exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
-        tm.assert_categorical_equal(c.unique(), exp)
-
-        tm.assert_index_equal(Index(c).unique(), Index(exp))
-        tm.assert_categorical_equal(Series(c).unique(), exp)
-
     def test_shift(self):
         # GH 9416
         cat = Categorical(["a", "b", "c", "d", "a"])
diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py
@@ -67,8 +67,6 @@ def test_unique_null(null_obj, index_or_series_obj):
         if is_datetime64tz_dtype(obj.dtype):
             result = result.normalize()
             expected = expected.normalize()
-        elif isinstance(obj, pd.CategoricalIndex):
-            expected = expected.set_categories(unique_values_not_null)
         tm.assert_index_equal(result, expected)
     else:
         expected = np.array(unique_values, dtype=obj.dtype)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -40,12 +40,16 @@ def test_value_counts_with_normalize(self, data):
         # GH 33172
         data = data[:10].unique()
         values = np.array(data[~data.isna()])
+        ser = pd.Series(data, dtype=data.dtype)
 
-        result = (
-            pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index()
-        )
+        result = ser.value_counts(normalize=True).sort_index()
+
+        if not isinstance(data, pd.Categorical):
+            expected = pd.Series([1 / len(values)] * len(values), index=result.index)
+        else:
+            expected = pd.Series(0.0, index=result.index)
+            expected[result > 0] = 1 / len(values)
 
-        expected = pd.Series([1 / len(values)] * len(values), index=result.index)
         self.assert_series_equal(result, expected)
 
     def test_count(self, data_missing):
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -4,7 +4,10 @@
 from pandas._libs import index as libindex
 
 import pandas as pd
-from pandas import Categorical
+from pandas import (
+    Categorical,
+    CategoricalDtype,
+)
 import pandas._testing as tm
 from pandas.core.indexes.api import (
     CategoricalIndex,
@@ -186,18 +189,19 @@ def test_drop_duplicates(self, data, categories, expected):
             tm.assert_index_equal(result, e)
 
     @pytest.mark.parametrize(
-        "data, categories, expected_data, expected_categories",
+        "data, categories, expected_data",
         [
-            ([1, 1, 1], [1, 2, 3], [1], [1]),
-            ([1, 1, 1], list("abc"), [np.nan], []),
-            ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
-            ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
+            ([1, 1, 1], [1, 2, 3], [1]),
+            ([1, 1, 1], list("abc"), [np.nan]),
+            ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan]),
+            ([2, "a", "b"], list("abc"), [np.nan, "a", "b"]),
         ],
     )
-    def test_unique(self, data, categories, expected_data, expected_categories):
+    def test_unique(self, data, categories, expected_data, ordered):
+        dtype = CategoricalDtype(categories, ordered=ordered)
 
-        idx = CategoricalIndex(data, categories=categories)
-        expected = CategoricalIndex(expected_data, categories=expected_categories)
+        idx = CategoricalIndex(data, dtype=dtype)
+        expected = CategoricalIndex(expected_data, dtype=dtype)
         tm.assert_index_equal(idx.unique(), expected)
 
     def test_repr_roundtrip(self):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -602,7 +602,7 @@ def test_categorical(self):
 
         # we are expecting to return in the order
         # of appearance
-        expected = Categorical(list("bac"), categories=list("bac"))
+        expected = Categorical(list("bac"))
 
         # we are expecting to return in the order
         # of the categories
@@ -632,7 +632,7 @@ def test_categorical(self):
         tm.assert_categorical_equal(result, expected)
 
         # CI -> return CI
-        ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac")))
+        ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
         expected = CategoricalIndex(expected)
         result = ci.unique()
         tm.assert_index_equal(result, expected)