diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2b0b62ab7facf..6631a175ecb72 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -230,6 +230,38 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +``Categorical.unique`` now always maintains same dtype as original +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when calling :meth:`~Categorical.unique` with categorical data, unused categories in the new array +would be removed, meaning that the dtype of the new array would be different than the +original, if some categories are not present in the unique array (:issue:`18291`) + +As an example of this, given: + +.. ipython:: python + + dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True) + cat = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype) + original = pd.Series(cat) + unique = original.unique() + +*pandas < 1.3.0*: + +.. code-block:: ipython + + In [1]: unique + ['good', 'bad'] + Categories (2, object): ['bad' < 'good'] + In [2]: original.dtype == unique.dtype + False + +*pandas >= 1.3.0* + +.. ipython:: python + + unique + original.dtype == unique.dtype Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f2b5ad447a0cf..ba36e4a630e1f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2127,16 +2127,15 @@ def mode(self, dropna=True): def unique(self): """ Return the ``Categorical`` which ``categories`` and ``codes`` are - unique. Unused categories are NOT returned. + unique. - - unordered category: values and categories are sorted by appearance - order. - - ordered category: values are sorted by appearance order, categories - keeps existing order. + .. 
versionchanged:: 1.3.0 + + Previously, unused categories were dropped from the new categories. Returns ------- - unique values : ``Categorical`` + Categorical See Also -------- @@ -2146,37 +2145,15 @@ def unique(self): Examples -------- - An unordered Categorical will return categories in the - order of appearance. - >>> pd.Categorical(list("baabc")).unique() ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - >>> pd.Categorical(list("baabc"), categories=list("abc")).unique() - ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - An ordered Categorical preserves the category ordering. - - >>> pd.Categorical( - ... list("baabc"), categories=list("abc"), ordered=True - ... ).unique() - ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] + >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique() + ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ - # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() - - # keep nan in codes - cat._ndarray = unique_codes - - # exclude nan from indexer for categories - take_codes = unique_codes[unique_codes != -1] - if self.ordered: - take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + return self._from_backing_data(unique_codes) def _values_for_factorize(self): return self._ndarray, -1 diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 6de8c1d789097..297681f1e10f5 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -76,6 +76,13 @@ def recode_for_groupby( # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() + # See GH-38140 for block below + # exclude nan from indexer for categories + take_codes = cat.codes[cat.codes != -1] + if cat.ordered: + take_codes = np.sort(take_codes) + cat = cat.set_categories(cat.categories.take(take_codes)) + # But for groupby to work, all categories should 
be present, # including those missing from the data (GH-13179), which .unique() # above dropped diff --git a/pandas/core/series.py b/pandas/core/series.py index 5c605a6b441c6..33e3bfb6ee3aa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1993,15 +1993,12 @@ def unique(self) -> ArrayLike: ['2016-01-01 00:00:00-05:00'] Length: 1, dtype: datetime64[ns, US/Eastern] - An unordered Categorical will return categories in the order of - appearance. + A Categorical will return its unique values in the order of + appearance and with the same dtype. >>> pd.Series(pd.Categorical(list('baabc'))).unique() ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - An ordered Categorical preserves the category ordering. - + Categories (3, object): ['a', 'b', 'c'] >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), ... ordered=True)).unique() ['b', 'a', 'c'] diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 6899d821f80ad..56d474497a166 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -8,6 +8,7 @@ from pandas import ( Categorical, + CategoricalDtype, Index, NaT, Series, @@ -196,84 +197,49 @@ def test_searchsorted(self, ordered): with pytest.raises(KeyError, match="cucumber"): ser.searchsorted(["bread", "cucumber"]) - def test_unique(self): + def test_unique(self, ordered): + # GH38140 + dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered) + # categories are reordered based on value when ordered=False - cat = Categorical(["a", "b"]) - exp = Index(["a", "b"]) + cat = Categorical(["a", "b", "c"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, cat) - cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) + cat = Categorical(["a", "b", "a", "a"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) - 
tm.assert_categorical_equal(res, Categorical(exp)) + tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype)) - cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) - exp = Index(["c", "a", "b"]) + cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(exp, categories=["c", "a", "b"]) + exp_cat = Categorical(["c", "a", "b"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat) # nan must be removed - cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) - res = cat.unique() - exp = Index(["b", "a"]) - tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) - tm.assert_categorical_equal(res, exp_cat) - - def test_unique_ordered(self): - # keep categories order when ordered=True - cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) + cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype) res = cat.unique() - exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) + exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical( - ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True - ) - res = cat.unique() - exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) - tm.assert_categorical_equal(res, exp_cat) - - cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) - res = cat.unique() - exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) - tm.assert_categorical_equal(res, exp_cat) + def test_unique_index_series(self, ordered): + # GH38140 + dtype = CategoricalDtype([3, 2, 1], ordered=ordered) - cat = Categorical( - ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True - ) - res = cat.unique() - exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) - 
tm.assert_categorical_equal(res, exp_cat) - - def test_unique_index_series(self): - c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) + c = Categorical([3, 1, 2, 2, 1], dtype=dtype) # Categorical.unique sorts categories by appearance order # if ordered=False - exp = Categorical([3, 1, 2], categories=[3, 1, 2]) + exp = Categorical([3, 1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp) - c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) - exp = Categorical([1, 2], categories=[1, 2]) + c = Categorical([1, 1, 2, 2], dtype=dtype) + exp = Categorical([1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp) - c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) - # Categorical.unique keeps categories order if ordered=True - exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) - tm.assert_categorical_equal(c.unique(), exp) - - tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) - def test_shift(self): # GH 9416 cat = Categorical(["a", "b", "c", "d", "a"]) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 4aefa4be176fb..26e785a2796b1 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -67,8 +67,6 @@ def test_unique_null(null_obj, index_or_series_obj): if is_datetime64tz_dtype(obj.dtype): result = result.normalize() expected = expected.normalize() - elif isinstance(obj, pd.CategoricalIndex): - expected = expected.set_categories(unique_values_not_null) tm.assert_index_equal(result, expected) else: expected = np.array(unique_values, dtype=obj.dtype) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 3ea5c34201b5c..ca9c2acb9fd12 100644 --- 
a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -40,12 +40,16 @@ def test_value_counts_with_normalize(self, data): # GH 33172 data = data[:10].unique() values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) - result = ( - pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index() - ) + result = ser.value_counts(normalize=True).sort_index() + + if not isinstance(data, pd.Categorical): + expected = pd.Series([1 / len(values)] * len(values), index=result.index) + else: + expected = pd.Series(0.0, index=result.index) + expected[result > 0] = 1 / len(values) - expected = pd.Series([1 / len(values)] * len(values), index=result.index) self.assert_series_equal(result, expected) def test_count(self, data_missing): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index d3c9b02b3ba23..678344f5b6909 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -4,7 +4,10 @@ from pandas._libs import index as libindex import pandas as pd -from pandas import Categorical +from pandas import ( + Categorical, + CategoricalDtype, +) import pandas._testing as tm from pandas.core.indexes.api import ( CategoricalIndex, @@ -186,18 +189,19 @@ def test_drop_duplicates(self, data, categories, expected): tm.assert_index_equal(result, e) @pytest.mark.parametrize( - "data, categories, expected_data, expected_categories", + "data, categories, expected_data", [ - ([1, 1, 1], [1, 2, 3], [1], [1]), - ([1, 1, 1], list("abc"), [np.nan], []), - ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]), - ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]), + ([1, 1, 1], [1, 2, 3], [1]), + ([1, 1, 1], list("abc"), [np.nan]), + ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan]), + ([2, "a", "b"], list("abc"), [np.nan, "a", "b"]), ], ) - def test_unique(self, data, categories, expected_data, 
expected_categories): + def test_unique(self, data, categories, expected_data, ordered): + dtype = CategoricalDtype(categories, ordered=ordered) - idx = CategoricalIndex(data, categories=categories) - expected = CategoricalIndex(expected_data, categories=expected_categories) + idx = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) def test_repr_roundtrip(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 127baae6e9352..c9d034361d8c4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -602,7 +602,7 @@ def test_categorical(self): # we are expecting to return in the order # of appearance - expected = Categorical(list("bac"), categories=list("bac")) + expected = Categorical(list("bac")) # we are expecting to return in the order # of the categories @@ -632,7 +632,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected) # CI -> return CI - ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) + ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc"))) expected = CategoricalIndex(expected) result = ci.unique() tm.assert_index_equal(result, expected)