Skip to content

Commit ab622f2

Browse files
authored
BUG: Categorical.unique should keep dtype unchanged (#38140)
1 parent c45be0d commit ab622f2

File tree

9 files changed

+93
-108
lines changed

9 files changed

+93
-108
lines changed

doc/source/whatsnew/v1.3.0.rst

+32
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,38 @@ Notable bug fixes
230230

231231
These are bug fixes that might have notable behavior changes.
232232

233+
``Categorical.unique`` now always maintains same dtype as original
234+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
235+
236+
Previously, when calling :meth:`~Categorical.unique` with categorical data, unused categories in the new array
237+
would be removed, meaning that the dtype of the new array would be different than the
238+
original, if some categories were not present in the unique array (:issue:`18291`).
239+
240+
As an example of this, given:
241+
242+
.. ipython:: python
243+
244+
dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True)
245+
cat = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype)
246+
original = pd.Series(cat)
247+
unique = original.unique()
248+
249+
*pandas < 1.3.0*:
250+
251+
.. code-block:: ipython
252+
253+
In [1]: unique
254+
['good', 'bad']
255+
Categories (2, object): ['bad' < 'good']
256+
In [2]: original.dtype == unique.dtype
257+
False
258+
259+
*pandas >= 1.3.0*:
260+
261+
.. ipython:: python
262+
263+
unique
264+
original.dtype == unique.dtype
233265
234266
Preserve dtypes in :meth:`~pandas.DataFrame.combine_first`
235267
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+9-32
Original file line numberDiff line numberDiff line change
@@ -2127,16 +2127,15 @@ def mode(self, dropna=True):
21272127
def unique(self):
21282128
"""
21292129
Return the ``Categorical`` whose ``categories`` and ``codes`` are
2130-
unique. Unused categories are NOT returned.
2130+
unique.
21312131
2132-
- unordered category: values and categories are sorted by appearance
2133-
order.
2134-
- ordered category: values are sorted by appearance order, categories
2135-
keeps existing order.
2132+
.. versionchanged:: 1.3.0
2133+
2134+
Previously, unused categories were dropped from the new categories.
21362135
21372136
Returns
21382137
-------
2139-
unique values : ``Categorical``
2138+
Categorical
21402139
21412140
See Also
21422141
--------
@@ -2146,37 +2145,15 @@ def unique(self):
21462145
21472146
Examples
21482147
--------
2149-
An unordered Categorical will return categories in the
2150-
order of appearance.
2151-
21522148
>>> pd.Categorical(list("baabc")).unique()
21532149
['b', 'a', 'c']
2154-
Categories (3, object): ['b', 'a', 'c']
2155-
2156-
>>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
2157-
['b', 'a', 'c']
2158-
Categories (3, object): ['b', 'a', 'c']
2159-
2160-
An ordered Categorical preserves the category ordering.
2161-
2162-
>>> pd.Categorical(
2163-
... list("baabc"), categories=list("abc"), ordered=True
2164-
... ).unique()
2165-
['b', 'a', 'c']
2150+
Categories (3, object): ['a', 'b', 'c']
2151+
>>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
2152+
['b', 'a']
21662153
Categories (3, object): ['a' < 'b' < 'c']
21672154
"""
2168-
# unlike np.unique, unique1d does not sort
21692155
unique_codes = unique1d(self.codes)
2170-
cat = self.copy()
2171-
2172-
# keep nan in codes
2173-
cat._ndarray = unique_codes
2174-
2175-
# exclude nan from indexer for categories
2176-
take_codes = unique_codes[unique_codes != -1]
2177-
if self.ordered:
2178-
take_codes = np.sort(take_codes)
2179-
return cat.set_categories(cat.categories.take(take_codes))
2156+
return self._from_backing_data(unique_codes)
21802157

21812158
def _values_for_factorize(self):
21822159
return self._ndarray, -1

pandas/core/groupby/categorical.py

+7
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,13 @@ def recode_for_groupby(
7676
# sort=False should order groups in as-encountered order (GH-8868)
7777
cat = c.unique()
7878

79+
# See GH-38140 for block below
80+
# exclude nan from indexer for categories
81+
take_codes = cat.codes[cat.codes != -1]
82+
if cat.ordered:
83+
take_codes = np.sort(take_codes)
84+
cat = cat.set_categories(cat.categories.take(take_codes))
85+
7986
# But for groupby to work, all categories should be present,
8087
# including those missing from the data (GH-13179), which the
8188
# set_categories call above dropped

pandas/core/series.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -1993,15 +1993,12 @@ def unique(self) -> ArrayLike:
19931993
['2016-01-01 00:00:00-05:00']
19941994
Length: 1, dtype: datetime64[ns, US/Eastern]
19951995
1996-
An unordered Categorical will return categories in the order of
1997-
appearance.
1996+
A Categorical will return its unique values in the order of
1997+
appearance and with the same dtype.
19981998
19991999
>>> pd.Series(pd.Categorical(list('baabc'))).unique()
20002000
['b', 'a', 'c']
2001-
Categories (3, object): ['b', 'a', 'c']
2002-
2003-
An ordered Categorical preserves the category ordering.
2004-
2001+
Categories (3, object): ['a', 'b', 'c']
20052002
>>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
20062003
... ordered=True)).unique()
20072004
['b', 'a', 'c']

pandas/tests/arrays/categorical/test_analytics.py

+19-53
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from pandas import (
1010
Categorical,
11+
CategoricalDtype,
1112
Index,
1213
NaT,
1314
Series,
@@ -196,84 +197,49 @@ def test_searchsorted(self, ordered):
196197
with pytest.raises(KeyError, match="cucumber"):
197198
ser.searchsorted(["bread", "cucumber"])
198199

199-
def test_unique(self):
200+
def test_unique(self, ordered):
201+
# GH38140
202+
dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
203+
200204
# unique() keeps the original dtype (categories and orderedness) unchanged
201-
cat = Categorical(["a", "b"])
202-
exp = Index(["a", "b"])
205+
cat = Categorical(["a", "b", "c"], dtype=dtype)
203206
res = cat.unique()
204-
tm.assert_index_equal(res.categories, exp)
205207
tm.assert_categorical_equal(res, cat)
206208

207-
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
209+
cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
208210
res = cat.unique()
209-
tm.assert_index_equal(res.categories, exp)
210-
tm.assert_categorical_equal(res, Categorical(exp))
211+
tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
211212

212-
cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
213-
exp = Index(["c", "a", "b"])
213+
cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
214214
res = cat.unique()
215-
tm.assert_index_equal(res.categories, exp)
216-
exp_cat = Categorical(exp, categories=["c", "a", "b"])
215+
exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
217216
tm.assert_categorical_equal(res, exp_cat)
218217

219218
# duplicated nan must collapse to a single nan entry
220-
cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
221-
res = cat.unique()
222-
exp = Index(["b", "a"])
223-
tm.assert_index_equal(res.categories, exp)
224-
exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
225-
tm.assert_categorical_equal(res, exp_cat)
226-
227-
def test_unique_ordered(self):
228-
# keep categories order when ordered=True
229-
cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True)
219+
cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
230220
res = cat.unique()
231-
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
221+
exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
232222
tm.assert_categorical_equal(res, exp_cat)
233223

234-
cat = Categorical(
235-
["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True
236-
)
237-
res = cat.unique()
238-
exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True)
239-
tm.assert_categorical_equal(res, exp_cat)
240-
241-
cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True)
242-
res = cat.unique()
243-
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
244-
tm.assert_categorical_equal(res, exp_cat)
224+
def test_unique_index_series(self, ordered):
225+
# GH38140
226+
dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
245227

246-
cat = Categorical(
247-
["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True
248-
)
249-
res = cat.unique()
250-
exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
251-
tm.assert_categorical_equal(res, exp_cat)
252-
253-
def test_unique_index_series(self):
254-
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
228+
c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
255229
# Categorical.unique returns values in appearance order and
256230
# keeps the original dtype, whether or not it is ordered
257-
exp = Categorical([3, 1, 2], categories=[3, 1, 2])
231+
exp = Categorical([3, 1, 2], dtype=dtype)
258232
tm.assert_categorical_equal(c.unique(), exp)
259233

260234
tm.assert_index_equal(Index(c).unique(), Index(exp))
261235
tm.assert_categorical_equal(Series(c).unique(), exp)
262236

263-
c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
264-
exp = Categorical([1, 2], categories=[1, 2])
237+
c = Categorical([1, 1, 2, 2], dtype=dtype)
238+
exp = Categorical([1, 2], dtype=dtype)
265239
tm.assert_categorical_equal(c.unique(), exp)
266240
tm.assert_index_equal(Index(c).unique(), Index(exp))
267241
tm.assert_categorical_equal(Series(c).unique(), exp)
268242

269-
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
270-
# Categorical.unique keeps categories order if ordered=True
271-
exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
272-
tm.assert_categorical_equal(c.unique(), exp)
273-
274-
tm.assert_index_equal(Index(c).unique(), Index(exp))
275-
tm.assert_categorical_equal(Series(c).unique(), exp)
276-
277243
def test_shift(self):
278244
# GH 9416
279245
cat = Categorical(["a", "b", "c", "d", "a"])

pandas/tests/base/test_unique.py

-2
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@ def test_unique_null(null_obj, index_or_series_obj):
6767
if is_datetime64tz_dtype(obj.dtype):
6868
result = result.normalize()
6969
expected = expected.normalize()
70-
elif isinstance(obj, pd.CategoricalIndex):
71-
expected = expected.set_categories(unique_values_not_null)
7270
tm.assert_index_equal(result, expected)
7371
else:
7472
expected = np.array(unique_values, dtype=obj.dtype)

pandas/tests/extension/base/methods.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,16 @@ def test_value_counts_with_normalize(self, data):
4040
# GH 33172
4141
data = data[:10].unique()
4242
values = np.array(data[~data.isna()])
43+
ser = pd.Series(data, dtype=data.dtype)
4344

44-
result = (
45-
pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index()
46-
)
45+
result = ser.value_counts(normalize=True).sort_index()
46+
47+
if not isinstance(data, pd.Categorical):
48+
expected = pd.Series([1 / len(values)] * len(values), index=result.index)
49+
else:
50+
expected = pd.Series(0.0, index=result.index)
51+
expected[result > 0] = 1 / len(values)
4752

48-
expected = pd.Series([1 / len(values)] * len(values), index=result.index)
4953
self.assert_series_equal(result, expected)
5054

5155
def test_count(self, data_missing):

pandas/tests/indexes/categorical/test_category.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
from pandas._libs import index as libindex
55

66
import pandas as pd
7-
from pandas import Categorical
7+
from pandas import (
8+
Categorical,
9+
CategoricalDtype,
10+
)
811
import pandas._testing as tm
912
from pandas.core.indexes.api import (
1013
CategoricalIndex,
@@ -186,18 +189,19 @@ def test_drop_duplicates(self, data, categories, expected):
186189
tm.assert_index_equal(result, e)
187190

188191
@pytest.mark.parametrize(
189-
"data, categories, expected_data, expected_categories",
192+
"data, categories, expected_data",
190193
[
191-
([1, 1, 1], [1, 2, 3], [1], [1]),
192-
([1, 1, 1], list("abc"), [np.nan], []),
193-
([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
194-
([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
194+
([1, 1, 1], [1, 2, 3], [1]),
195+
([1, 1, 1], list("abc"), [np.nan]),
196+
([1, 2, "a"], [1, 2, 3], [1, 2, np.nan]),
197+
([2, "a", "b"], list("abc"), [np.nan, "a", "b"]),
195198
],
196199
)
197-
def test_unique(self, data, categories, expected_data, expected_categories):
200+
def test_unique(self, data, categories, expected_data, ordered):
201+
dtype = CategoricalDtype(categories, ordered=ordered)
198202

199-
idx = CategoricalIndex(data, categories=categories)
200-
expected = CategoricalIndex(expected_data, categories=expected_categories)
203+
idx = CategoricalIndex(data, dtype=dtype)
204+
expected = CategoricalIndex(expected_data, dtype=dtype)
201205
tm.assert_index_equal(idx.unique(), expected)
202206

203207
def test_repr_roundtrip(self):

pandas/tests/test_algos.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,7 @@ def test_categorical(self):
602602

603603
# we are expecting to return in the order
604604
# of appearance
605-
expected = Categorical(list("bac"), categories=list("bac"))
605+
expected = Categorical(list("bac"))
606606

607607
# we are expecting to return in the order
608608
# of the categories
@@ -632,7 +632,7 @@ def test_categorical(self):
632632
tm.assert_categorical_equal(result, expected)
633633

634634
# CI -> return CI
635-
ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac")))
635+
ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
636636
expected = CategoricalIndex(expected)
637637
result = ci.unique()
638638
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)