Skip to content

Commit 609a733

Browse files
committed
BUG: Fix .groupby(categorical, sort=False) failing
1 parent f65a641 commit 609a733

File tree

4 files changed

+42
-12
lines changed

4 files changed

+42
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ Bug Fixes
539539

540540
- Bug in ``resample``, where a non-string ``loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`)
541541

542+
- Bug in ``Categorical.unique()`` stripping unused categories, resulting in ``.groupby(categorical, sort=False)`` not working (:issue:`13179`)
542543

543544

544545

pandas/core/categorical.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -1851,15 +1851,18 @@ def unique(self):
18511851
"""
18521852

18531853
# unlike np.unique, unique1d does not sort
1854-
unique_codes = unique1d(self.codes)
1854+
unique_codes = unique1d(self.codes).astype(self.codes.dtype)
18551855
cat = self.copy()
18561856
# keep nan in codes
18571857
cat._codes = unique_codes
1858-
# exclude nan from indexer for categories
1859-
take_codes = unique_codes[unique_codes != -1]
1860-
if self.ordered:
1861-
take_codes = sorted(take_codes)
1862-
return cat.set_categories(cat.categories.take(take_codes))
1858+
if not self.ordered:
1859+
take_codes = unique_codes[unique_codes != -1]
1860+
# Sort categories according to codes order, suffixed by
1861+
# unused categories in original order (GH-13179)
1862+
order = (cat.categories[take_codes].tolist() +
1863+
cat.categories.tolist()).index
1864+
cat.set_categories(sorted(cat.categories, key=order), inplace=True)
1865+
return cat
18631866

18641867
def equals(self, other):
18651868
"""

pandas/tests/groupby/test_categorical.py

+24
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self):
284284

285285
tm.assert_frame_equal(result, expected, check_index_type=True)
286286

287+
def test_groupby_preserve_categories(self):
288+
# GH-13179
289+
categories = list('abc')
290+
291+
# ordered=True
292+
df = DataFrame({'A': pd.Categorical(list('ba'),
293+
categories=categories,
294+
ordered=True)})
295+
index = pd.CategoricalIndex(categories, categories, ordered=True)
296+
tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
297+
tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)
298+
299+
# ordered=False
300+
df = DataFrame({'A': pd.Categorical(list('ba'),
301+
categories=categories,
302+
ordered=False)})
303+
sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
304+
nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
305+
ordered=False)
306+
tm.assert_index_equal(df.groupby('A', sort=True).first().index,
307+
sort_index)
308+
tm.assert_index_equal(df.groupby('A', sort=False).first().index,
309+
nosort_index)
310+
287311
def test_groupby_preserve_categorical_dtype(self):
288312
# GH13743, GH13854
289313
df = DataFrame({'A': [1, 2, 1, 1, 2],

pandas/tests/test_categorical.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1287,8 +1287,9 @@ def test_unique(self):
12871287

12881288
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
12891289
res = cat.unique()
1290+
exp = Index(["a", "b", "c"])
12901291
self.assert_index_equal(res.categories, exp)
1291-
tm.assert_categorical_equal(res, Categorical(exp))
1292+
tm.assert_categorical_equal(res, Categorical(["a", "b"], exp))
12921293

12931294
cat = Categorical(["c", "a", "b", "a", "a"],
12941295
categories=["a", "b", "c"])
@@ -1302,9 +1303,9 @@ def test_unique(self):
13021303
cat = Categorical(["b", np.nan, "b", np.nan, "a"],
13031304
categories=["a", "b", "c"])
13041305
res = cat.unique()
1305-
exp = Index(["b", "a"])
1306+
exp = Index(["b", "a", "c"])
13061307
self.assert_index_equal(res.categories, exp)
1307-
exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
1308+
exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a", "c"])
13081309
tm.assert_categorical_equal(res, exp_cat)
13091310

13101311
def test_unique_ordered(self):
@@ -1324,13 +1325,14 @@ def test_unique_ordered(self):
13241325
cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'],
13251326
ordered=True)
13261327
res = cat.unique()
1327-
exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
1328+
exp_cat = Categorical(['b', 'a'], categories=['a', 'b', 'c'],
1329+
ordered=True)
13281330
tm.assert_categorical_equal(res, exp_cat)
13291331

13301332
cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'],
13311333
ordered=True)
13321334
res = cat.unique()
1333-
exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'],
1335+
exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b', 'c'],
13341336
ordered=True)
13351337
tm.assert_categorical_equal(res, exp_cat)
13361338

@@ -1345,7 +1347,7 @@ def test_unique_index_series(self):
13451347
tm.assert_categorical_equal(pd.Series(c).unique(), exp)
13461348

13471349
c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
1348-
exp = Categorical([1, 2], categories=[1, 2])
1350+
exp = Categorical([1, 2], categories=[1, 2, 3])
13491351
tm.assert_categorical_equal(c.unique(), exp)
13501352
tm.assert_index_equal(Index(c).unique(), Index(exp))
13511353
tm.assert_categorical_equal(pd.Series(c).unique(), exp)

0 commit comments

Comments
 (0)