Skip to content

Commit fa3c2e4

Browse files
kernc authored and AnkurDedania committed
BUG: Categorical.unique() preserves categories
closes pandas-dev#13179 Author: Kernc <[email protected]> Closes pandas-dev#15439 from kernc/Categorical.unique-nostrip-unused and squashes the following commits: 55733b8 [Kernc] fixup! BUG: Fix .groupby(categorical, sort=False) failing 2aec326 [Kernc] fixup! BUG: Fix .groupby(categorical, sort=False) failing c813146 [Kernc] PERF: add asv for categorical grouping 0c550e6 [Kernc] BUG: Fix .groupby(categorical, sort=False) failing
1 parent 90a0cf7 commit fa3c2e4

File tree

6 files changed

+141
-18
lines changed

6 files changed

+141
-18
lines changed

asv_bench/benchmarks/groupby.py

+37
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,43 @@ def time_groupby_sum(self):
492492
self.df.groupby(['a'])['b'].sum()
493493

494494

495+
class groupby_categorical(object):
496+
goal_time = 0.2
497+
498+
def setup(self):
499+
N = 100000
500+
arr = np.random.random(N)
501+
502+
self.df = DataFrame(dict(
503+
a=Categorical(np.random.randint(10000, size=N)),
504+
b=arr))
505+
self.df_ordered = DataFrame(dict(
506+
a=Categorical(np.random.randint(10000, size=N), ordered=True),
507+
b=arr))
508+
self.df_extra_cat = DataFrame(dict(
509+
a=Categorical(np.random.randint(100, size=N),
510+
categories=np.arange(10000)),
511+
b=arr))
512+
513+
def time_groupby_sort(self):
514+
self.df.groupby('a')['b'].count()
515+
516+
def time_groupby_nosort(self):
517+
self.df.groupby('a', sort=False)['b'].count()
518+
519+
def time_groupby_ordered_sort(self):
520+
self.df_ordered.groupby('a')['b'].count()
521+
522+
def time_groupby_ordered_nosort(self):
523+
self.df_ordered.groupby('a', sort=False)['b'].count()
524+
525+
def time_groupby_extra_cat_sort(self):
526+
self.df_extra_cat.groupby('a')['b'].count()
527+
528+
def time_groupby_extra_cat_nosort(self):
529+
self.df_extra_cat.groupby('a', sort=False)['b'].count()
530+
531+
495532
class groupby_period(object):
496533
# GH 14338
497534
goal_time = 0.2

doc/source/whatsnew/v0.20.0.txt

+33-1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,39 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
120120
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
121121
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
122122

123+
.. _whatsnew_0200.enhancements.groupy_categorical:
124+
125+
GroupBy on Categoricals
126+
^^^^^^^^^^^^^^^^^^^^^^^
127+
128+
In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
129+
130+
.. ipython:: python
131+
132+
chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
133+
df = pd.DataFrame({
134+
'A': np.random.randint(100),
135+
'B': np.random.randint(100),
136+
'C': np.random.randint(100),
137+
'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
138+
categories=chromosomes,
139+
ordered=True)})
140+
df
141+
142+
Previous Behavior:
143+
144+
.. code-block:: ipython
145+
146+
In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
147+
---------------------------------------------------------------------------
148+
ValueError: items in new_categories are not the same as in old categories
149+
150+
New Behavior:
151+
152+
.. ipython:: python
153+
154+
df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
155+
123156
.. _whatsnew_0200.enhancements.other:
124157

125158
Other enhancements
@@ -163,7 +196,6 @@ Other enhancements
163196

164197
.. _whatsnew_0200.api_breaking:
165198

166-
167199
Backwards incompatible API changes
168200
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
169201

pandas/core/categorical.py

+42
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,46 @@ def _get_categories(self):
602602
categories = property(fget=_get_categories, fset=_set_categories,
603603
doc=_categories_doc)
604604

605+
def _codes_for_groupby(self, sort):
606+
"""
607+
If sort=False, return a copy of self, coded with categories as
608+
returned by .unique(), followed by any categories not appearing in
609+
the data. If sort=True, return self.
610+
611+
This method is needed solely to ensure the categorical index of the
612+
GroupBy result has categories in the order of appearance in the data
613+
(GH-8868).
614+
615+
Parameters
616+
----------
617+
sort : boolean
618+
The value of the sort parameter groupby was called with.
619+
620+
Returns
621+
-------
622+
Categorical
623+
If sort=False, the new categories are set to the order of
624+
appearance in codes (unless ordered=True, in which case the
625+
original order is preserved), followed by any unrepresented
626+
categories in the original order.
627+
"""
628+
629+
# Already sorted according to self.categories; all is fine
630+
if sort:
631+
return self
632+
633+
# sort=False should order groups in as-encountered order (GH-8868)
634+
cat = self.unique()
635+
636+
# But for groupby to work, all categories should be present,
637+
# including those missing from the data (GH-13179), which .unique()
638+
# above dropped
639+
cat.add_categories(
640+
self.categories[~self.categories.isin(cat.categories)],
641+
inplace=True)
642+
643+
return self.reorder_categories(cat.categories)
644+
605645
_ordered = None
606646

607647
def set_ordered(self, value, inplace=False):
@@ -1853,8 +1893,10 @@ def unique(self):
18531893
# unlike np.unique, unique1d does not sort
18541894
unique_codes = unique1d(self.codes)
18551895
cat = self.copy()
1896+
18561897
# keep nan in codes
18571898
cat._codes = unique_codes
1899+
18581900
# exclude nan from indexer for categories
18591901
take_codes = unique_codes[unique_codes != -1]
18601902
if self.ordered:

pandas/core/groupby.py

+1-17
Original file line numberDiff line numberDiff line change
@@ -2300,23 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
23002300
# a passed Categorical
23012301
elif is_categorical_dtype(self.grouper):
23022302

2303-
# must have an ordered categorical
2304-
if self.sort:
2305-
if not self.grouper.ordered:
2306-
2307-
# technically we cannot group on an unordered
2308-
# Categorical
2309-
# but this a user convenience to do so; the ordering
2310-
# is preserved and if it's a reduction it doesn't make
2311-
# any difference
2312-
pass
2313-
2314-
# fix bug #GH8868 sort=False being ignored in categorical
2315-
# groupby
2316-
else:
2317-
cat = self.grouper.unique()
2318-
self.grouper = self.grouper.reorder_categories(
2319-
cat.categories)
2303+
self.grouper = self.grouper._codes_for_groupby(self.sort)
23202304

23212305
# we make a CategoricalIndex out of the cat grouper
23222306
# preserving the categories / ordered attributes

pandas/indexes/category.py

+4
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
550550
result.name = name
551551
return result
552552

553+
def _codes_for_groupby(self, sort):
554+
""" Return a Categorical adjusted for groupby """
555+
return self.values._codes_for_groupby(sort)
556+
553557
@classmethod
554558
def _add_comparison_methods(cls):
555559
""" add in comparison methods """

pandas/tests/groupby/test_categorical.py

+24
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self):
284284

285285
tm.assert_frame_equal(result, expected, check_index_type=True)
286286

287+
def test_groupby_preserve_categories(self):
288+
# GH-13179
289+
categories = list('abc')
290+
291+
# ordered=True
292+
df = DataFrame({'A': pd.Categorical(list('ba'),
293+
categories=categories,
294+
ordered=True)})
295+
index = pd.CategoricalIndex(categories, categories, ordered=True)
296+
tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
297+
tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)
298+
299+
# ordered=False
300+
df = DataFrame({'A': pd.Categorical(list('ba'),
301+
categories=categories,
302+
ordered=False)})
303+
sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
304+
nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
305+
ordered=False)
306+
tm.assert_index_equal(df.groupby('A', sort=True).first().index,
307+
sort_index)
308+
tm.assert_index_equal(df.groupby('A', sort=False).first().index,
309+
nosort_index)
310+
287311
def test_groupby_preserve_categorical_dtype(self):
288312
# GH13743, GH13854
289313
df = DataFrame({'A': [1, 2, 1, 1, 2],

0 commit comments

Comments
 (0)