Skip to content

Commit 2aec326

Browse files
committed
fixup! BUG: Fix .groupby(categorical, sort=False) failing
1 parent c813146 commit 2aec326

File tree

4 files changed

+70
-24
lines changed

4 files changed

+70
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

+36-2
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,42 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
120120
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
121121
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
122122

123+
.. _whatsnew_0200.enhancements.groupby_categorical:
124+
125+
GroupBy on Categoricals
126+
^^^^^^^^^^^^^^^^^^^^^^^
127+
128+
In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data (:issue:`13179`).
129+
130+
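Now the grouping succeeds: groups are ordered as encountered in the data, and categories that do not appear in the data are still kept as categories of the grouper.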
131+
132+
.. ipython:: python
133+
134+
chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
135+
df = pd.DataFrame({
136+
'A': np.random.randint(100, size=100),
137+
'B': np.random.randint(100, size=100),
138+
'C': np.random.randint(100, size=100),
139+
'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
140+
categories=chromosomes,
141+
ordered=True)})
142+
143+
Previous Behavior:
144+
145+
.. code-block:: ipython
146+
147+
In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
148+
---------------------------------------------------------------------------
149+
ValueError Traceback (most recent call last)
150+
...
151+
ValueError: items in new_categories are not the same as in old categories
152+
153+
New Behavior:
154+
155+
.. ipython:: python
156+
157+
df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
158+
123159
.. _whatsnew_0200.enhancements.other:
124160

125161
Other enhancements
@@ -160,7 +196,6 @@ Other enhancements
160196

161197
.. _whatsnew_0200.api_breaking:
162198

163-
164199
Backwards incompatible API changes
165200
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
166201

@@ -539,7 +574,6 @@ Bug Fixes
539574

540575
- Bug in ``resample``, where a non-string ``loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`)
541576

542-
- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`)
543577

544578

545579

pandas/core/categorical.py

+29
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,35 @@ def _get_categories(self):
602602
categories = property(fget=_get_categories, fset=_set_categories,
603603
doc=_categories_doc)
604604

605+
def _codes_for_groupby(self, sort):
606+
"""
607+
Return a Categorical adjusted for groupby
608+
609+
Parameters
610+
----------
611+
sort : boolean
612+
The value of the sort parameter groupby was called with.
613+
614+
Returns
615+
-------
616+
Categorical
617+
In case of sort=True, self is returned with original categories
618+
preserved. In case of sort=False, the new categories are set
619+
to the order of appearance in codes (unless ordered=True),
620+
followed by any unrepresented categories in original order.
621+
"""
622+
cat = self
623+
# sort=False should order groups in as-encountered order (GH-8868)
624+
if not sort:
625+
cat = self.unique()
626+
# But all categories should be present, including those missing
627+
# from the data (GH-13179), which .unique() dropped
628+
cat.add_categories(self.categories[
629+
~self.categories.isin(cat.categories)],
630+
inplace=True)
631+
cat = self.reorder_categories(cat.categories)
632+
return cat
633+
605634
_ordered = None
606635

607636
def set_ordered(self, value, inplace=False):

pandas/core/groupby.py

+1-22
Original file line numberDiff line numberDiff line change
@@ -2300,28 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
23002300
# a passed Categorical
23012301
elif is_categorical_dtype(self.grouper):
23022302

2303-
# must have an ordered categorical
2304-
if self.sort:
2305-
if not self.grouper.ordered:
2306-
2307-
# technically we cannot group on an unordered
2308-
# Categorical
2309-
# but this a user convenience to do so; the ordering
2310-
# is preserved and if it's a reduction it doesn't make
2311-
# any difference
2312-
pass
2313-
2314-
# fix bug #GH8868 sort=False being ignored in categorical
2315-
# groupby
2316-
else:
2317-
cat = self.grouper.unique()
2318-
all_categories = self.grouper.categories
2319-
cat.add_categories(
2320-
all_categories[
2321-
~all_categories.isin(cat.categories)],
2322-
inplace=True) # GH-13179
2323-
self.grouper = self.grouper.reorder_categories(
2324-
cat.categories)
2303+
self.grouper = self.grouper._codes_for_groupby(self.sort)
23252304

23262305
# we make a CategoricalIndex out of the cat grouper
23272306
# preserving the categories / ordered attributes

pandas/indexes/category.py

+4
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
550550
result.name = name
551551
return result
552552

553+
def _codes_for_groupby(self, sort):
554+
""" Return a Categorical adjusted for groupby """
555+
return self.values._codes_for_groupby(sort)
556+
553557
@classmethod
554558
def _add_comparison_methods(cls):
555559
""" add in comparison methods """

0 commit comments

Comments
 (0)