Skip to content

Commit 77d4c3e

Browse files
authored
PERF: df.groupby(categorical, sort=False) (pandas-dev#48976)
* refactor using codes * PERF: df.groupby(categorical, sort=False) * Add PR number * remove comment
1 parent 5620f0e commit 77d4c3e

File tree

2 files changed

+15
-14
lines changed

2 files changed

+15
-14
lines changed

doc/source/whatsnew/v1.6.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ Performance improvements
153153
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
154154
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
155155
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
156+
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
156157

157158
.. ---------------------------------------------------------------------------
158159
.. _whatsnew_160.bug_fixes:

pandas/core/groupby/categorical.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -75,21 +75,21 @@ def recode_for_groupby(
7575
return c, None
7676

7777
# sort=False should order groups in as-encountered order (GH-8868)
78-
cat = c.unique()
7978

80-
# See GH-38140 for block below
81-
# exclude nan from indexer for categories
82-
take_codes = cat.codes[cat.codes != -1]
83-
if cat.ordered:
84-
take_codes = np.sort(take_codes)
85-
cat = cat.set_categories(cat.categories.take(take_codes))
86-
87-
# But for groupby to work, all categories should be present,
88-
# including those missing from the data (GH-13179), which .unique()
89-
# above dropped
90-
cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])
91-
92-
return c.reorder_categories(cat.categories), None
79+
# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
80+
all_codes = np.arange(c.categories.nunique(), dtype=np.int8)
81+
# GH 38140: exclude nan from indexer for categories
82+
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
83+
if c.ordered:
84+
unique_notnan_codes = np.sort(unique_notnan_codes)
85+
if len(all_codes) > len(unique_notnan_codes):
86+
# GH 13179: All categories need to be present, even if missing from the data
87+
missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
88+
take_codes = np.concatenate((unique_notnan_codes, missing_codes))
89+
else:
90+
take_codes = unique_notnan_codes
91+
92+
return Categorical(c, c.unique().categories.take(take_codes)), None
9393

9494

9595
def recode_from_groupby(

0 commit comments

Comments
 (0)