Skip to content

Commit 400e14a

Browse files
committed
PERF: don't materialize arrays on checking in groupby
1 parent a6fcec6 commit 400e14a

File tree

3 files changed

+10
-2
lines changed

3 files changed

+10
-2
lines changed

asv_bench/benchmarks/groupby.py

+9
Original file line number | Diff line number | Diff line change
@@ -368,6 +368,11 @@ def setup(self):
368368
self.dates = (np.datetime64('now') + self.offsets)
369369
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
370370

371+
N = 1000000
372+
self.draws = pd.Series(np.random.randn(N))
373+
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
374+
self.cats = labels.astype('category')
375+
371376
def time_groupby_multi_size(self):
372377
self.df.groupby(['key1', 'key2']).size()
373378

@@ -377,6 +382,10 @@ def time_groupby_dt_size(self):
377382
def time_groupby_dt_timegrouper_size(self):
378383
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
379384

385+
def time_groupby_size(self):
386+
self.draws.groupby(self.cats).size()
387+
388+
380389

381390
#----------------------------------------------------------------------
382391
# groupby with a variable value for ngroups

doc/source/whatsnew/v0.20.2.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ Performance Improvements
2929
- Performance regression fix when indexing with a list-like (:issue:`16285`)
3030
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
3131
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
32-
32+
- Improved performance of groupby with categorical groupers (:issue:`16413`)
3333

3434
.. _whatsnew_0202.bug_fixes:
3535

pandas/core/indexes/base.py

-1
Original file line number | Diff line number | Diff line change
@@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None):
23882388
if tolerance is not None:
23892389
raise ValueError('tolerance argument only valid if using pad, '
23902390
'backfill or nearest lookups')
2391-
key = _values_from_object(key)
23922391
try:
23932392
return self._engine.get_loc(key)
23942393
except KeyError:

0 commit comments

Comments
 (0)