Skip to content

Commit 82d19dd

Browse files
committed
PERF: faster grouping
Remove pandas.core.groupby._groupby_indices in favor of algos.groupsort_indexer; add Categorical._reverse_indexer to facilitate this. Closes #14293.
1 parent b81d444 commit 82d19dd

File tree

16 files changed

+201
-482
lines changed

16 files changed

+201
-482
lines changed

asv_bench/benchmarks/gil.py

+54-119
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def wrapper(fname):
2222
return wrapper
2323

2424

25-
class nogil_groupby_count_2(object):
25+
class nogil_groupby_base(object):
2626
goal_time = 0.2
2727

2828
def setup(self):
@@ -33,6 +33,9 @@ def setup(self):
3333
if (not have_real_test_parallel):
3434
raise NotImplementedError
3535

36+
37+
class nogil_groupby_count_2(nogil_groupby_base):
38+
3639
def time_nogil_groupby_count_2(self):
3740
self.pg2()
3841

@@ -41,16 +44,7 @@ def pg2(self):
4144
self.df.groupby('key')['data'].count()
4245

4346

44-
class nogil_groupby_last_2(object):
45-
goal_time = 0.2
46-
47-
def setup(self):
48-
self.N = 1000000
49-
self.ngroups = 1000
50-
np.random.seed(1234)
51-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
52-
if (not have_real_test_parallel):
53-
raise NotImplementedError
47+
class nogil_groupby_last_2(nogil_groupby_base):
5448

5549
def time_nogil_groupby_last_2(self):
5650
self.pg2()
@@ -60,16 +54,7 @@ def pg2(self):
6054
self.df.groupby('key')['data'].last()
6155

6256

63-
class nogil_groupby_max_2(object):
64-
goal_time = 0.2
65-
66-
def setup(self):
67-
self.N = 1000000
68-
self.ngroups = 1000
69-
np.random.seed(1234)
70-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
71-
if (not have_real_test_parallel):
72-
raise NotImplementedError
57+
class nogil_groupby_max_2(nogil_groupby_base):
7358

7459
def time_nogil_groupby_max_2(self):
7560
self.pg2()
@@ -79,16 +64,7 @@ def pg2(self):
7964
self.df.groupby('key')['data'].max()
8065

8166

82-
class nogil_groupby_mean_2(object):
83-
goal_time = 0.2
84-
85-
def setup(self):
86-
self.N = 1000000
87-
self.ngroups = 1000
88-
np.random.seed(1234)
89-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
90-
if (not have_real_test_parallel):
91-
raise NotImplementedError
67+
class nogil_groupby_mean_2(nogil_groupby_base):
9268

9369
def time_nogil_groupby_mean_2(self):
9470
self.pg2()
@@ -98,16 +74,7 @@ def pg2(self):
9874
self.df.groupby('key')['data'].mean()
9975

10076

101-
class nogil_groupby_min_2(object):
102-
goal_time = 0.2
103-
104-
def setup(self):
105-
self.N = 1000000
106-
self.ngroups = 1000
107-
np.random.seed(1234)
108-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
109-
if (not have_real_test_parallel):
110-
raise NotImplementedError
77+
class nogil_groupby_min_2(nogil_groupby_base):
11178

11279
def time_nogil_groupby_min_2(self):
11380
self.pg2()
@@ -117,16 +84,7 @@ def pg2(self):
11784
self.df.groupby('key')['data'].min()
11885

11986

120-
class nogil_groupby_prod_2(object):
121-
goal_time = 0.2
122-
123-
def setup(self):
124-
self.N = 1000000
125-
self.ngroups = 1000
126-
np.random.seed(1234)
127-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
128-
if (not have_real_test_parallel):
129-
raise NotImplementedError
87+
class nogil_groupby_prod_2(nogil_groupby_base):
13088

13189
def time_nogil_groupby_prod_2(self):
13290
self.pg2()
@@ -136,16 +94,7 @@ def pg2(self):
13694
self.df.groupby('key')['data'].prod()
13795

13896

139-
class nogil_groupby_sum_2(object):
140-
goal_time = 0.2
141-
142-
def setup(self):
143-
self.N = 1000000
144-
self.ngroups = 1000
145-
np.random.seed(1234)
146-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
147-
if (not have_real_test_parallel):
148-
raise NotImplementedError
97+
class nogil_groupby_sum_2(nogil_groupby_base):
14998

15099
def time_nogil_groupby_sum_2(self):
151100
self.pg2()
@@ -155,107 +104,93 @@ def pg2(self):
155104
self.df.groupby('key')['data'].sum()
156105

157106

158-
class nogil_groupby_sum_4(object):
159-
goal_time = 0.2
160-
161-
def setup(self):
162-
self.N = 1000000
163-
self.ngroups = 1000
164-
np.random.seed(1234)
165-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
166-
if (not have_real_test_parallel):
167-
raise NotImplementedError
107+
class nogil_groupby_sum_4(nogil_groupby_base):
168108

169109
def time_nogil_groupby_sum_4(self):
170110
self.pg4()
171111

172112
def f(self):
173113
self.df.groupby('key')['data'].sum()
174114

175-
def g2(self):
176-
for i in range(2):
177-
self.f()
178-
179115
def g4(self):
180116
for i in range(4):
181117
self.f()
182118

183-
def g8(self):
184-
for i in range(8):
185-
self.f()
186-
187-
@test_parallel(num_threads=2)
188-
def pg2(self):
189-
self.f()
190-
191119
@test_parallel(num_threads=4)
192120
def pg4(self):
193121
self.f()
194122

195-
@test_parallel(num_threads=8)
196-
def pg8(self):
197-
self.f()
198123

199-
200-
class nogil_groupby_sum_8(object):
201-
goal_time = 0.2
202-
203-
def setup(self):
204-
self.N = 1000000
205-
self.ngroups = 1000
206-
np.random.seed(1234)
207-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
208-
if (not have_real_test_parallel):
209-
raise NotImplementedError
124+
class nogil_groupby_sum_8(nogil_groupby_base):
210125

211126
def time_nogil_groupby_sum_8(self):
212127
self.pg8()
213128

214129
def f(self):
215130
self.df.groupby('key')['data'].sum()
216131

217-
def g2(self):
218-
for i in range(2):
219-
self.f()
220-
221-
def g4(self):
222-
for i in range(4):
223-
self.f()
224-
225132
def g8(self):
226133
for i in range(8):
227134
self.f()
228135

229-
@test_parallel(num_threads=2)
230-
def pg2(self):
231-
self.f()
232-
233-
@test_parallel(num_threads=4)
234-
def pg4(self):
235-
self.f()
236-
237136
@test_parallel(num_threads=8)
238137
def pg8(self):
239138
self.f()
240139

241140

242-
class nogil_groupby_var_2(object):
141+
class nogil_groupby_var_2(nogil_groupby_base):
142+
143+
def time_nogil_groupby_var_2(self):
144+
self.pg2()
145+
146+
@test_parallel(num_threads=2)
147+
def pg2(self):
148+
self.df.groupby('key')['data'].var()
149+
150+
151+
class nogil_groupby_groups(object):
243152
goal_time = 0.2
244153

245154
def setup(self):
246-
self.N = 1000000
247-
self.ngroups = 1000
248155
np.random.seed(1234)
249-
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
156+
self.size = 2**22
157+
self.ngroups = 100
158+
self.data = Series(np.random.randint(0, self.ngroups, size=self.size))
250159
if (not have_real_test_parallel):
251160
raise NotImplementedError
252161

253-
def time_nogil_groupby_var_2(self):
162+
def f(self):
163+
self.data.groupby(self.data).groups
164+
165+
166+
class nogil_groupby_groups_2(nogil_groupby_groups):
167+
168+
def time_nogil_groupby_groups(self):
254169
self.pg2()
255170

256171
@test_parallel(num_threads=2)
257172
def pg2(self):
258-
self.df.groupby('key')['data'].var()
173+
self.f()
174+
175+
176+
class nogil_groupby_groups_4(nogil_groupby_groups):
177+
178+
def time_nogil_groupby_groups(self):
179+
self.pg4()
180+
181+
@test_parallel(num_threads=4)
182+
def pg4(self):
183+
self.f()
184+
185+
186+
class nogil_groupby_groups_8(nogil_groupby_groups):
187+
188+
def time_nogil_groupby_groups(self):
189+
self.pg8()
190+
191+
@test_parallel(num_threads=8)
192+
def pg8(self):
193+
self.f()
259194

260195

261196
class nogil_take1d_float64(object):

asv_bench/benchmarks/groupby.py

+26
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,32 @@ def time_groupby_apply_dict_return(self):
3232
self.data.groupby(self.labels).apply(self.f)
3333

3434

35+
#----------------------------------------------------------------------
36+
# groups
37+
38+
class groupby_groups(object):
39+
goal_time = 0.1
40+
41+
def setup(self):
42+
size = 2**22
43+
self.data = Series(np.random.randint(0, 100, size=size))
44+
self.data2 = Series(np.random.randint(0, 10000, size=size))
45+
self.data3 = Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)))
46+
self.data4 = Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)))
47+
48+
def time_groupby_groups_int64_small(self):
49+
self.data.groupby(self.data).groups
50+
51+
def time_groupby_groups_int64_large(self):
52+
self.data2.groupby(self.data2).groups
53+
54+
def time_groupby_groups_object_small(self):
55+
self.data3.groupby(self.data3).groups
56+
57+
def time_groupby_groups_object_large(self):
58+
self.data4.groupby(self.data4).groups
59+
60+
3561
#----------------------------------------------------------------------
3662
# First / last functions
3763

doc/source/whatsnew/v0.19.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1335,6 +1335,7 @@ Other API Changes
13351335
- ``Series`` and ``Index`` now support ``divmod`` which will return a tuple of
13361336
series or indices. This behaves like a standard binary operator with regards
13371337
to broadcasting rules (:issue:`14208`).
1338+
- ``.groupby.groups`` will now return a dictionary of ``Index`` objects, rather than a dictionary of ``np.ndarray`` or ``list`` objects (:issue:`14293`)
13381339

13391340
.. _whatsnew_0190.deprecations:
13401341

@@ -1407,6 +1408,7 @@ Performance Improvements
14071408
- Improved performance of hashing ``Period`` (:issue:`12817`)
14081409
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
14091410
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
1411+
- Improved performance of ``groupby.groups`` (:issue:`14293`)
14101412

14111413

14121414
.. _whatsnew_0190.bug_fixes:

0 commit comments

Comments
 (0)