Skip to content

Commit c98548b

Browse files
committed
Merge pull request #7568 from jreback/perf2
BUG/PERF: perf issues in object groupby aggregations (GH7555)
2 parents 69bb0e8 + f1ded31 commit c98548b

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

pandas/core/groupby.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1414,10 +1414,11 @@ def aggregate(self, values, how, axis=0):
14141414
else:
14151415
is_numeric = issubclass(values.dtype.type, (np.datetime64,
14161416
np.timedelta64))
1417-
out_dtype = 'float64'
14181417
if is_numeric:
1418+
out_dtype = 'float64'
14191419
values = values.view('int64')
14201420
else:
1421+
out_dtype = 'object'
14211422
values = values.astype(object)
14221423

14231424
# will be filled in Cython function

pandas/src/generate_code.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -2234,7 +2234,7 @@ def generate_put_template(template, use_ints=True, use_floats=True,
22342234
date_like_list = [
22352235
('int64', 'int64_t', 'float64_t', 'np.float64'),
22362236
]
2237-
object_list = [('object', 'object', 'float64_t', 'np.float64')]
2237+
object_list = [('object', 'object', 'object', 'np.object_')]
22382238
function_list = []
22392239
if use_floats:
22402240
function_list.extend(floats_list)

pandas/src/generated.pyx

+2-2
Original file line number | Diff line number | Diff line change
@@ -6697,7 +6697,7 @@ def group_count_float32(ndarray[float32_t, ndim=2] out,
66976697

66986698
@cython.boundscheck(False)
66996699
@cython.wraparound(False)
6700-
def group_count_object(ndarray[float64_t, ndim=2] out,
6700+
def group_count_object(ndarray[object, ndim=2] out,
67016701
ndarray[int64_t] counts,
67026702
ndarray[object, ndim=2] values,
67036703
ndarray[int64_t] labels):
@@ -6838,7 +6838,7 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
68386838

68396839
@cython.boundscheck(False)
68406840
@cython.wraparound(False)
6841-
def group_count_bin_object(ndarray[float64_t, ndim=2] out,
6841+
def group_count_bin_object(ndarray[object, ndim=2] out,
68426842
ndarray[int64_t] counts,
68436843
ndarray[object, ndim=2] values,
68446844
ndarray[int64_t] bins):

vb_suite/groupby.py

+25-3
Original file line number | Diff line number | Diff line change
@@ -232,24 +232,46 @@ def f():
232232
labels = labels.take(np.random.permutation(len(labels)))
233233
"""
234234

235-
groupby_first = Benchmark('data.groupby(labels).first()', setup,
235+
groupby_first_float64 = Benchmark('data.groupby(labels).first()', setup,
236236
start_date=datetime(2012, 5, 1))
237237

238238
groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
239239
start_date=datetime(2013, 1, 1))
240240

241-
groupby_last = Benchmark('data.groupby(labels).last()', setup,
241+
groupby_last_float64 = Benchmark('data.groupby(labels).last()', setup,
242242
start_date=datetime(2012, 5, 1))
243243

244244
groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
245245
start_date=datetime(2013, 1, 1))
246246

247+
groupby_nth_float64 = Benchmark('data.groupby(labels).nth(0)', setup,
248+
start_date=datetime(2012, 5, 1))
249+
250+
groupby_nth_float32 = Benchmark('data2.groupby(labels).nth(0)', setup,
251+
start_date=datetime(2013, 1, 1))
252+
247253
# with datetimes (GH7555)
248254
setup = common_setup + """
249255
df = DataFrame({'a' : date_range('1/1/2011',periods=100000,freq='s'),'b' : range(100000)})
250256
"""
251257

252-
groupby_mixed_first = Benchmark('df.groupby("b").first()', setup,
258+
groupby_first_datetimes = Benchmark('df.groupby("b").first()', setup,
259+
start_date=datetime(2013, 5, 1))
260+
groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup,
261+
start_date=datetime(2013, 5, 1))
262+
groupby_nth_datetimes = Benchmark('df.groupby("b").nth(0)', setup,
263+
start_date=datetime(2013, 5, 1))
264+
265+
# with object
266+
setup = common_setup + """
267+
df = DataFrame({'a' : ['foo']*100000,'b' : range(100000)})
268+
"""
269+
270+
groupby_first_object = Benchmark('df.groupby("b").first()', setup,
271+
start_date=datetime(2013, 5, 1))
272+
groupby_last_object = Benchmark('df.groupby("b").last()', setup,
273+
start_date=datetime(2013, 5, 1))
274+
groupby_nth_object = Benchmark('df.groupby("b").nth(0)', setup,
253275
start_date=datetime(2013, 5, 1))
254276

255277
#----------------------------------------------------------------------

0 commit comments

Comments
 (0)