Merge pull request #7568 from jreback/perf2

jreback · jreback · commit c98548b5fe33 · 2014-06-25T09:48:23.000-04:00
BUG/PERF: perf issues in object groupby aggregations (GH7555)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1414,10 +1414,11 @@ def aggregate(self, values, how, axis=0):
         else:
             is_numeric = issubclass(values.dtype.type, (np.datetime64,
                                                         np.timedelta64))
-            out_dtype = 'float64'
             if is_numeric:
+                out_dtype = 'float64'
                 values = values.view('int64')
             else:
+                out_dtype = 'object'
                 values = values.astype(object)
 
         # will be filled in Cython function
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -2234,7 +2234,7 @@ def generate_put_template(template, use_ints=True, use_floats=True,
     date_like_list = [
         ('int64', 'int64_t', 'float64_t', 'np.float64'),
     ]
-    object_list = [('object', 'object', 'float64_t', 'np.float64')]
+    object_list = [('object', 'object', 'object', 'np.object_')]
     function_list = []
     if use_floats:
         function_list.extend(floats_list)
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
@@ -6697,7 +6697,7 @@ def group_count_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_object(ndarray[float64_t, ndim=2] out,
+def group_count_object(ndarray[object, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[object, ndim=2] values,
                          ndarray[int64_t] labels):
@@ -6838,7 +6838,7 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_bin_object(ndarray[float64_t, ndim=2] out,
+def group_count_bin_object(ndarray[object, ndim=2] out,
                              ndarray[int64_t] counts,
                              ndarray[object, ndim=2] values,
                              ndarray[int64_t] bins):
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -232,24 +232,46 @@ def f():
 labels = labels.take(np.random.permutation(len(labels)))
 """
 
-groupby_first = Benchmark('data.groupby(labels).first()', setup,
+groupby_first_float64 = Benchmark('data.groupby(labels).first()', setup,
                           start_date=datetime(2012, 5, 1))
 
 groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
                                   start_date=datetime(2013, 1, 1))
 
-groupby_last = Benchmark('data.groupby(labels).last()', setup,
+groupby_last_float64 = Benchmark('data.groupby(labels).last()', setup,
                          start_date=datetime(2012, 5, 1))
 
 groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
                                  start_date=datetime(2013, 1, 1))
 
+groupby_nth_float64 = Benchmark('data.groupby(labels).nth(0)', setup,
+                         start_date=datetime(2012, 5, 1))
+
+groupby_nth_float32 = Benchmark('data2.groupby(labels).nth(0)', setup,
+                                 start_date=datetime(2013, 1, 1))
+
 # with datetimes (GH7555)
 setup = common_setup + """
 df = DataFrame({'a' : date_range('1/1/2011',periods=100000,freq='s'),'b' : range(100000)})
 """
 
-groupby_mixed_first = Benchmark('df.groupby("b").first()', setup,
+groupby_first_datetimes = Benchmark('df.groupby("b").first()', setup,
+                                 start_date=datetime(2013, 5, 1))
+groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup,
+                                 start_date=datetime(2013, 5, 1))
+groupby_nth_datetimes = Benchmark('df.groupby("b").nth(0)', setup,
+                                 start_date=datetime(2013, 5, 1))
+
+# with object
+setup = common_setup + """
+df = DataFrame({'a' : ['foo']*100000,'b' : range(100000)})
+"""
+
+groupby_first_object = Benchmark('df.groupby("b").first()', setup,
+                                 start_date=datetime(2013, 5, 1))
+groupby_last_object = Benchmark('df.groupby("b").last()', setup,
+                                 start_date=datetime(2013, 5, 1))
+groupby_nth_object = Benchmark('df.groupby("b").nth(0)', setup,
                                  start_date=datetime(2013, 5, 1))
 
 #----------------------------------------------------------------------

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -2234,7 +2234,7 @@ def generate_put_template(template, use_ints=True, use_floats=True,` |
| `2234` | `2234` | `date_like_list = [` |
| `2235` | `2235` | `('int64', 'int64_t', 'float64_t', 'np.float64'),` |
| `2236` | `2236` | `]` |
| `2237` | | `- object_list = [('object', 'object', 'float64_t', 'np.float64')]` |
| | `2237` | `+ object_list = [('object', 'object', 'object', 'np.object_')]` |
| `2238` | `2238` | `function_list = []` |
| `2239` | `2239` | `if use_floats:` |
| `2240` | `2240` | `function_list.extend(floats_list)` |