REF: pre-allocate results in libreduction (pandas-dev#29550)

jbrockmendel · proost · commit f2ab033be0c2 · 2019-12-20T01:10:56.000+09:00
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -18,15 +18,13 @@ cimport pandas._libs.util as util
 from pandas._libs.lib import maybe_convert_objects
 
 
-cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt):
+cdef _check_result_array(object obj, Py_ssize_t cnt):
 
     if (util.is_array(obj) or
             (isinstance(obj, list) and len(obj) == cnt) or
             getattr(obj, 'shape', None) == (cnt,)):
         raise ValueError('Function does not reduce')
 
-    return np.empty(size, dtype='O')
-
 
 cdef bint _is_sparse_array(object obj):
     # TODO can be removed one SparseArray.values is removed (GH26421)
@@ -116,6 +114,9 @@ cdef class Reducer:
         has_index = self.index is not None
         incr = self.increment
 
+        result = np.empty(self.nresults, dtype='O')
+        it = <flatiter>PyArray_IterNew(result)
+
         try:
             for i in range(self.nresults):
 
@@ -158,10 +159,9 @@ cdef class Reducer:
                         and util.is_array(res.values)):
                     res = res.values
                 if i == 0:
-                    result = _get_result_array(res,
-                                               self.nresults,
-                                               len(self.dummy))
-                    it = <flatiter>PyArray_IterNew(result)
+                    # On the first pass, we check the output shape to see
+                    #  if this looks like a reduction.
+                    _check_result_array(res, len(self.dummy))
 
                 PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
                 chunk.data = chunk.data + self.increment
@@ -170,9 +170,7 @@ cdef class Reducer:
             # so we don't free the wrong memory
             chunk.data = dummy_buf
 
-        if result.dtype == np.object_:
-            result = maybe_convert_objects(result)
-
+        result = maybe_convert_objects(result)
         return result
 
 
@@ -275,6 +273,8 @@ cdef class SeriesBinGrouper(_BaseGrouper):
         vslider = Slider(self.arr, self.dummy_arr)
         islider = Slider(self.index, self.dummy_index)
 
+        result = np.empty(self.ngroups, dtype='O')
+
         try:
             for i in range(self.ngroups):
                 group_size = counts[i]
@@ -289,10 +289,11 @@ cdef class SeriesBinGrouper(_BaseGrouper):
                 res = self.f(cached_typ)
                 res = _extract_result(res)
                 if not initialized:
+                    # On the first pass, we check the output shape to see
+                    #  if this looks like a reduction.
                     initialized = 1
-                    result = _get_result_array(res,
-                                               self.ngroups,
-                                               len(self.dummy_arr))
+                    _check_result_array(res, len(self.dummy_arr))
+
                 result[i] = res
 
                 islider.advance(group_size)
@@ -303,9 +304,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
             islider.reset()
             vslider.reset()
 
-        if result.dtype == np.object_:
-            result = maybe_convert_objects(result)
-
+        result = maybe_convert_objects(result)
         return result, counts
 
 
@@ -368,6 +367,8 @@ cdef class SeriesGrouper(_BaseGrouper):
         vslider = Slider(self.arr, self.dummy_arr)
         islider = Slider(self.index, self.dummy_index)
 
+        result = np.empty(self.ngroups, dtype='O')
+
         try:
             for i in range(n):
                 group_size += 1
@@ -391,10 +392,10 @@ cdef class SeriesGrouper(_BaseGrouper):
                     res = self.f(cached_typ)
                     res = _extract_result(res)
                     if not initialized:
+                        # On the first pass, we check the output shape to see
+                        #  if this looks like a reduction.
                         initialized = 1
-                        result = _get_result_array(res,
-                                                   self.ngroups,
-                                                   len(self.dummy_arr))
+                        _check_result_array(res, len(self.dummy_arr))
 
                     result[lab] = res
                     counts[lab] = group_size
@@ -410,10 +411,9 @@ cdef class SeriesGrouper(_BaseGrouper):
 
         # We check for empty series in the constructor, so should always
         #  have result initialized by this point.
-        assert result is not None, "`result` has not been assigned."
+        assert initialized, "`result` has not been initialized."
 
-        if result.dtype == np.object_:
-            result = maybe_convert_objects(result)
+        result = maybe_convert_objects(result)
 
         return result, counts