From 6d9d2253a4244413f9c70622a96461ce2c8d5e15 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 11 Nov 2019 10:05:53 -0800 Subject: [PATCH 1/2] REF: pre-allocate result --- pandas/_libs/reduction.pyx | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index e6e658c0c6979..5a124c785138f 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -113,8 +113,9 @@ cdef class Reducer: chunk.data = arr.data labels = self.labels has_labels = labels is not None - has_index = self.index is not None - incr = self.increment + + result = _get_result_array(None, self.nresults, len(self.dummy)) + it = PyArray_IterNew(result) try: for i in range(self.nresults): @@ -131,26 +132,18 @@ cdef class Reducer: if self.typ is not None: # recreate with the index if supplied - if has_index: - - cached_typ = self.typ( - chunk, index=self.index, name=name) - - else: - - # use the passsed typ, sans index - cached_typ = self.typ(chunk, name=name) + cached_typ = self.typ( + chunk, index=self.index, name=name) # use the cached_typ if possible if cached_typ is not None: - if has_index: - object.__setattr__(cached_typ, 'index', self.index) - + object.__setattr__(cached_typ, 'index', self.index) object.__setattr__( cached_typ._data._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) + else: res = self.f(chunk) @@ -158,10 +151,9 @@ cdef class Reducer: and util.is_array(res.values)): res = res.values if i == 0: - result = _get_result_array(res, - self.nresults, - len(self.dummy)) - it = PyArray_IterNew(result) + # On the first pass, we check the output shape to see + # if this looks like a reduction. + _get_result_array(res, 0, len(self.dummy)) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment @@ -170,9 +162,7 @@ cdef class Reducer: # so we don't free the wrong memory chunk.data = dummy_buf - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result From aaa906f0107b2473636a83154bc47a18f0407583 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 11 Nov 2019 11:38:05 -0800 Subject: [PATCH 2/2] REF: pre-allocate results --- pandas/_libs/reduction.pyx | 52 +++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 5a124c785138f..79198fa1630bb 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -18,15 +18,13 @@ cimport pandas._libs.util as util from pandas._libs.lib import maybe_convert_objects -cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt): +cdef _check_result_array(object obj, Py_ssize_t cnt): if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Function does not reduce') - return np.empty(size, dtype='O') - cdef bint _is_sparse_array(object obj): # TODO can be removed one SparseArray.values is removed (GH26421) @@ -113,8 +111,10 @@ cdef class Reducer: chunk.data = arr.data labels = self.labels has_labels = labels is not None + has_index = self.index is not None + incr = self.increment - result = _get_result_array(None, self.nresults, len(self.dummy)) + result = np.empty(self.nresults, dtype='O') it = PyArray_IterNew(result) try: @@ -132,18 +132,26 @@ cdef class Reducer: if self.typ is not None: # recreate with the index if supplied - cached_typ = self.typ( - chunk, index=self.index, name=name) + if has_index: + + cached_typ = self.typ( + chunk, index=self.index, name=name) + + else: + + # use the passsed typ, sans index + cached_typ = self.typ(chunk, name=name) # use the cached_typ if possible if cached_typ is not None: - object.__setattr__(cached_typ, 'index', self.index) + if has_index: + object.__setattr__(cached_typ, 'index', self.index) + object.__setattr__( cached_typ._data._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) - else: res = self.f(chunk) @@ -153,7 +161,7 @@ cdef class Reducer: if i == 0: # On the first pass, we check the output shape to see # if this looks like a reduction. - _get_result_array(res, 0, len(self.dummy)) + _check_result_array(res, len(self.dummy)) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment @@ -248,6 +256,8 @@ cdef class SeriesBinGrouper(_BaseGrouper): vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(self.ngroups): group_size = counts[i] @@ -274,10 +284,11 @@ cdef class SeriesBinGrouper(_BaseGrouper): res = self.f(cached_typ) res = _extract_result(res) if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + _check_result_array(res, len(self.dummy_arr)) + result[i] = res islider.advance(group_size) @@ -288,9 +299,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.reset() vslider.reset() - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result, counts @@ -349,6 +358,8 @@ cdef class SeriesGrouper(_BaseGrouper): vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(n): group_size += 1 @@ -381,10 +392,10 @@ cdef class SeriesGrouper(_BaseGrouper): res = self.f(cached_typ) res = _extract_result(res) if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + _check_result_array(res, len(self.dummy_arr)) result[lab] = res counts[lab] = group_size @@ -398,11 +409,10 @@ cdef class SeriesGrouper(_BaseGrouper): islider.reset() vslider.reset() - if result is None: + if not initialized: raise ValueError("No result.") - if result.dtype == np.object_: - result = maybe_convert_objects(result) + result = maybe_convert_objects(result) return result, counts