From a231a55e87ef325cdcc729bda20b8433d8784412 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Fri, 8 May 2020 13:23:57 -0700
Subject: [PATCH 1/7] use numpy nditer protocol

---
 pandas/_libs/reduction.pyx | 36 +++++++++---------------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index a7b2d5d5491d5..ce260443aa86c 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -9,9 +9,8 @@ import numpy as np
 cimport numpy as cnp
 from numpy cimport (ndarray,
                     int64_t,
-                    PyArray_SETITEM,
-                    PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew,
-                    flatiter)
+                    PyArray_SETITEM)
+
 cnp.import_array()
 
 cimport pandas._libs.util as util
@@ -45,15 +44,11 @@ cdef class Reducer:
         n, k = (<object>arr).shape
 
         if axis == 0:
-            if not arr.flags.f_contiguous:
-                arr = arr.copy('F')
-
             self.nresults = k
             self.chunksize = n
             self.increment = n * arr.dtype.itemsize
         else:
-            if not arr.flags.c_contiguous:
-                arr = arr.copy('C')
+            arr = arr.T
 
             self.nresults = n
             self.chunksize = k
@@ -95,31 +90,23 @@ cdef class Reducer:
             char* dummy_buf
             ndarray arr, result, chunk
             Py_ssize_t i
-            flatiter it
             object res, name, labels
             object cached_typ = None
 
         arr = self.arr
-        chunk = self.dummy
-        dummy_buf = chunk.data
-        chunk.data = arr.data
 
         labels = self.labels
 
         result = np.empty(self.nresults, dtype='O')
-        it = PyArray_IterNew(result)
-
-        try:
-            for i in range(self.nresults):
-                # create the cached type
-                # each time just reassign the data
+        with np.nditer([arr, result], flags=["reduce_ok", "external_loop", "refs_ok"], op_flags=[["readonly"], ["readwrite"]], order="F") as it:
+            for i, (x, y) in enumerate(it):
                 if i == 0:
 
                     if self.typ is not None:
                         # In this case, we also have self.index
                         name = labels[i]
                         cached_typ = self.typ(
-                            chunk, index=self.index, name=name, dtype=arr.dtype)
+                            x, index=self.index, name=name, dtype=arr.dtype)
 
                 # use the cached_typ if possible
                 if cached_typ is not None:
@@ -127,11 +114,11 @@ cdef class Reducer:
                     name = labels[i]
 
                     object.__setattr__(
-                        cached_typ._mgr._block, 'values', chunk)
+                        cached_typ._mgr._block, 'values', x)
                     object.__setattr__(cached_typ, 'name', name)
                     res = self.f(cached_typ)
                 else:
-                    res = self.f(chunk)
+                    res = self.f(x)
 
                 # TODO: reason for not squeezing here?
                 res = _extract_result(res, squeeze=False)
@@ -140,12 +127,7 @@ cdef class Reducer:
                     # if this looks like a reduction.
                     _check_result_array(res, len(self.dummy))
 
-                PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
-                chunk.data = chunk.data + self.increment
-                PyArray_ITER_NEXT(it)
-        finally:
-            # so we don't free the wrong memory
-            chunk.data = dummy_buf
+                y[...] = res
 
         result = maybe_convert_objects(result)
         return result
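Note (not part of the patch series): the nditer pattern that PATCH 1/7 switches to can be exercised with plain NumPy outside Cython. The sketch below is illustrative only; arr, result and the reducing function np.sum stand in for Reducer.arr, the object-dtype output and self.f. It assumes the documented behaviour of reduce_ok/external_loop, under which each step should hand back one full column of arr together with the broadcast (zero-stride) slot of result that receives the reduced value.

import numpy as np

# Toy stand-ins for the Reducer internals: 3 rows, 2 columns,
# and one output slot per column.
arr = np.arange(6, dtype="float64").reshape(3, 2)
result = np.empty(2, dtype=object)

with np.nditer(
    [arr, result],
    flags=["reduce_ok", "external_loop", "refs_ok"],
    op_flags=[["readonly"], ["readwrite"]],
    order="F",
) as it:
    for i, (x, y) in enumerate(it):
        # x is a read-only 1-D chunk covering column i of arr;
        # y is a broadcast view over result[i], so writing through it
        # stores the reduced value for that column.
        res = np.sum(x)
        y[...] = res

print(result)   # one reduced value (here: the column sum) per column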
From 5e77e1ed1af9caba6699e3381f88d673f01cc9db Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Fri, 8 May 2020 13:35:31 -0700
Subject: [PATCH 2/7] cleanups

---
 pandas/_libs/reduction.pyx | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index ce260443aa86c..04b25ca8e92f8 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -49,7 +49,6 @@ cdef class Reducer:
             self.increment = n * arr.dtype.itemsize
         else:
             arr = arr.T
-
             self.nresults = n
             self.chunksize = k
             self.increment = k * arr.dtype.itemsize
@@ -57,38 +56,21 @@ cdef class Reducer:
         self.f = f
         self.arr = arr
         self.labels = labels
-        self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
-            dummy=dummy)
+
+        # TODO: do we still need this?
+        self._check_dummy(dummy=dummy)
 
-    cdef _check_dummy(self, object dummy=None):
-        cdef:
-            object index = None, typ = None, ityp = None
-
-        if dummy is None:
-            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
-
-            # our ref is stolen later since we are creating this array
-            # in cython, so increment first
-            Py_INCREF(dummy)
-
-        else:
-
-            # we passed a Series
-            typ = type(dummy)
-            index = dummy.index
-            dummy = dummy.values
+    cdef _check_dummy(self, object dummy=None):
+        if dummy is not None:
 
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
             if len(dummy) != self.chunksize:
                 raise ValueError(f'Dummy array must be length {self.chunksize}')
 
-        return dummy, typ, index, ityp
-
     def get_result(self):
         cdef:
-            char* dummy_buf
-            ndarray arr, result, chunk
+            ndarray arr, result
             Py_ssize_t i
             object res, name, labels
             object cached_typ = None

From 23124d1931f92560d599f4346de4d4b8ed5210c7 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Fri, 8 May 2020 14:21:19 -0700
Subject: [PATCH 3/7] unused import

---
 pandas/_libs/reduction.pyx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 04b25ca8e92f8..474f076a8a887 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -7,9 +7,7 @@ from libc.stdlib cimport malloc, free
 
 import numpy as np
 cimport numpy as cnp
-from numpy cimport (ndarray,
-                    int64_t,
-                    PyArray_SETITEM)
+from numpy cimport ndarray, int64_t
 
 cnp.import_array()
 

From 3361732306dddaeadb11a53943cb85e3ede7306e Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Thu, 14 May 2020 09:17:30 -0700
Subject: [PATCH 4/7] fixed issue with readonly timedelta error

---
 pandas/_libs/tslibs/timedeltas.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 03419a6267983..a19a5b47b3768 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -217,7 +217,7 @@ cdef convert_to_timedelta64(object ts, object unit):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def array_to_timedelta64(object[:] values, unit='ns', errors='raise'):
+def array_to_timedelta64(object values, unit='ns', errors='raise'):
     """
     Convert an ndarray to an array of timedeltas. If errors == 'coerce',
     coerce non-convertible objects to NaT. Otherwise, raise.
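Note (not part of the patch series): the commit message for PATCH 4/7 does not spell out the failure, but a plausible reading is that the chunks np.nditer hands out for a readonly operand are flagged non-writeable, and a Cython typed memoryview signature such as object[:] cannot be acquired from a read-only buffer, which would explain widening array_to_timedelta64 to accept a plain object. The plain-NumPy snippet below only demonstrates the read-only flag on the chunks; the Cython memoryview side is not reproduced here.

import numpy as np

values = np.array([1, 2, 3], dtype=object)
out = np.empty(3, dtype=object)

with np.nditer(
    [values, out],
    flags=["refs_ok", "external_loop"],
    op_flags=[["readonly"], ["readwrite"]],
) as it:
    for x, y in it:
        # The readonly operand is exposed as a non-writeable view even
        # though the underlying array itself is writeable.
        print(values.flags.writeable, x.flags.writeable)  # True False
        y[...] = x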
From 28d7c4d435af82c9dfa23bcb3b9193c3aaea70a6 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Mon, 18 May 2020 14:27:17 -0700
Subject: [PATCH 5/7] reuse cached_typ

---
 pandas/_libs/reduction.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index df434dfa35da2..69f3a339258ca 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -55,8 +55,7 @@ cdef class Reducer:
         self.arr = arr
         self.labels = labels
 
-        # TODO: do we still need this?
-        self._check_dummy(dummy=dummy)
+        self.typ = self._check_dummy(dummy=dummy)
 
     cdef _check_dummy(self, object dummy=None):
         if dummy is not None:
@@ -65,6 +64,8 @@ cdef class Reducer:
             if len(dummy) != self.chunksize:
                 raise ValueError(f'Dummy array must be length {self.chunksize}')
 
+        return type(dummy)
+
     def get_result(self):
         cdef:

From 125c9a287ba9ee5d23c17ca64bbdc7222d4ab1f6 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Mon, 18 May 2020 14:34:57 -0700
Subject: [PATCH 6/7] another fix

---
 pandas/_libs/reduction.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 69f3a339258ca..3028b1a74a41d 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -55,16 +55,21 @@ cdef class Reducer:
         self.arr = arr
         self.labels = labels
 
-        self.typ = self._check_dummy(dummy=dummy)
+        self.typ, self.index = self._check_dummy(dummy=dummy)
 
     cdef _check_dummy(self, object dummy=None):
         if dummy is not None:
+
+            # we passed a Series
+            typ = type(dummy)
+            index = dummy.index
+
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
             if len(dummy) != self.chunksize:
                 raise ValueError(f'Dummy array must be length {self.chunksize}')
 
-        return type(dummy)
+        return typ, index
 
     def get_result(self):

From 8bcefd0a5cb053fc5b602f0ddacca16b4b4859b4 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Mon, 18 May 2020 14:39:12 -0700
Subject: [PATCH 7/7] pulled back in from master

---
 pandas/_libs/reduction.pyx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 3028b1a74a41d..b2db2a2934e16 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -55,22 +55,33 @@ cdef class Reducer:
         self.arr = arr
         self.labels = labels
 
-        self.typ, self.index = self._check_dummy(dummy=dummy)
+        self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
+            dummy=dummy)
 
     cdef _check_dummy(self, object dummy=None):
-        if dummy is not None:
+        cdef:
+            object index = None, typ = None, ityp = None
+
+        if dummy is None:
+            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
+
+            # our ref is stolen later since we are creating this array
+            # in cython, so increment first
+            Py_INCREF(dummy)
+
+        else:
 
             # we passed a Series
             typ = type(dummy)
            index = dummy.index
+            dummy = dummy.values
 
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
             if len(dummy) != self.chunksize:
                 raise ValueError(f'Dummy array must be length {self.chunksize}')
 
-        return typ, index
-
+        return dummy, typ, index, ityp
 
     def get_result(self):
         cdef:
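Note (not part of the patch series): PATCH 5/7 through 7/7 progressively restore the master version of _check_dummy. For reference, the restored logic reads as below when written as a standalone plain-Python function (hypothetical name check_dummy; the Cython-specific Py_INCREF bookkeeping for the allocated array is omitted, everything else mirrors the diff):

import numpy as np


def check_dummy(arr, chunksize, dummy=None):
    """Plain-Python mirror of the restored Reducer._check_dummy."""
    index = typ = ityp = None

    if dummy is None:
        # no dummy passed: allocate a placeholder chunk of the right dtype
        dummy = np.empty(chunksize, dtype=arr.dtype)
    else:
        # we passed a Series: remember its type and index, validate its values
        typ = type(dummy)
        index = dummy.index
        dummy = dummy.values

        if dummy.dtype != arr.dtype:
            raise ValueError('Dummy array must be same dtype')
        if len(dummy) != chunksize:
            raise ValueError(f'Dummy array must be length {chunksize}')

    return dummy, typ, index, ityp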