Skip to content

Simplify Reducer.get_result #34080

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 15 additions & 51 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
int64_t,
PyArray_SETITEM,
PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew,
flatiter)
PyArray_SETITEM)

cnp.import_array()

cimport pandas._libs.util as util
Expand Down Expand Up @@ -45,107 +44,72 @@ cdef class Reducer:
n, k = (<object>arr).shape

if axis == 0:
if not arr.flags.f_contiguous:
arr = arr.copy('F')

self.nresults = k
self.chunksize = n
self.increment = n * arr.dtype.itemsize
else:
if not arr.flags.c_contiguous:
arr = arr.copy('C')

arr = arr.T
self.nresults = n
self.chunksize = k
self.increment = k * arr.dtype.itemsize

self.f = f
self.arr = arr
self.labels = labels
self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
dummy=dummy)

cdef _check_dummy(self, object dummy=None):
cdef:
object index = None, typ = None, ityp = None

if dummy is None:
dummy = np.empty(self.chunksize, dtype=self.arr.dtype)

# our ref is stolen later since we are creating this array
# in cython, so increment first
Py_INCREF(dummy)

else:

# we passed a Series
typ = type(dummy)
index = dummy.index
dummy = dummy.values
# TODO: do we still need this?
self._check_dummy(dummy=dummy)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way this is written, dummy never gets used. I'm not sure what the purpose of dummy is to begin with, so maybe we can get rid of this check and its use in the constructor altogether. Maybe @jbrockmendel knows.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this exists so that if we are using a Series subclass, that gets retained in the reduction. If that can be ensured without needing dummy, that'd be great.


cdef _check_dummy(self, object dummy=None):
if dummy is not None:
if dummy.dtype != self.arr.dtype:
raise ValueError('Dummy array must be same dtype')
if len(dummy) != self.chunksize:
raise ValueError(f'Dummy array must be length {self.chunksize}')

return dummy, typ, index, ityp

def get_result(self):
cdef:
char* dummy_buf
ndarray arr, result, chunk
ndarray arr, result
Py_ssize_t i
flatiter it
object res, name, labels
object cached_typ = None

arr = self.arr
chunk = self.dummy
dummy_buf = chunk.data
chunk.data = arr.data
labels = self.labels

result = np.empty(self.nresults, dtype='O')
it = <flatiter>PyArray_IterNew(result)

try:
for i in range(self.nresults):

# create the cached type
# each time just reassign the data
with np.nditer([arr, result], flags=["reduce_ok", "external_loop", "refs_ok"], op_flags=[["readonly"], ["readwrite"]], order="F") as it:
for i, (x, y) in enumerate(it):
if i == 0:

if self.typ is not None:
# In this case, we also have self.index
name = labels[i]
cached_typ = self.typ(
chunk, index=self.index, name=name, dtype=arr.dtype)
x, index=self.index, name=name, dtype=arr.dtype)

# use the cached_typ if possible
if cached_typ is not None:
# In this case, we also have non-None labels
name = labels[i]

object.__setattr__(
cached_typ._mgr._block, 'values', chunk)
cached_typ._mgr._block, 'values', x)
object.__setattr__(cached_typ, 'name', name)
res = self.f(cached_typ)
else:
res = self.f(chunk)
res = self.f(x)

# TODO: reason for not squeezing here?
res = _extract_result(res, squeeze=False)
if i == 0:
# On the first pass, we check the output shape to see
# if this looks like a reduction.
_check_result_array(res, len(self.dummy))
_check_result_array(res, len(x))

PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
chunk.data = chunk.data + self.increment
PyArray_ITER_NEXT(it)
finally:
# so we don't free the wrong memory
chunk.data = dummy_buf
y[...] = res

result = maybe_convert_objects(result)
return result
Expand Down