Skip to content

Commit b16517e

Browse files
committed
ENH: Cython Grouper prototype class, per #496
1 parent 596ca32 commit b16517e

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

pandas/src/reduce.pyx

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,122 @@ cdef class Reducer:
8989
raise ValueError('function does not reduce')
9090
return result
9191

92+
cdef class Grouper:
    '''
    Performs a generic grouping operation while avoiding ndarray construction
    overhead: a single dummy array is re-pointed at each group's slice of the
    data instead of allocating a new array per group.

    Parameters
    ----------
    arr : ndarray
        1-dimensional data to group (copied if not contiguous).
    f : callable
        Applied to each group's chunk; must reduce each chunk to a scalar.
    labels : ndarray[int32]
        Group label per element; assumed sorted so equal labels are adjacent.
    ngroups : int
        Number of distinct groups (length of the result).
    dummy : ndarray, optional
        Pre-built array of the same dtype as ``arr`` handed to ``f``;
        created automatically when omitted.
    '''
    cdef:
        Py_ssize_t nresults, ngroups
        object arr, dummy, f, labels, counts
        bint passed_dummy

    def __init__(self, object arr, object f, object labels, ngroups, dummy=None):
        assert(arr.ndim == 1)

        # The pointer-sliding trick in get_result requires contiguous memory
        if not arr.flags.contiguous:
            arr = arr.copy()

        self.labels = labels
        self.f = f
        self.arr = arr
        self.dummy = self._check_dummy(dummy)
        self.passed_dummy = dummy is not None

        # Per-group sizes, filled in by get_result
        self.counts = np.zeros(ngroups, dtype='i4')
        self.ngroups = ngroups

    def _check_dummy(self, dummy=None):
        '''Validate (or create) the dummy array handed to ``f``.'''
        if dummy is None:
            dummy = np.empty(0, dtype=self.arr.dtype)
        elif dummy.dtype != self.arr.dtype:
            # BUG FIX: the original also compared len(dummy) against
            # self.chunksize -- an attribute Grouper never declares (copied
            # from Reducer) -- so passing any dummy raised AttributeError.
            # Group lengths vary, so no fixed-length check applies here.
            raise ValueError('Dummy array must be same dtype')

        return dummy

    def get_result(self):
        '''
        Apply ``f`` to every group and return the length-``ngroups`` result.

        Points the dummy chunk's data pointer into ``arr`` and slides it
        forward one group at a time, so no per-group ndarray is allocated.
        Also fills ``self.counts`` with each group's size.

        Raises
        ------
        ValueError
            If ``f`` does not reduce its input to a scalar.
        '''
        cdef:
            char* dummy_buf
            ndarray arr, result, chunk
            ndarray[int32_t] labels, counts
            Py_ssize_t i, group_size, n, lab, itemsize
            flatiter it
            object res
            bint initialized = 0
            tuple args
            object kwds

        labels = self.labels
        counts = self.counts

        arr = self.arr
        chunk = self.dummy

        # Stash the dummy's real buffer so it can be restored before the
        # chunk is ever garbage-collected (see the finally block below)
        dummy_buf = chunk.data
        chunk.data = arr.data

        group_size = 0
        n = len(arr)
        # BUG FIX: chunk.data is a char*, so the pointer must advance in
        # BYTES.  The original added group_size alone (element count),
        # which is only correct for 1-byte dtypes.
        itemsize = arr.dtype.itemsize

        # Build the argument tuple for f once; only the chunk's data
        # pointer and length change between calls
        args = cpython.PyTuple_New(1)
        kwds = {}
        cpython.PyTuple_SET_ITEM(args, 0, chunk)
        cpython.Py_INCREF(chunk)

        try:
            for i in range(n):
                group_size += 1

                lab = labels[i]

                # Group boundary: last element, or the next label differs
                # (labels are assumed sorted)
                if i == n - 1 or lab != labels[i + 1]:
                    chunk.shape[0] = group_size

                    res = cpython.PyObject_Call(self.f, args, kwds)

                    # Allocate the result lazily: the first group's output
                    # reveals whether f actually reduces
                    if not initialized:
                        result = self._get_result_array(res)
                        it = <flatiter> PyArray_IterNew(result)
                        initialized = 1

                    PyArray_ITER_GOTO1D(it, lab)
                    PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
                    counts[lab] = group_size

                    # Slide the chunk to the start of the next group
                    chunk.data = chunk.data + group_size * itemsize
                    group_size = 0
        finally:
            # Restore the dummy so we don't free the wrong memory
            chunk.shape[0] = 0
            chunk.data = dummy_buf

        if result.dtype == np.object_:
            result = maybe_convert_objects(result)

        return result

    def _get_result_array(self, object res):
        '''Allocate the object-dtype output, rejecting non-scalar results.'''
        try:
            assert(not isinstance(res, np.ndarray))
            assert(not (isinstance(res, list) and len(res) == len(self.dummy)))

            result = np.empty(self.ngroups, dtype='O')
        except Exception:
            raise ValueError('function does not reduce')
        return result
92208
def reduce(arr, f, axis=0, dummy=None):
    '''
    Reduce ``arr`` along ``axis`` by applying ``f``; thin convenience
    wrapper around the Reducer class.
    '''
    return Reducer(arr, f, axis=axis, dummy=dummy).get_result()

0 commit comments

Comments
 (0)