Skip to content

Commit 0e61c89

Browse files
committed
ENH: speed up generic multi-key groupby via SeriesGrouper/Slider Cython classes. A little black magic, GH #496
1 parent 8aeeef6 commit 0e61c89

File tree

5 files changed: +142 −66 lines changed

pandas/core/groupby.py

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,12 @@ def curried(x):
193193
def primary(self):
194194
return self.groupings[0]
195195

196+
@property
197+
def _group_index(self):
198+
result = get_group_index([ping.labels for ping in self.groupings],
199+
self._group_shape)
200+
return result.astype('i4')
201+
196202
def get_group(self, name, obj=None):
197203
if obj is None:
198204
obj = self.obj
@@ -379,28 +385,18 @@ def _get_group_levels(self, mask):
379385
return [(name, raveled[mask]) for name, raveled in name_list]
380386

381387
def _python_agg_general(self, func, *args, **kwargs):
382-
group_shape = self._group_shape
383-
counts = np.zeros(group_shape, dtype=int)
384-
385-
# todo: cythonize?
386-
def _aggregate(output, counts, generator, shape_axis=0):
387-
for label, group in generator:
388-
if group is None:
389-
continue
390-
counts[label] = group.shape[shape_axis]
391-
output[label] = func(group, *args, **kwargs)
392-
393-
result = np.empty(group_shape, dtype=float)
394-
result.fill(np.nan)
388+
agg_func = lambda x: func(x, *args, **kwargs)
389+
390+
ngroups = np.prod(self._group_shape)
391+
group_index = self._group_index
392+
395393
# iterate through "columns" ex exclusions to populate output dict
396394
output = {}
397395
for name, obj in self._iterate_slices():
398396
try:
399-
_aggregate(result.ravel(), counts.ravel(),
400-
self._generator_factory(obj))
401-
# TODO: same mask for every column...
402-
output[name] = result.ravel().copy()
403-
result.fill(np.nan)
397+
result, counts = self._aggregate_series(obj, agg_func,
398+
group_index, ngroups)
399+
output[name] = result
404400
except TypeError:
405401
continue
406402

@@ -410,6 +406,39 @@ def _aggregate(output, counts, generator, shape_axis=0):
410406

411407
return self._wrap_aggregated_output(output, mask)
412408

409+
def _aggregate_series(self, obj, func, group_index, ngroups):
410+
try:
411+
return self._aggregate_series_fast(obj, func, group_index, ngroups)
412+
except Exception:
413+
return self._aggregate_series_pure_python(obj, func, ngroups)
414+
415+
def _aggregate_series_fast(self, obj, func, group_index, ngroups):
416+
if obj.index._has_complex_internals:
417+
raise TypeError('Incompatible index for Cython grouper')
418+
419+
# avoids object / Series creation overhead
420+
dummy = obj[:0]
421+
indexer = lib.groupsort_indexer(group_index, ngroups)
422+
obj = obj.take(indexer)
423+
group_index = group_index.take(indexer)
424+
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
425+
dummy)
426+
result, counts = grouper.get_result()
427+
return result, counts
428+
429+
def _aggregate_series_pure_python(self, obj, func, ngroups):
430+
counts = np.zeros(ngroups, dtype=int)
431+
result = np.empty(ngroups, dtype=float)
432+
result.fill(np.nan)
433+
434+
for label, group in self._generator_factory(obj):
435+
if group is None:
436+
continue
437+
counts[label] = group.shape[0]
438+
result[label] = func(group)
439+
440+
return result, counts
441+
413442
def _python_apply_general(self, func, *args, **kwargs):
414443
result_keys = []
415444
result_values = []

pandas/core/index.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def names(self):
9393
def _constructor(self):
9494
return Index
9595

96+
@property
97+
def _has_complex_internals(self):
98+
# to disable groupby tricks in MultiIndex
99+
return False
100+
96101
def summary(self):
97102
if len(self) > 0:
98103
index_summary = ', %s to %s' % (str(self[0]), str(self[-1]))
@@ -924,6 +929,11 @@ def _is_legacy_format(self):
924929
contents = self.view(np.ndarray)
925930
return len(contents) > 0 and not isinstance(contents[0], tuple)
926931

932+
@property
933+
def _has_complex_internals(self):
934+
# to disable groupby tricks
935+
return True
936+
927937
def get_level_values(self, level):
928938
"""
929939
Return vector of label values for requested level, equal to the length

pandas/src/reduce.pyx

Lines changed: 59 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -95,28 +95,23 @@ cdef class SeriesGrouper:
9595
overhead
9696
'''
9797
cdef:
98-
Py_ssize_t nresults, ngroup
99-
object arr, dummy, f, labels, counts
98+
Py_ssize_t nresults, ngroups
10099
bint passed_dummy
101100

102-
def __init__(self, object arr, object f, object labels, ngroups,
103-
dummy=None):
104-
n = len(arr)
101+
cdef public:
102+
object arr, index, dummy, f, labels
105103

106-
assert(arr.ndim == 1)
107-
108-
if not arr.flags.contiguous:
109-
arr = arr.copy()
104+
def __init__(self, object series, object f, object labels,
105+
Py_ssize_t ngroups, object dummy):
106+
n = len(series)
110107

111108
self.labels = labels
112109
self.f = f
113-
self.arr = arr
110+
self.arr = series
111+
self.index = series.index
114112

115113
self.dummy = self._check_dummy(dummy)
116114
self.passed_dummy = dummy is not None
117-
118-
self.counts = np.zeros(ngroups, dtype='i4')
119-
120115
self.ngroups = ngroups
121116

122117
def _check_dummy(self, dummy=None):
@@ -125,42 +120,25 @@ cdef class SeriesGrouper:
125120
else:
126121
if dummy.dtype != self.arr.dtype:
127122
raise ValueError('Dummy array must be same dtype')
128-
if len(dummy) != self.chunksize:
129-
raise ValueError('Dummy array must be length %d' %
130-
self.chunksize)
131-
132123
return dummy
133124

134125
def get_result(self):
135126
cdef:
136-
char* dummy_buf
137-
ndarray arr, result, chunk
127+
ndarray arr, result
138128
ndarray[int32_t] labels, counts
139-
Py_ssize_t i, group_size, n, lab
140-
flatiter it
141-
npy_intp *shape
142-
object res
129+
Py_ssize_t i, n, group_size, lab
130+
object res, chunk
143131
bint initialized = 0
144-
tuple args
145-
object kwds
132+
Slider vslider, islider
146133

147134
labels = self.labels
148-
counts = self.counts
149-
150-
arr = self.arr
135+
counts = np.zeros(self.ngroups, dtype='i4')
151136
chunk = self.dummy
152-
153-
dummy_buf = chunk.data
154-
chunk.data = arr.data
155-
156-
shape = chunk.shape
157137
group_size = 0
158-
n = len(arr)
138+
n = len(self.arr)
159139

160-
args = cpython.PyTuple_New(1)
161-
kwds = {}
162-
cpython.PyTuple_SET_ITEM(args, 0, chunk)
163-
cpython.Py_INCREF(chunk)
140+
vslider = Slider(self.arr, self.dummy)
141+
islider = Slider(self.index, self.dummy.index)
164142

165143
try:
166144
for i in range(n):
@@ -169,33 +147,32 @@ cdef class SeriesGrouper:
169147
lab = labels[i]
170148

171149
if i == n - 1 or lab != labels[i + 1]:
172-
chunk.shape[0] = group_size
150+
islider.set_length(group_size)
151+
vslider.set_length(group_size)
173152

174-
res = cpython.PyObject_Call(self.f, args, kwds)
153+
res = self.f(chunk)
175154

176-
# res = self.f(chunk)
177155
if not initialized:
178156
result = self._get_result_array(res)
179-
it = <flatiter> PyArray_IterNew(result)
180157
initialized = 1
181158

182-
PyArray_ITER_GOTO1D(it, lab)
183-
PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
159+
util.assign_value_1d(result, lab, res)
184160
counts[lab] = group_size
161+
islider.advance(group_size)
162+
vslider.advance(group_size)
185163

186-
chunk.data = chunk.data + group_size
187164
group_size = 0
188165
except:
189166
raise
190167
finally:
191168
# so we don't free the wrong memory
192-
chunk.shape[0] = 0
193-
chunk.data = dummy_buf
169+
islider.cleanup()
170+
vslider.cleanup()
194171

195172
if result.dtype == np.object_:
196173
result = maybe_convert_objects(result)
197174

198-
return result
175+
return result, counts
199176

200177
def _get_result_array(self, object res):
201178
try:
@@ -207,6 +184,40 @@ cdef class SeriesGrouper:
207184
raise ValueError('function does not reduce')
208185
return result
209186

187+
cdef class Slider:
188+
'''
189+
Only handles contiguous data for now
190+
'''
191+
cdef:
192+
ndarray values, buf
193+
Py_ssize_t stride, orig_len
194+
char *orig_data
195+
196+
def __init__(self, object values, object buf):
197+
assert(values.ndim == 1)
198+
if not values.flags.contiguous:
199+
values = values.copy()
200+
201+
assert(values.dtype == buf.dtype)
202+
self.values = values
203+
self.buf = buf
204+
self.stride = values.dtype.itemsize
205+
206+
self.orig_data = self.buf.data
207+
self.orig_len = self.buf.shape[0]
208+
209+
self.buf.data = self.values.data
210+
211+
cdef inline advance(self, Py_ssize_t k):
212+
self.buf.data = <char*> self.buf.data + self.stride * k
213+
214+
cdef inline set_length(self, Py_ssize_t length):
215+
self.buf.shape[0] = length
216+
217+
cdef inline cleanup(self):
218+
self.buf.shape[0] = self.orig_len
219+
self.buf.data = self.orig_data
220+
210221
def reduce(arr, f, axis=0, dummy=None):
211222
reducer = Reducer(arr, f, axis=axis, dummy=dummy)
212223
return reducer.get_result()

pandas/src/sandbox.pyx

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,23 @@ def bench_typecheck2(ndarray[object] arr):
5151
for i in range(n):
5252
PyArray_Check(buf[i])
5353

54+
55+
def foo(object _chunk, object _arr):
56+
cdef:
57+
char* dummy_buf
58+
ndarray arr, result, chunk
59+
60+
arr = _arr
61+
chunk = _chunk
62+
63+
dummy_buf = chunk.data
64+
chunk.data = arr.data
65+
66+
shape = chunk.shape
67+
group_size = 0
68+
n = len(arr)
69+
70+
inc = arr.dtype.itemsize
71+
72+
# chunk.shape[0] = 100
73+
return chunk

pandas/tools/util.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from pandas.core.index import Index
2+
3+
def match(needles, haystack):
4+
haystack = Index(haystack)
5+
needles = Index(needles)
6+
return haystack.get_indexer(needles)

Comments (0)