Skip to content

Commit 74f5d6d

Browse files
committed
ENH: Cython Reducer, speed up DataFrame.apply significantly, GH #309
1 parent a1e2798 commit 74f5d6d

File tree

6 files changed

+106
-45
lines changed

6 files changed

+106
-45
lines changed

RELEASE.rst

+3
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,11 @@ pandas 0.5.1
5454
for fast conversion to DataFrame (GH #357)
5555
- Can pass multiple levels to groupby, e.g. `df.groupby(level=[0, 1])` (GH
5656
#103)
57+
- Can sort by multiple columns in `DataFrame.sort_index` (GH #92, PR #362)
5758
- Add fast `get_value` and `put_value` methods to DataFrame and
5859
micro-performance tweaks (GH #360)
60+
- Add `cov` instance methods to Series and DataFrame (GH #194, PR #362)
61+
5962

6063
**Improvements to existing features**
6164

pandas/core/frame.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -1520,7 +1520,6 @@ def sort_index(self, axis=0, by=None, ascending=True):
15201520
else:
15211521
to_sort = self[by].values
15221522

1523-
# stable sort
15241523
indexer = to_sort.argsort()
15251524
else:
15261525
indexer = labels.argsort()
@@ -2187,7 +2186,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False):
21872186
return self._apply_broadcast(func, axis)
21882187

21892188
def _apply_raw(self, func, axis):
2190-
result = np.apply_along_axis(func, axis, self.values)
2189+
try:
2190+
result = lib.reduce(self.values, func, axis=axis)
2191+
except Exception:
2192+
result = np.apply_along_axis(func, axis, self.values)
21912193

21922194
# TODO: mixed type case
21932195
if result.ndim == 2:
@@ -2197,6 +2199,15 @@ def _apply_raw(self, func, axis):
21972199
return Series(result, index=self._get_agg_axis(axis))
21982200

21992201
def _apply_standard(self, func, axis, ignore_failures=False):
2202+
try:
2203+
values = self.values
2204+
dummy = Series(np.nan, index=self._get_axis(axis),
2205+
dtype=values.dtype)
2206+
result = lib.reduce(values, func, axis=axis, dummy=dummy)
2207+
return Series(result, index=self._get_agg_axis(axis))
2208+
except Exception:
2209+
pass
2210+
22002211
if axis == 0:
22012212
series_gen = ((c, self[c]) for c in self.columns)
22022213
res_index = self.columns

pandas/src/reduce.pyx

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from numpy cimport *
2+
import numpy as np
3+
4+
cdef class Reducer:
    '''
    Performs generic reduction operation on a C or Fortran-contiguous ndarray
    while avoiding ndarray construction overhead

    A single "dummy" array of length ``chunksize`` is re-pointed at
    successive slices of the input buffer and passed to ``f``; each return
    value is stored into a 1-dimensional result array.

    Parameters
    ----------
    arr : ndarray
        Must unpack as ``n, k = arr.shape``. Copied to Fortran order
        (axis == 0) or C order (otherwise) when not already contiguous in
        that layout.
    f : callable
        Called with the dummy array once per slice, plus one initial test
        call; must return something assignable to one element of the result.
    axis : int, default 1
        0 produces ``k`` results from chunks of length ``n``; any other value
        produces ``n`` results from chunks of length ``k``.
    dummy : array-like, optional
        Object handed to ``f``. Must have the same dtype as ``arr`` and
        length ``chunksize``. A fresh ``np.empty`` array is used when omitted.
    '''
    cdef:
        Py_ssize_t increment, chunksize, nresults
        object arr, dummy, f

    def __init__(self, object arr, object f, axis=1, dummy=None):
        n, k = arr.shape

        if axis == 0:
            # each chunk must be contiguous in memory, so lay the data out
            # in Fortran order when reducing along axis 0
            if not arr.flags.f_contiguous:
                arr = arr.copy('F')

            self.nresults = k
            self.chunksize = n
            self.increment = n * arr.dtype.itemsize
        else:
            if not arr.flags.c_contiguous:
                arr = arr.copy('C')

            self.nresults = n
            self.chunksize = k
            self.increment = k * arr.dtype.itemsize

        self.f = f
        self.arr = arr
        self.dummy = self._check_dummy(dummy)

    def _check_dummy(self, dummy=None):
        '''
        Return the array that will be passed to ``f``.

        Allocates an uninitialized array of length ``chunksize`` when no
        dummy is given; otherwise validates the dtype and length of the one
        supplied and raises ValueError on mismatch.
        '''
        if dummy is None:
            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
        else:
            if dummy.dtype != self.arr.dtype:
                raise ValueError('Dummy array must be same dtype')
            if len(dummy) != self.chunksize:
                raise ValueError('Dummy array must be length %d' %
                                 self.chunksize)

        return dummy

    def get_result(self):
        '''
        Apply ``f`` to every chunk and return the results as an ndarray of
        length ``nresults`` with the dtype of the input array.

        Raises ValueError('function does not reduce') when the value returned
        by the initial test call cannot be stored in one result element.
        '''
        cdef:
            char* dummy_buf
            ndarray arr, result, chunk
            Py_ssize_t i
            flatiter it

        arr = self.arr
        chunk = self.dummy

        result = np.empty(self.nresults, dtype=self.arr.dtype)
        it = <flatiter> PyArray_IterNew(result)

        # `chunk` is a local, not an attribute of this extension type; the
        # previous `self.chunk` lookup raised AttributeError on every call
        test = self.f(chunk)
        try:
            result[0] = test
        except Exception:
            raise ValueError('function does not reduce')

        # remember the dummy's own buffer, then point it at the input data
        dummy_buf = chunk.data
        chunk.data = arr.data

        try:
            for i in range(self.nresults):
                PyArray_SETITEM(result, PyArray_ITER_DATA(it),
                                self.f(chunk))
                # advance the view by one chunk (increment is in bytes)
                chunk.data = chunk.data + self.increment
                PyArray_ITER_NEXT(it)
        finally:
            # so we don't free the wrong memory
            chunk.data = dummy_buf

        return result
80+
81+
def reduce(arr, f, axis=0, dummy=None):
    '''
    Construct a Reducer from the given arguments and return the value of its
    ``get_result`` method.

    All arguments are forwarded unchanged; note that ``axis`` defaults to 0
    here.
    '''
    return Reducer(arr, f, axis=axis, dummy=dummy).get_result()

pandas/src/sandbox.pyx

+5-41
Original file line numberDiff line numberDiff line change
@@ -3,46 +3,10 @@ import numpy as np
33

44
import_array()
55

6-
cdef class ArrayCruncher:
6+
cdef class SeriesIterator:
77

8-
cdef:
9-
ndarray arr
10-
object f
11-
bint raw
12-
Py_ssize_t N, K
8+
def __init__(self, arr):
9+
pass
1310

14-
def __init__(self, arr, f, axis=0, raw=True):
15-
self.arr = arr
16-
self.f = f
17-
self.raw = raw
18-
self.N, self.K = arr.shape
19-
20-
def reduce(self):
21-
cdef:
22-
char* dummy_buf
23-
ndarray arr, result, chunk
24-
Py_ssize_t i, increment
25-
flatiter it
26-
27-
if not self.arr.flags.c_contiguous:
28-
arr = self.arr.copy('C')
29-
else:
30-
arr = self.arr
31-
32-
increment = self.K * self.arr.dtype.itemsize
33-
chunk = np.empty(self.K, dtype=arr.dtype)
34-
result = np.empty(self.N, dtype=arr.dtype)
35-
it = <flatiter> PyArray_IterNew(result)
36-
37-
dummy_buf = chunk.data
38-
chunk.data = arr.data
39-
40-
for i in range(self.N):
41-
PyArray_SETITEM(result, PyArray_ITER_DATA(it), self.f(chunk))
42-
chunk.data = chunk.data + increment
43-
PyArray_ITER_NEXT(it)
44-
45-
# so we don't free the wrong memory
46-
chunk.data = dummy_buf
47-
48-
return result
11+
def next(self):
12+
pass

pandas/src/tseries.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -473,4 +473,4 @@ include "moments.pyx"
473473
include "reindex.pyx"
474474
include "generated.pyx"
475475
include "parsing.pyx"
476-
476+
include "reduce.pyx"

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def run(self):
275275
cmdclass['sdist'] = CheckSDist
276276

277277
tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments',
278-
'generated', 'parsing']
278+
'generated', 'parsing', 'reduce']
279279
def srcpath(name=None, suffix='.pyx', subdir='src'):
280280
return pjoin('pandas', subdir, name+suffix)
281281

0 commit comments

Comments
 (0)