Skip to content

Commit 9e37a7d

Browse files
committed
PERF: perf enhancements for DataFrame.apply (GH6013)
1 parent 26d66c1 commit 9e37a7d

File tree

5 files changed

+49
-30
lines changed

5 files changed

+49
-30
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Improvements to existing features
8888
- perf improvements in indexing with object dtypes (:issue:`5968`)
8989
- improved dtype inference for ``timedelta``-like objects passed to constructors (:issue:`5458`, :issue:`5689`)
9090
- escape special characters when writing to latex (:issue:`5374`)
91+
- perf improvements in ``DataFrame.apply`` (:issue:`6013`)
9192

9293
.. _release.bug_fixes-0.13.1:
9394

pandas/core/frame.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -3324,16 +3324,16 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
33243324
if reduce:
33253325
try:
33263326

3327-
if self._is_mixed_type: # maybe a hack for now
3328-
raise AssertionError('Must be mixed type DataFrame')
3329-
values = self.values
3330-
dummy = Series(NA, index=self._get_axis(axis),
3327+
# can only work with numeric data in the fast path
3328+
numeric = self._get_numeric_data()
3329+
values = numeric.values
3330+
dummy = Series(NA, index=numeric._get_axis(axis),
33313331
dtype=values.dtype)
33323332

33333333
labels = self._get_agg_axis(axis)
33343334
result = lib.reduce(values, func, axis=axis, dummy=dummy,
33353335
labels=labels)
3336-
return Series(result, index=self._get_agg_axis(axis))
3336+
return Series(result, index=labels)
33373337
except Exception:
33383338
pass
33393339

pandas/core/generic.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
7878
copy : boolean, default False
7979
"""
8080
_internal_names = ['_data', '_cacher', '_item_cache', '_cache',
81-
'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
81+
'is_copy', '_subtyp', '_index', '_default_kind',
82+
'_default_fill_value','__array_struct__','__array_interface__']
8283
_internal_names_set = set(_internal_names)
8384
_metadata = []
8485
is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
698699
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
699700
return self._constructor(result, **d).__finalize__(self)
700701

702+
# ideally we would define this to avoid the getattr checks, but
703+
# doing so is slower
704+
#@property
705+
#def __array_interface__(self):
706+
# """ provide numpy array interface method """
707+
# values = self.values
708+
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)
709+
701710
def to_dense(self):
702711
"Return dense representation of NDFrame (as opposed to sparse)"
703712
# compat

pandas/src/reduce.pyx

+33-23
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,26 @@ cdef class Reducer:
3535
self.chunksize = k
3636
self.increment = k * arr.dtype.itemsize
3737

38+
3839
self.f = f
3940
self.arr = arr
4041
self.typ = None
4142
self.labels = labels
42-
self.dummy, index = self._check_dummy(dummy)
43+
self.dummy, index = self._check_dummy(dummy=dummy)
4344

44-
if axis == 0:
45-
self.labels = index
46-
self.index = labels
47-
else:
48-
self.labels = labels
49-
self.index = index
45+
self.labels = labels
46+
self.index = index
5047

5148
def _check_dummy(self, dummy=None):
5249
cdef object index
5350

5451
if dummy is None:
5552
dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
5653
index = None
54+
55+
# our ref is stolen later since we are creating this array
56+
# in cython, so increment first
57+
Py_INCREF(dummy)
5758
else:
5859
if dummy.dtype != self.arr.dtype:
5960
raise ValueError('Dummy array must be same dtype')
@@ -76,39 +77,48 @@ cdef class Reducer:
7677
ndarray arr, result, chunk
7778
Py_ssize_t i, incr
7879
flatiter it
79-
object res, tchunk, name, labels, index, typ
80+
object res, name, labels, index
81+
object cached_typ = None
8082

8183
arr = self.arr
8284
chunk = self.dummy
8385
dummy_buf = chunk.data
8486
chunk.data = arr.data
8587
labels = self.labels
8688
index = self.index
87-
typ = self.typ
8889
incr = self.increment
8990

9091
try:
9192
for i in range(self.nresults):
92-
# need to make sure that we pass an actual object to the function
93-
# and not just an ndarray
94-
if typ is not None:
95-
try:
96-
if labels is not None:
97-
name = labels[i]
93+
94+
if labels is not None:
95+
name = util.get_value_at(labels, i)
96+
else:
97+
name = None
98+
99+
# create the cached type
100+
# each time just reassign the data
101+
if i == 0:
102+
103+
if self.typ is not None:
98104

99105
# recreate with the index if supplied
100106
if index is not None:
101-
tchunk = typ(chunk, index=index, name=name, fastpath=True)
107+
108+
cached_typ = self.typ(chunk, index=index, name=name)
109+
102110
else:
103-
tchunk = typ(chunk, name=name)
104111

105-
except:
106-
tchunk = chunk
107-
typ = None
108-
else:
109-
tchunk = chunk
112+
# use the passed typ, sans index
113+
cached_typ = self.typ(chunk, name=name)
110114

111-
res = self.f(tchunk)
115+
# use the cached_typ if possible
116+
if cached_typ is not None:
117+
cached_typ._data._block.values = chunk
118+
cached_typ.name = name
119+
res = self.f(cached_typ)
120+
else:
121+
res = self.f(chunk)
112122

113123
if hasattr(res,'values'):
114124
res = res.values

pandas/tests/test_tseries.py

-1
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,6 @@ def test_int_index(self):
661661
from pandas.core.series import Series
662662

663663
arr = np.random.randn(100, 4)
664-
665664
result = lib.reduce(arr, np.sum, labels=Index(np.arange(4)))
666665
expected = arr.sum(0)
667666
assert_almost_equal(result, expected)

0 commit comments

Comments
 (0)