PERF: apply perf enhancements #6024


Merged · 3 commits · Jan 21, 2014
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -88,6 +88,7 @@ Improvements to existing features
- perf improvements in indexing with object dtypes (:issue:`5968`)
- improved dtype inference for ``timedelta``-like values passed to constructors (:issue:`5458`,:issue:`5689`)
- escape special characters when writing to latex (:issue:`5374`)
- perf improvements in ``DataFrame.apply`` (:issue:`6013`)

.. _release.bug_fixes-0.13.1:

29 changes: 17 additions & 12 deletions pandas/core/frame.py
@@ -24,7 +24,7 @@
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _maybe_upcast, _is_sequence,
_infer_dtype_from_scalar, _values_from_object,
_DATELIKE_DTYPES, is_list_like)
is_list_like)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_droplevels,
@@ -1581,7 +1581,7 @@ def _ixs(self, i, axis=0, copy=False):
else:
new_values, copy = self._data.fast_2d_xs(i, copy=copy)
result = Series(new_values, index=self.columns,
name=self.index[i])
name=self.index[i], dtype=new_values.dtype)
result.is_copy=copy
return result

@@ -3324,16 +3324,15 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
if reduce:
try:

if self._is_mixed_type: # maybe a hack for now
raise AssertionError('Must be mixed type DataFrame')
# this is the fast-path
values = self.values
dummy = Series(NA, index=self._get_axis(axis),
dtype=values.dtype)

labels = self._get_agg_axis(axis)
result = lib.reduce(values, func, axis=axis, dummy=dummy,
labels=labels)
return Series(result, index=self._get_agg_axis(axis))
return Series(result, index=labels)
except Exception:
pass

@@ -3393,12 +3392,12 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
result = result.T
result = result.convert_objects(copy=False)

return result
else:
s = Series(results)
s.index = res_index

return s
result = Series(results)
result.index = res_index

return result

def _apply_broadcast(self, func, axis):
if axis == 0:
@@ -3932,8 +3931,7 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
labels = self._get_agg_axis(axis)

# exclude timedelta/datetime unless we are uniform types
if axis == 1 and self._is_mixed_type and len(set(self.dtypes) &
_DATELIKE_DTYPES):
if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
numeric_only = True

if numeric_only is None:
@@ -3945,7 +3943,14 @@
# try by-column first
if filter_type is None and axis == 0:
try:
return self.apply(f).iloc[0]

# this can end up with a non-reduction,
# but not always; if the dtypes are mixed
# with datelike then we need to make sure a Series comes back
result = self.apply(f, reduce=False)
if result.ndim == self.ndim:
result = result.iloc[0]
return result
except:
pass

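A quick sketch of the two paths this hunk separates (illustrative only, not part of the PR; `reduce` is the existing DataFrame.apply keyword that the new _reduce code now passes explicitly):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(1000, 4), columns=list('abcd'))

# homogeneous frame: lib.reduce gets the raw 2-d block plus a dummy
# Series, so no per-column Series construction is needed
out = df.apply(np.sum)    # Series indexed by 'a'..'d'

# mixed frame: _reduce now calls apply(f, reduce=False) and only
# collapses via .iloc[0] when apply really did reduce each column
mixed = pd.DataFrame({'a': np.arange(3),
                      'b': pd.date_range('1/1/2000', periods=3)})
out2 = mixed.count()      # Series([3, 3], index=['a', 'b'])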
16 changes: 15 additions & 1 deletion pandas/core/generic.py
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
copy : boolean, default False
"""
_internal_names = ['_data', '_cacher', '_item_cache', '_cache',
'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
'is_copy', '_subtyp', '_index', '_default_kind',
'_default_fill_value', '__array_struct__', '__array_interface__']
_internal_names_set = set(_internal_names)
_metadata = []
is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
return self._constructor(result, **d).__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
# it is slower
#@property
#def __array_interface__(self):
# """ provide numpy array interface method """
# values = self.values
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)

def to_dense(self):
"Return dense representation of NDFrame (as opposed to sparse)"
# compat
@@ -1828,6 +1837,11 @@ def _is_numeric_mixed_type(self):
f = lambda: self._data.is_numeric_mixed_type
return self._protect_consolidate(f)

@property
def _is_datelike_mixed_type(self):
f = lambda: self._data.is_datelike_mixed_type
return self._protect_consolidate(f)

def _protect_consolidate(self, f):
blocks_before = len(self._data.blocks)
result = f()
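For context (a hedged sketch, not from the PR): NDFrame.__getattr__ treats unknown attributes as column lookups, and numpy probes __array_struct__/__array_interface__ on every array conversion, so registering those names as internal lets the probe miss cheaply. The new _is_datelike_mixed_type property mirrors _is_numeric_mixed_type and feeds the axis=1 numeric_only decision in DataFrame._reduce:

import numpy as np
import pandas as pd

s = pd.Series(np.arange(5))
arr = np.asarray(s)   # numpy probes the array-interface attributes here

df = pd.DataFrame({'a': [1, 2],
                   'b': pd.date_range('1/1/2000', periods=2)})
print(df._is_datelike_mixed_type)   # True (internal API, as added here)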
11 changes: 11 additions & 0 deletions pandas/core/internals.py
@@ -83,6 +83,11 @@ def _consolidate_key(self):
def _is_single_block(self):
return self.ndim == 1

@property
def is_datelike(self):
""" return True if I am a non-datelike """
return self.is_datetime or self.is_timedelta

@property
def fill_value(self):
return np.nan
@@ -2439,6 +2444,12 @@ def is_numeric_mixed_type(self):
self._consolidate_inplace()
return all([block.is_numeric for block in self.blocks])

@property
def is_datelike_mixed_type(self):
# Warning, consolidation needs to get checked upstairs
self._consolidate_inplace()
return any([block.is_datelike for block in self.blocks])

def get_block_map(self, copy=False, typ=None, columns=None,
is_numeric=False, is_bool=False):
""" return a dictionary mapping the ftype -> block list
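A rough public-dtype equivalent of the new block-level check (the real property walks consolidated blocks, hence the consolidation warning above; this helper is hypothetical, for illustration only):

import numpy as np
import pandas as pd

def is_datelike_mixed(df):
    # any datetime64/timedelta64 column marks the frame as datelike-mixed
    return any(dtype.type in (np.datetime64, np.timedelta64)
               for dtype in df.dtypes)

df = pd.DataFrame({'a': [1.0, 2.0],
                   'b': pd.date_range('1/1/2000', periods=2)})
print(is_datelike_mixed(df))   # True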
56 changes: 33 additions & 23 deletions pandas/src/reduce.pyx
@@ -35,25 +35,26 @@ cdef class Reducer:
self.chunksize = k
self.increment = k * arr.dtype.itemsize


self.f = f
self.arr = arr
self.typ = None
self.labels = labels
self.dummy, index = self._check_dummy(dummy)
self.dummy, index = self._check_dummy(dummy=dummy)

if axis == 0:
self.labels = index
self.index = labels
else:
self.labels = labels
self.index = index
self.labels = labels
self.index = index

def _check_dummy(self, dummy=None):
cdef object index

if dummy is None:
dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
index = None

# our ref is stolen later since we are creating this array
# in cython, so increment first
Py_INCREF(dummy)
else:
if dummy.dtype != self.arr.dtype:
raise ValueError('Dummy array must be same dtype')
@@ -76,39 +77,48 @@ cdef class Reducer:
ndarray arr, result, chunk
Py_ssize_t i, incr
flatiter it
object res, tchunk, name, labels, index, typ
object res, name, labels, index
object cached_typ = None

arr = self.arr
chunk = self.dummy
dummy_buf = chunk.data
chunk.data = arr.data
labels = self.labels
index = self.index
typ = self.typ
incr = self.increment

try:
for i in range(self.nresults):
# need to make sure that we pass an actual object to the function
# and not just an ndarray
if typ is not None:
try:
if labels is not None:
name = labels[i]

if labels is not None:
name = util.get_value_at(labels, i)
else:
name = None

# create the cached type on the first iteration;
# on later ones just reassign the data
if i == 0:

if self.typ is not None:

# recreate with the index if supplied
if index is not None:
tchunk = typ(chunk, index=index, name=name, fastpath=True)

cached_typ = self.typ(chunk, index=index, name=name)

else:
tchunk = typ(chunk, name=name)

except:
tchunk = chunk
typ = None
else:
tchunk = chunk
# use the passed typ, sans index
cached_typ = self.typ(chunk, name=name)

res = self.f(tchunk)
# use the cached_typ if possible
if cached_typ is not None:
cached_typ._data._block.values = chunk
cached_typ.name = name
res = self.f(cached_typ)
else:
res = self.f(chunk)

if hasattr(res,'values'):
res = res.values
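The core trick in this file, sketched in plain Python against the 0.13-era internals the diff itself relies on (cached_typ._data._block.values); an illustration, not the Cython code:

import numpy as np
import pandas as pd

values = np.random.randn(50, 1000)   # 50 "columns", each of length 1000
index = pd.Index(np.arange(1000))

# build the Series wrapper once...
cached = pd.Series(values[0].copy(), index=index)

out = np.empty(50)
for i in range(50):
    # ...then swap the underlying block data each iteration instead of
    # paying the Series constructor per column
    cached._data._block.values = values[i]
    out[i] = cached.sum()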
60 changes: 40 additions & 20 deletions pandas/tests/test_frame.py
@@ -9035,6 +9035,16 @@ def test_apply_mixed_dtype_corner(self):
expected = Series(np.nan, index=[])
assert_series_equal(result, expected)

df = DataFrame({'A': ['foo'],
'B': [1.]})
result = df.apply(lambda x: x['A'], axis=1)
expected = Series(['foo'],index=[0])
assert_series_equal(result, expected)

result = df.apply(lambda x: x['B'], axis=1)
expected = Series([1.],index=[0])
assert_series_equal(result, expected)

def test_apply_empty_infer_type(self):
no_cols = DataFrame(index=['a', 'b', 'c'])
no_index = DataFrame(columns=['a', 'b', 'c'])
@@ -9970,7 +9980,8 @@ def test_count(self):
self._check_stat_op('count', f,
has_skipna=False,
has_numeric_only=True,
check_dtypes=False)
check_dtype=False,
check_dates=True)

# corner case
frame = DataFrame()
@@ -9999,10 +10010,9 @@ def test_count(self):
def test_sum(self):
self._check_stat_op('sum', np.sum, has_numeric_only=True)

def test_sum_mixed_numeric(self):
raise nose.SkipTest("skipping for now")
# mixed types
self._check_stat_op('sum', np.sum, frame = self.mixed_float, has_numeric_only=True)
# mixed types (with upcasting happening)
self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
has_numeric_only=True, check_dtype=False, check_less_precise=True)

def test_stat_operators_attempt_obj_array(self):
data = {
@@ -10028,7 +10038,7 @@ def test_stat_operators_attempt_obj_array(self):
assert_series_equal(result, expected)

def test_mean(self):
self._check_stat_op('mean', np.mean)
self._check_stat_op('mean', np.mean, check_dates=True)

def test_product(self):
self._check_stat_op('product', np.prod)
@@ -10039,10 +10049,10 @@ def wrapper(x):
return np.nan
return np.median(x)

self._check_stat_op('median', wrapper)
self._check_stat_op('median', wrapper, check_dates=True)

def test_min(self):
self._check_stat_op('min', np.min)
self._check_stat_op('min', np.min, check_dates=True)
self._check_stat_op('min', np.min, frame=self.intframe)

def test_cummin(self):
@@ -10092,7 +10102,7 @@ def test_cummax(self):
self.assertEqual(np.shape(cummax_xs), np.shape(self.tsframe))

def test_max(self):
self._check_stat_op('max', np.max)
self._check_stat_op('max', np.max, check_dates=True)
self._check_stat_op('max', np.max, frame=self.intframe)

def test_mad(self):
@@ -10154,7 +10164,8 @@ def alt(x):
assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar'))

def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
has_numeric_only=False, check_dtypes=True):
has_numeric_only=False, check_dtype=True, check_dates=False,
check_less_precise=False):
if frame is None:
frame = self.frame
# set some NAs
@@ -10163,14 +10174,16 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,

f = getattr(frame, name)

if not ('max' in name or 'min' in name or 'count' in name):
if check_dates:
df = DataFrame({'b': date_range('1/1/2001', periods=2)})
_f = getattr(df, name)
#print(df)
self.assertFalse(len(_f()))
result = _f()
self.assert_(isinstance(result, Series))

df['a'] = lrange(len(df))
self.assert_(len(getattr(df, name)()))
result = getattr(df, name)()
self.assert_(isinstance(result, Series))
self.assert_(len(result))

if has_skipna:
def skipna_wrapper(x):
@@ -10184,21 +10197,27 @@ def wrapper(x):

result0 = f(axis=0, skipna=False)
result1 = f(axis=1, skipna=False)
assert_series_equal(result0, frame.apply(wrapper))
assert_series_equal(result0, frame.apply(wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
assert_series_equal(result1, frame.apply(wrapper, axis=1),
check_dtype=False) # HACK: win32
check_dtype=False,
check_less_precise=check_less_precise) # HACK: win32
else:
skipna_wrapper = alternative
wrapper = alternative

result0 = f(axis=0)
result1 = f(axis=1)
assert_series_equal(result0, frame.apply(skipna_wrapper))
assert_series_equal(result0, frame.apply(skipna_wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
check_dtype=False)
check_dtype=False,
check_less_precise=check_less_precise)

# check dtypes
if check_dtypes:
if check_dtype:
lcd_dtype = frame.values.dtype
self.assert_(lcd_dtype == result0.dtype)
self.assert_(lcd_dtype == result1.dtype)
@@ -10331,7 +10350,8 @@ def wrapper(x):
return np.nan
return np.median(x)

self._check_stat_op('median', wrapper, frame=self.intframe, check_dtypes=False)
self._check_stat_op('median', wrapper, frame=self.intframe,
check_dtype=False, check_dates=True)

def test_quantile(self):
from pandas.compat.scipy import scoreatpercentile
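For reference, the comparison knobs threaded through _check_stat_op map directly onto pandas.util.testing.assert_series_equal, which the tests above call; a minimal usage sketch (values chosen arbitrarily):

import pandas as pd
import pandas.util.testing as tm

left = pd.Series([1.0, 2.0], dtype='float32')
right = pd.Series([1.0000001, 2.0], dtype='float64')

# relax the dtype check and compare to fewer decimals, as the
# mixed-float sum test now does
tm.assert_series_equal(left, right, check_dtype=False,
                       check_less_precise=True)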