diff --git a/doc/source/release.rst b/doc/source/release.rst
index f58620020d254..69dc688cd6097 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -88,6 +88,7 @@ Improvements to existing features
   - perf improvments in indexing with object dtypes (:issue:`5968`)
   - improved dtype inference for ``timedelta`` like passed to constructors (:issue:`5458`,:issue:`5689`)
   - escape special characters when writing to latex (:issue: `5374`)
+  - perf improvements in ``DataFrame.apply`` (:issue:`6013`)
 
 .. _release.bug_fixes-0.13.1:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a400bc1b644ba..adb7b2d2b2691 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -24,7 +24,7 @@
 from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
                                 _default_index, _maybe_upcast, _is_sequence,
                                 _infer_dtype_from_scalar, _values_from_object,
-                                _DATELIKE_DTYPES, is_list_like)
+                                is_list_like)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (_maybe_droplevels,
@@ -1581,7 +1581,7 @@ def _ixs(self, i, axis=0, copy=False):
             else:
                 new_values, copy = self._data.fast_2d_xs(i, copy=copy)
                 result = Series(new_values, index=self.columns,
-                                name=self.index[i])
+                                name=self.index[i], dtype=new_values.dtype)
                 result.is_copy=copy
                 return result
 
@@ -3324,8 +3324,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
 
         if reduce:
             try:
-                if self._is_mixed_type:  # maybe a hack for now
-                    raise AssertionError('Must be mixed type DataFrame')
+                # this is the fast-path
                 values = self.values
                 dummy = Series(NA, index=self._get_axis(axis),
                                dtype=values.dtype)
@@ -3333,7 +3332,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
                 labels = self._get_agg_axis(axis)
                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
                                     labels=labels)
-                return Series(result, index=self._get_agg_axis(axis))
+                return Series(result, index=labels)
             except Exception:
                 pass
 
@@ -3393,12 +3392,12 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
                 result = result.T
             result = result.convert_objects(copy=False)
 
-            return result
         else:
-            s = Series(results)
-            s.index = res_index
-            return s
+            result = Series(results)
+            result.index = res_index
+
+        return result
 
     def _apply_broadcast(self, func, axis):
         if axis == 0:
@@ -3932,8 +3931,7 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
         labels = self._get_agg_axis(axis)
 
         # exclude timedelta/datetime unless we are uniform types
-        if axis == 1 and self._is_mixed_type and len(set(self.dtypes) &
-                                                     _DATELIKE_DTYPES):
+        if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
             numeric_only = True
 
         if numeric_only is None:
@@ -3945,7 +3943,14 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
 
                 # try by-column first
                 if filter_type is None and axis == 0:
                     try:
-                        return self.apply(f).iloc[0]
+
+                        # this can end up with a non-reduction
+                        # but not always. if the types are mixed
+                        # with datelike then need to make sure a series
+                        result = self.apply(f,reduce=False)
+                        if result.ndim == self.ndim:
+                            result = result.iloc[0]
+                        return result
                     except:
                         pass
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6393083c182e3..000d6a4f6da9b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
     copy : boolean, default False
     """
     _internal_names = ['_data', '_cacher', '_item_cache', '_cache',
-                       'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
+                       'is_copy', '_subtyp', '_index', '_default_kind',
+                       '_default_fill_value','__array_struct__','__array_interface__']
     _internal_names_set = set(_internal_names)
     _metadata = []
     is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
         d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
         return self._constructor(result, **d).__finalize__(self)
 
+    # ideally we would define this to avoid the getattr checks, but
+    # it is slower
+    #@property
+    #def __array_interface__(self):
+    #    """ provide numpy array interface method """
+    #    values = self.values
+    #    return dict(typestr=values.dtype.str,shape=values.shape,data=values)
+
     def to_dense(self):
         "Return dense representation of NDFrame (as opposed to sparse)"
         # compat
@@ -1828,6 +1837,11 @@ def _is_numeric_mixed_type(self):
         f = lambda: self._data.is_numeric_mixed_type
         return self._protect_consolidate(f)
 
+    @property
+    def _is_datelike_mixed_type(self):
+        f = lambda: self._data.is_datelike_mixed_type
+        return self._protect_consolidate(f)
+
     def _protect_consolidate(self, f):
         blocks_before = len(self._data.blocks)
         result = f()
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 0603746cf9dc5..60e9baf005eb4 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -83,6 +83,11 @@ def _consolidate_key(self):
     @property
     def _is_single_block(self):
         return self.ndim == 1
 
+    @property
+    def is_datelike(self):
+        """ return True if I am a datelike """
+        return self.is_datetime or self.is_timedelta
+
     @property
     def fill_value(self):
         return np.nan
@@ -2439,6 +2444,12 @@ def is_numeric_mixed_type(self):
         self._consolidate_inplace()
         return all([block.is_numeric for block in self.blocks])
 
+    @property
+    def is_datelike_mixed_type(self):
+        # Warning, consolidation needs to get checked upstairs
+        self._consolidate_inplace()
+        return any([block.is_datelike for block in self.blocks])
+
     def get_block_map(self, copy=False, typ=None, columns=None,
                       is_numeric=False, is_bool=False):
         """ return a dictionary mapping the ftype -> block list
diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx
index 13df983c45d53..bfbe0f3ea7938 100644
--- a/pandas/src/reduce.pyx
+++ b/pandas/src/reduce.pyx
@@ -35,18 +35,15 @@ cdef class Reducer:
             self.chunksize = k
             self.increment = k * arr.dtype.itemsize
 
+        self.f = f
         self.arr = arr
         self.typ = None
         self.labels = labels
-        self.dummy, index = self._check_dummy(dummy)
+        self.dummy, index = self._check_dummy(dummy=dummy)
 
-        if axis == 0:
-            self.labels = index
-            self.index = labels
-        else:
-            self.labels = labels
-            self.index = index
+        self.labels = labels
+        self.index = index
 
     def _check_dummy(self, dummy=None):
         cdef object index
@@ -54,6 +51,10 @@ cdef class Reducer:
         if dummy is None:
             dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
             index = None
+
+            # our ref is stolen later since we are creating this array
+            # in cython, so increment first
+            Py_INCREF(dummy)
         else:
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
@@ -76,7 +77,8 @@ cdef class Reducer:
             ndarray arr, result, chunk
             Py_ssize_t i, incr
             flatiter it
-            object res, tchunk, name, labels, index, typ
+            object res, name, labels, index
+            object cached_typ = None
 
         arr = self.arr
         chunk = self.dummy
@@ -84,31 +86,39 @@ cdef class Reducer:
         chunk.data = arr.data
         labels = self.labels
         index = self.index
-        typ = self.typ
         incr = self.increment
 
         try:
             for i in range(self.nresults):
-                # need to make sure that we pass an actual object to the function
-                # and not just an ndarray
-                if typ is not None:
-                    try:
-                        if labels is not None:
-                            name = labels[i]
+
+                if labels is not None:
+                    name = util.get_value_at(labels, i)
+                else:
+                    name = None
+
+                # create the cached type
+                # each time just reassign the data
+                if i == 0:
+
+                    if self.typ is not None:
 
                         # recreate with the index if supplied
                         if index is not None:
-                            tchunk = typ(chunk, index=index, name=name, fastpath=True)
+
+                            cached_typ = self.typ(chunk, index=index, name=name)
+
                         else:
-                            tchunk = typ(chunk, name=name)
-                    except:
-                        tchunk = chunk
-                        typ = None
-                else:
-                    tchunk = chunk
+                            # use the passed typ, sans index
+                            cached_typ = self.typ(chunk, name=name)
 
-                res = self.f(tchunk)
+                # use the cached_typ if possible
+                if cached_typ is not None:
+                    cached_typ._data._block.values = chunk
+                    cached_typ.name = name
+                    res = self.f(cached_typ)
+                else:
+                    res = self.f(chunk)
 
                 if hasattr(res,'values'):
                     res = res.values
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 3b6e4ba445ce0..f122a88fe7a25 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -9035,6 +9035,16 @@ def test_apply_mixed_dtype_corner(self):
         expected = Series(np.nan, index=[])
         assert_series_equal(result, expected)
 
+        df = DataFrame({'A': ['foo'],
+                        'B': [1.]})
+        result = df.apply(lambda x: x['A'], axis=1)
+        expected = Series(['foo'],index=[0])
+        assert_series_equal(result, expected)
+
+        result = df.apply(lambda x: x['B'], axis=1)
+        expected = Series([1.],index=[0])
+        assert_series_equal(result, expected)
+
     def test_apply_empty_infer_type(self):
         no_cols = DataFrame(index=['a', 'b', 'c'])
         no_index = DataFrame(columns=['a', 'b', 'c'])
@@ -9970,7 +9980,8 @@ def test_count(self):
         self._check_stat_op('count', f,
                             has_skipna=False,
                             has_numeric_only=True,
-                            check_dtypes=False)
+                            check_dtype=False,
+                            check_dates=True)
 
         # corner case
         frame = DataFrame()
@@ -9999,10 +10010,9 @@ def test_count(self):
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
 
-    def test_sum_mixed_numeric(self):
-        raise nose.SkipTest("skipping for now")
-        # mixed types
-        self._check_stat_op('sum', np.sum, frame = self.mixed_float, has_numeric_only=True)
+        # mixed types (with upcasting happening)
+        self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
+                            has_numeric_only=True, check_dtype=False, check_less_precise=True)
 
     def test_stat_operators_attempt_obj_array(self):
         data = {
@@ -10028,7 +10038,7 @@ def test_stat_operators_attempt_obj_array(self):
         assert_series_equal(result, expected)
 
     def test_mean(self):
-        self._check_stat_op('mean', np.mean)
+        self._check_stat_op('mean', np.mean, check_dates=True)
 
     def test_product(self):
         self._check_stat_op('product', np.prod)
@@ -10039,10 +10049,10 @@ def wrapper(x):
                 return np.nan
             return np.median(x)
 
-        self._check_stat_op('median', wrapper)
+        self._check_stat_op('median', wrapper, check_dates=True)
 
     def test_min(self):
-        self._check_stat_op('min', np.min)
+        self._check_stat_op('min', np.min, check_dates=True)
         self._check_stat_op('min', np.min, frame=self.intframe)
 
     def test_cummin(self):
@@ -10092,7 +10102,7 @@ def test_cummax(self):
         self.assertEqual(np.shape(cummax_xs), np.shape(self.tsframe))
 
     def test_max(self):
-        self._check_stat_op('max', np.max)
+        self._check_stat_op('max', np.max, check_dates=True)
         self._check_stat_op('max', np.max, frame=self.intframe)
 
     def test_mad(self):
@@ -10154,7 +10164,8 @@ def alt(x):
         assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar'))
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
-                       has_numeric_only=False, check_dtypes=True):
+                       has_numeric_only=False, check_dtype=True, check_dates=False,
+                       check_less_precise=False):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -10163,14 +10174,16 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
 
         f = getattr(frame, name)
 
-        if not ('max' in name or 'min' in name or 'count' in name):
+        if check_dates:
             df = DataFrame({'b': date_range('1/1/2001', periods=2)})
             _f = getattr(df, name)
-            #print(df)
-            self.assertFalse(len(_f()))
+            result = _f()
+            self.assert_(isinstance(result, Series))
 
             df['a'] = lrange(len(df))
-            self.assert_(len(getattr(df, name)()))
+            result = getattr(df, name)()
+            self.assert_(isinstance(result, Series))
+            self.assert_(len(result))
 
         if has_skipna:
             def skipna_wrapper(x):
@@ -10184,21 +10197,27 @@ def wrapper(x):
 
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
-            assert_series_equal(result0, frame.apply(wrapper))
+            assert_series_equal(result0, frame.apply(wrapper),
+                                check_dtype=check_dtype,
+                                check_less_precise=check_less_precise)
             assert_series_equal(result1, frame.apply(wrapper, axis=1),
-                                check_dtype=False)  # HACK: win32
+                                check_dtype=False,
+                                check_less_precise=check_less_precise)  # HACK: win32
         else:
             skipna_wrapper = alternative
             wrapper = alternative
 
         result0 = f(axis=0)
         result1 = f(axis=1)
-        assert_series_equal(result0, frame.apply(skipna_wrapper))
+        assert_series_equal(result0, frame.apply(skipna_wrapper),
+                            check_dtype=check_dtype,
+                            check_less_precise=check_less_precise)
         assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
-                            check_dtype=False)
+                            check_dtype=False,
+                            check_less_precise=check_less_precise)
 
         # check dtypes
-        if check_dtypes:
+        if check_dtype:
             lcd_dtype = frame.values.dtype
             self.assert_(lcd_dtype == result0.dtype)
             self.assert_(lcd_dtype == result1.dtype)
@@ -10331,7 +10350,8 @@ def wrapper(x):
                 return np.nan
             return np.median(x)
 
-        self._check_stat_op('median', wrapper, frame=self.intframe, check_dtypes=False)
+        self._check_stat_op('median', wrapper, frame=self.intframe,
+                            check_dtype=False, check_dates=True)
 
     def test_quantile(self):
         from pandas.compat.scipy import scoreatpercentile
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
index 7215b9dbf934b..5de5eee0ec011 100644
--- a/pandas/tests/test_tseries.py
+++ b/pandas/tests/test_tseries.py
@@ -661,7 +661,6 @@ def test_int_index(self):
         from pandas.core.series import Series
 
         arr = np.random.randn(100, 4)
-
         result = lib.reduce(arr, np.sum, labels=Index(np.arange(4)))
         expected = arr.sum(0)
         assert_almost_equal(result, expected)
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
index 88d773319817d..c3425389684ae 100644
--- a/vb_suite/frame_methods.py
+++ b/vb_suite/frame_methods.py
@@ -324,8 +324,43 @@ def f(K=100):
 df = DataFrame({ i:s for i in range(1028) })
 """
 frame_apply_user_func = Benchmark('df.apply(lambda x: np.corrcoef(x,s)[0,1])', setup,
+                                  name = 'frame_apply_user_func',
                                   start_date=datetime(2012,1,1))
 
+setup = common_setup + """
+df = DataFrame(np.random.randn(1000,100))
+"""
+frame_apply_lambda_mean = Benchmark('df.apply(lambda x: x.sum())', setup,
+                                    name = 'frame_apply_lambda_mean',
+                                    start_date=datetime(2012,1,1))
+setup = common_setup + """
+df = DataFrame(np.random.randn(1000,100))
+"""
+frame_apply_np_mean = Benchmark('df.apply(np.mean)', setup,
+                                name = 'frame_apply_np_mean',
+                                start_date=datetime(2012,1,1))
+
+setup = common_setup + """
+df = DataFrame(np.random.randn(1000,100))
+"""
+frame_apply_pass_thru = Benchmark('df.apply(lambda x: x)', setup,
+                                  name = 'frame_apply_pass_thru',
+                                  start_date=datetime(2012,1,1))
+
+setup = common_setup + """
+df = DataFrame(np.random.randn(1000,100))
+"""
+frame_apply_axis_1 = Benchmark('df.apply(lambda x: x+1,axis=1)', setup,
+                               name = 'frame_apply_axis_1',
+                               start_date=datetime(2012,1,1))
+
+setup = common_setup + """
+df = DataFrame(np.random.randn(1000,3),columns=list('ABC'))
+"""
+frame_apply_ref_by_name = Benchmark('df.apply(lambda x: x["A"] + x["B"],axis=1)', setup,
+                                    name = 'frame_apply_ref_by_name',
+                                    start_date=datetime(2012,1,1))
+
 
 #----------------------------------------------------------------------
 # dtypes
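As a quick illustration (not part of the patch above), here is a minimal Python sketch of the behaviour the new tests and vb_suite benchmarks exercise. It assumes a pandas build that includes this change; the frame contents are made up for the example.

import numpy as np
from pandas import DataFrame

# mixed-dtype frame, mirroring the new test_apply_mixed_dtype_corner case
df = DataFrame({'A': ['foo'], 'B': [1.]})

# a row-wise apply that picks out a single column comes back as a Series
# aligned on the original index (this is what the added tests assert)
print(df.apply(lambda x: x['A'], axis=1))   # 0    foo
print(df.apply(lambda x: x['B'], axis=1))   # 0    1.0

# on a homogeneous frame, a column-wise reduction is eligible for the
# Cython Reducer fast path, which now reuses a single cached Series and
# reassigns its block values for each column rather than constructing a
# new Series on every iteration
num = DataFrame(np.random.randn(1000, 100))
print(num.apply(np.mean).head())

The pass-through, axis=1, and reference-by-name benchmarks added to vb_suite/frame_methods.py exercise these same apply code paths.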