PERF: apply perf enhancements #6024


Merged · 3 commits · Jan 21, 2014
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -88,6 +88,7 @@ Improvements to existing features
- perf improvements in indexing with object dtypes (:issue:`5968`)
- improved dtype inference for ``timedelta``-like values passed to constructors (:issue:`5458`,:issue:`5689`)
- escape special characters when writing to latex (:issue:`5374`)
- perf improvements in ``DataFrame.apply`` (:issue:`6013`)

.. _release.bug_fixes-0.13.1:

29 changes: 17 additions & 12 deletions pandas/core/frame.py
@@ -24,7 +24,7 @@
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _maybe_upcast, _is_sequence,
_infer_dtype_from_scalar, _values_from_object,
_DATELIKE_DTYPES, is_list_like)
is_list_like)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_droplevels,
@@ -1581,7 +1581,7 @@ def _ixs(self, i, axis=0, copy=False):
else:
new_values, copy = self._data.fast_2d_xs(i, copy=copy)
result = Series(new_values, index=self.columns,
name=self.index[i])
name=self.index[i], dtype=new_values.dtype)
result.is_copy=copy
return result

@@ -3324,16 +3324,15 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
if reduce:
try:

if self._is_mixed_type: # maybe a hack for now
raise AssertionError('Must be mixed type DataFrame')
# this is the fast-path
values = self.values
dummy = Series(NA, index=self._get_axis(axis),
dtype=values.dtype)

labels = self._get_agg_axis(axis)
result = lib.reduce(values, func, axis=axis, dummy=dummy,
labels=labels)
return Series(result, index=self._get_agg_axis(axis))
return Series(result, index=labels)
except Exception:
pass

@@ -3393,12 +3392,12 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
result = result.T
result = result.convert_objects(copy=False)

return result
else:
s = Series(results)
s.index = res_index

return s
result = Series(results)
result.index = res_index

return result

def _apply_broadcast(self, func, axis):
if axis == 0:
@@ -3932,8 +3931,7 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
labels = self._get_agg_axis(axis)

# exclude timedelta/datetime unless we are uniform types
if axis == 1 and self._is_mixed_type and len(set(self.dtypes) &
_DATELIKE_DTYPES):
if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
numeric_only = True

if numeric_only is None:
@@ -3945,7 +3943,14 @@
# try by-column first
if filter_type is None and axis == 0:
try:
return self.apply(f).iloc[0]

# this can end up with a non-reduction,
# but not always; if the dtypes are mixed
# with datelike then we need to make sure a Series comes back
result = self.apply(f, reduce=False)
if result.ndim == self.ndim:
result = result.iloc[0]
return result
except:
pass

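A quick sketch of the two paths this hunk separates (illustrative only, not part of the PR; `reduce` is the existing DataFrame.apply keyword that the new _reduce code now passes explicitly):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(1000, 4), columns=list('abcd'))

# homogeneous frame: lib.reduce gets the raw 2-d block plus a dummy
# Series, so no per-column Series construction is needed
out = df.apply(np.sum)    # Series indexed by 'a'..'d'

# mixed frame: _reduce now calls apply(f, reduce=False) and only
# collapses via .iloc[0] when apply really did reduce each column
mixed = pd.DataFrame({'a': np.arange(3),
                      'b': pd.date_range('1/1/2000', periods=3)})
out2 = mixed.count()      # Series([3, 3], index=['a', 'b'])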
16 changes: 15 additions & 1 deletion pandas/core/generic.py
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
copy : boolean, default False
"""
_internal_names = ['_data', '_cacher', '_item_cache', '_cache',
'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
'is_copy', '_subtyp', '_index', '_default_kind',
'_default_fill_value', '__array_struct__', '__array_interface__']
_internal_names_set = set(_internal_names)
_metadata = []
is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
return self._constructor(result, **d).__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
# it is slower
#@property
#def __array_interface__(self):
# """ provide numpy array interface method """
# values = self.values
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)

def to_dense(self):
"Return dense representation of NDFrame (as opposed to sparse)"
# compat
@@ -1828,6 +1837,11 @@ def _is_numeric_mixed_type(self):
f = lambda: self._data.is_numeric_mixed_type
return self._protect_consolidate(f)

@property
def _is_datelike_mixed_type(self):
f = lambda: self._data.is_datelike_mixed_type
return self._protect_consolidate(f)

def _protect_consolidate(self, f):
blocks_before = len(self._data.blocks)
result = f()
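For context (a hedged sketch, not from the PR): NDFrame.__getattr__ treats unknown attributes as column lookups, and numpy probes __array_struct__/__array_interface__ on every array conversion, so registering those names as internal lets the probe miss cheaply. The new _is_datelike_mixed_type property mirrors _is_numeric_mixed_type and feeds the axis=1 numeric_only decision in DataFrame._reduce:

import numpy as np
import pandas as pd

s = pd.Series(np.arange(5))
arr = np.asarray(s)   # numpy probes the array-interface attributes here

df = pd.DataFrame({'a': [1, 2],
                   'b': pd.date_range('1/1/2000', periods=2)})
print(df._is_datelike_mixed_type)   # True (internal API, as added here)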
11 changes: 11 additions & 0 deletions pandas/core/internals.py
@@ -83,6 +83,11 @@ def _consolidate_key(self):
def _is_single_block(self):
return self.ndim == 1

@property
def is_datelike(self):
""" return True if I am a non-datelike """
return self.is_datetime or self.is_timedelta

@property
def fill_value(self):
return np.nan
@@ -2439,6 +2444,12 @@ def is_numeric_mixed_type(self):
self._consolidate_inplace()
return all([block.is_numeric for block in self.blocks])

@property
def is_datelike_mixed_type(self):
# Warning, consolidation needs to get checked upstairs
self._consolidate_inplace()
return any([block.is_datelike for block in self.blocks])

def get_block_map(self, copy=False, typ=None, columns=None,
is_numeric=False, is_bool=False):
""" return a dictionary mapping the ftype -> block list
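A rough public-dtype equivalent of the new block-level check (the real property walks consolidated blocks, hence the consolidation warning above; this helper is hypothetical, for illustration only):

import numpy as np
import pandas as pd

def is_datelike_mixed(df):
    # any datetime64/timedelta64 column marks the frame as datelike-mixed
    return any(dtype.type in (np.datetime64, np.timedelta64)
               for dtype in df.dtypes)

df = pd.DataFrame({'a': [1.0, 2.0],
                   'b': pd.date_range('1/1/2000', periods=2)})
print(is_datelike_mixed(df))   # True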
56 changes: 33 additions & 23 deletions pandas/src/reduce.pyx
@@ -35,25 +35,26 @@ cdef class Reducer:
self.chunksize = k
self.increment = k * arr.dtype.itemsize


self.f = f
self.arr = arr
self.typ = None
self.labels = labels
self.dummy, index = self._check_dummy(dummy)
self.dummy, index = self._check_dummy(dummy=dummy)

if axis == 0:
self.labels = index
self.index = labels
else:
self.labels = labels
self.index = index
self.labels = labels
self.index = index

def _check_dummy(self, dummy=None):
cdef object index

if dummy is None:
dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
index = None

# our ref is stolen later since we are creating this array
# in cython, so increment first
Py_INCREF(dummy)
else:
if dummy.dtype != self.arr.dtype:
raise ValueError('Dummy array must be same dtype')
@@ -76,39 +77,48 @@ cdef class Reducer:
ndarray arr, result, chunk
Py_ssize_t i, incr
flatiter it
object res, tchunk, name, labels, index, typ
object res, name, labels, index
object cached_typ = None

arr = self.arr
chunk = self.dummy
dummy_buf = chunk.data
chunk.data = arr.data
labels = self.labels
index = self.index
typ = self.typ
incr = self.increment

try:
for i in range(self.nresults):
# need to make sure that we pass an actual object to the function
# and not just an ndarray
if typ is not None:
try:
if labels is not None:
name = labels[i]

if labels is not None:
name = util.get_value_at(labels, i)
else:
name = None

# create the cached type on the first iteration;
# on later ones just reassign the data
if i == 0:

if self.typ is not None:

# recreate with the index if supplied
if index is not None:
tchunk = typ(chunk, index=index, name=name, fastpath=True)

cached_typ = self.typ(chunk, index=index, name=name)

else:
tchunk = typ(chunk, name=name)

except:
tchunk = chunk
typ = None
else:
tchunk = chunk
# use the passed typ, sans index
cached_typ = self.typ(chunk, name=name)

res = self.f(tchunk)
# use the cached_typ if possible
if cached_typ is not None:
cached_typ._data._block.values = chunk
cached_typ.name = name
res = self.f(cached_typ)
else:
res = self.f(chunk)

if hasattr(res,'values'):
res = res.values
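The core trick in this file, sketched in plain Python against the 0.13-era internals the diff itself relies on (cached_typ._data._block.values); an illustration, not the Cython code:

import numpy as np
import pandas as pd

values = np.random.randn(50, 1000)   # 50 "columns", each of length 1000
index = pd.Index(np.arange(1000))

# build the Series wrapper once...
cached = pd.Series(values[0].copy(), index=index)

out = np.empty(50)
for i in range(50):
    # ...then swap the underlying block data each iteration instead of
    # paying the Series constructor per column
    cached._data._block.values = values[i]
    out[i] = cached.sum()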
60 changes: 40 additions & 20 deletions pandas/tests/test_frame.py
@@ -9035,6 +9035,16 @@ def test_apply_mixed_dtype_corner(self):
expected = Series(np.nan, index=[])
assert_series_equal(result, expected)

df = DataFrame({'A': ['foo'],
'B': [1.]})
result = df.apply(lambda x: x['A'], axis=1)
expected = Series(['foo'],index=[0])
assert_series_equal(result, expected)

result = df.apply(lambda x: x['B'], axis=1)
expected = Series([1.],index=[0])
assert_series_equal(result, expected)

def test_apply_empty_infer_type(self):
no_cols = DataFrame(index=['a', 'b', 'c'])
no_index = DataFrame(columns=['a', 'b', 'c'])
@@ -9970,7 +9980,8 @@ def test_count(self):
self._check_stat_op('count', f,
has_skipna=False,
has_numeric_only=True,
check_dtypes=False)
check_dtype=False,
check_dates=True)

# corner case
frame = DataFrame()
@@ -9999,10 +10010,9 @@ def test_count(self):
def test_sum(self):
self._check_stat_op('sum', np.sum, has_numeric_only=True)

def test_sum_mixed_numeric(self):
raise nose.SkipTest("skipping for now")
# mixed types
self._check_stat_op('sum', np.sum, frame = self.mixed_float, has_numeric_only=True)
# mixed types (with upcasting happening)
self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
has_numeric_only=True, check_dtype=False, check_less_precise=True)

def test_stat_operators_attempt_obj_array(self):
data = {
@@ -10028,7 +10038,7 @@ def test_stat_operators_attempt_obj_array(self):
assert_series_equal(result, expected)

def test_mean(self):
self._check_stat_op('mean', np.mean)
self._check_stat_op('mean', np.mean, check_dates=True)

def test_product(self):
self._check_stat_op('product', np.prod)
@@ -10039,10 +10049,10 @@ def wrapper(x):
return np.nan
return np.median(x)

self._check_stat_op('median', wrapper)
self._check_stat_op('median', wrapper, check_dates=True)

def test_min(self):
self._check_stat_op('min', np.min)
self._check_stat_op('min', np.min, check_dates=True)
self._check_stat_op('min', np.min, frame=self.intframe)

def test_cummin(self):
@@ -10092,7 +10102,7 @@ def test_cummax(self):
self.assertEqual(np.shape(cummax_xs), np.shape(self.tsframe))

def test_max(self):
self._check_stat_op('max', np.max)
self._check_stat_op('max', np.max, check_dates=True)
self._check_stat_op('max', np.max, frame=self.intframe)

def test_mad(self):
@@ -10154,7 +10164,8 @@ def alt(x):
assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar'))

def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
has_numeric_only=False, check_dtypes=True):
has_numeric_only=False, check_dtype=True, check_dates=False,
check_less_precise=False):
if frame is None:
frame = self.frame
# set some NAs
@@ -10163,14 +10174,16 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,

f = getattr(frame, name)

if not ('max' in name or 'min' in name or 'count' in name):
if check_dates:
df = DataFrame({'b': date_range('1/1/2001', periods=2)})
_f = getattr(df, name)
#print(df)
self.assertFalse(len(_f()))
result = _f()
self.assert_(isinstance(result, Series))

df['a'] = lrange(len(df))
self.assert_(len(getattr(df, name)()))
result = getattr(df, name)()
self.assert_(isinstance(result, Series))
self.assert_(len(result))

if has_skipna:
def skipna_wrapper(x):
@@ -10184,21 +10197,27 @@ def wrapper(x):

result0 = f(axis=0, skipna=False)
result1 = f(axis=1, skipna=False)
assert_series_equal(result0, frame.apply(wrapper))
assert_series_equal(result0, frame.apply(wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
assert_series_equal(result1, frame.apply(wrapper, axis=1),
check_dtype=False) # HACK: win32
check_dtype=False,
check_less_precise=check_less_precise) # HACK: win32
else:
skipna_wrapper = alternative
wrapper = alternative

result0 = f(axis=0)
result1 = f(axis=1)
assert_series_equal(result0, frame.apply(skipna_wrapper))
assert_series_equal(result0, frame.apply(skipna_wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
check_dtype=False)
check_dtype=False,
check_less_precise=check_less_precise)

# check dtypes
if check_dtypes:
if check_dtype:
lcd_dtype = frame.values.dtype
self.assert_(lcd_dtype == result0.dtype)
self.assert_(lcd_dtype == result1.dtype)
@@ -10331,7 +10350,8 @@ def wrapper(x):
return np.nan
return np.median(x)

self._check_stat_op('median', wrapper, frame=self.intframe, check_dtypes=False)
self._check_stat_op('median', wrapper, frame=self.intframe,
check_dtype=False, check_dates=True)

def test_quantile(self):
from pandas.compat.scipy import scoreatpercentile
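For reference, the comparison knobs threaded through _check_stat_op map directly onto pandas.util.testing.assert_series_equal, which the tests above call; a minimal usage sketch (values chosen arbitrarily):

import pandas as pd
import pandas.util.testing as tm

left = pd.Series([1.0, 2.0], dtype='float32')
right = pd.Series([1.0000001, 2.0], dtype='float64')

# relax the dtype check and compare to fewer decimals, as the
# mixed-float sum test now does
tm.assert_series_equal(left, right, check_dtype=False,
                       check_less_precise=True)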