PERF: Cythonize Groupby.cummin/cummax (#15048)

mroeschke · jreback · commit 0fe491db3582 · 2017-01-11T08:18:33.000-05:00
closes #15048 Author: Matt Roeschke <emailformattr@gmail.com> Closes #15053 from mroeschke/improve_cummin_cummax and squashes the following commits: 5e8ba63 [Matt Roeschke] PERF: Cythonize Groupby.cummin/cummax (#15048)
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -690,6 +690,3 @@ def time_shift(self):
     def time_transform_dataframe(self):
         # GH 12737
         self.df_nans.groupby('key').transform('first')
-
-
-
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -283,7 +283,7 @@ Performance Improvements
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
-
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -75,7 +75,7 @@
     'last', 'first',
     'head', 'tail', 'median',
     'mean', 'sum', 'min', 'max',
-    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
+    'cumcount',
     'resample',
     'describe',
     'rank', 'quantile',
@@ -97,7 +97,8 @@
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
 
-_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
+_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
+                                'cummin', 'cummax'])
 
 
 def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
 
         return self._cython_transform('cumsum')
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummin(self, axis=0):
+        """Cumulative min for each group"""
+        if axis != 0:
+            return self.apply(lambda x: np.minimum.accumulate(x, axis))
+
+        return self._cython_transform('cummin')
+
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummax(self, axis=0):
+        """Cumulative max for each group"""
+        if axis != 0:
+            return self.apply(lambda x: np.maximum.accumulate(x, axis))
+
+        return self._cython_transform('cummax')
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
         'transform': {
             'cumprod': 'group_cumprod',
             'cumsum': 'group_cumsum',
+            'cummin': 'group_cummin',
+            'cummax': 'group_cummax',
         }
     }
 
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
@@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = minx[i, j]
 
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                          ndarray[{{dest_type2}}, ndim=2] values,
+                          ndarray[int64_t] labels,
+                          ndarray[{{dest_type2}}, ndim=2] accum):
+    """
+    Only transforms on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        {{dest_type2}} val, min_val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum.fill({{inf_val}})
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:
+                    if val < accum[lab, j]:
+                        min_val = val
+                    accum[lab, j] = min_val
+                    out[i, j] = accum[lab, j]
+                # val = nan
+                else:
+                    out[i, j] = {{nan_val}}
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                          ndarray[{{dest_type2}}, ndim=2] values,
+                          ndarray[int64_t] labels,
+                          ndarray[{{dest_type2}}, ndim=2] accum):
+    """
+    Only transforms on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        {{dest_type2}} val, max_val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum.fill(-{{inf_val}})
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:
+                    if val > accum[lab, j]:
+                        max_val = val
+                    accum[lab, j] = max_val
+                    out[i, j] = accum[lab, j]
+                # val = nan
+                else:
+                    out[i, j] = {{nan_val}}
+
 {{endfor}}
 
 #----------------------------------------------------------------------
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self):
 
         assert_frame_equal(result, expected)
 
+    def test_cummin_cummax(self):
+        # GH 15048
+        num_types = [np.int32, np.int64, np.float32, np.float64]
+        num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
+                    np.finfo(np.float32).min, np.finfo(np.float64).min]
+        num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
+                   np.finfo(np.float32).max, np.finfo(np.float64).max]
+        base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
+                                'B': [3, 4, 3, 2, 2, 3, 2, 1]})
+        expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+        expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+        for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
+            df = base_df.astype(dtype)
+
+            # cummin
+            expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
+            result = df.groupby('A').cummin()
+            tm.assert_frame_equal(result, expected)
+            result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+            # Test cummin w/ min value for dtype
+            df.loc[[2, 6], 'B'] = min_val
+            expected.loc[[2, 3, 6, 7], 'B'] = min_val
+            result = df.groupby('A').cummin()
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+            # cummax
+            expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
+            result = df.groupby('A').cummax()
+            tm.assert_frame_equal(result, expected)
+            result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+            # Test cummax w/ max value for dtype
+            df.loc[[2, 6], 'B'] = max_val
+            expected.loc[[2, 3, 6, 7], 'B'] = max_val
+            result = df.groupby('A').cummax()
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+        # Test nan in some values
+        base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
+        expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
+                                       np.nan, 3, np.nan, 1]})
+        result = base_df.groupby('A').cummin()
+        tm.assert_frame_equal(result, expected)
+        expected = (base_df.groupby('A')
+                           .B
+                           .apply(lambda x: x.cummin())
+                           .to_frame())
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
+                                       np.nan, 3, np.nan, 3]})
+        result = base_df.groupby('A').cummax()
+        tm.assert_frame_equal(result, expected)
+        expected = (base_df.groupby('A')
+                           .B
+                           .apply(lambda x: x.cummax())
+                           .to_frame())
+        tm.assert_frame_equal(result, expected)
+
+        # Test nan in entire column
+        base_df['B'] = np.nan
+        expected = pd.DataFrame({'B': [np.nan] * 8})
+        result = base_df.groupby('A').cummin()
+        tm.assert_frame_equal(expected, result)
+        result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+        tm.assert_frame_equal(expected, result)
+        result = base_df.groupby('A').cummax()
+        tm.assert_frame_equal(expected, result)
+        result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+        tm.assert_frame_equal(expected, result)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()