PERF: Cythonize Groupby.cummin/cummax (#15048)

mroeschke · mroeschke · commit f0d1b24f4bd1 · 2017-01-07T01:02:39.000-08:00
pep8, removed args, kwargs

add whatsnew + test + changed logic

small error in algo

Use a more obvious test

Fixed algo & test passed
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -690,6 +690,3 @@ def time_shift(self):
     def time_transform_dataframe(self):
         # GH 12737
         self.df_nans.groupby('key').transform('first')
-
-
-
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -281,7 +281,7 @@ Performance Improvements
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
-
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
 
 
 .. _whatsnew_0200.bug_fixes:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -75,7 +75,7 @@
     'last', 'first',
     'head', 'tail', 'median',
     'mean', 'sum', 'min', 'max',
-    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
+    'cumcount',
     'resample',
     'describe',
     'rank', 'quantile',
@@ -97,7 +97,8 @@
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
 
-_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
+_cython_transforms = frozenset(['cummax', 'cummin', 'cumprod',
+                                'cumsum', 'shift'])
 
 
 def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
 
         return self._cython_transform('cumsum')
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummin(self, axis=0):
+        """Cumulative min for each group"""
+        if axis == 0:
+            return self._cython_transform('cummin')
+        # the Cython kernel only handles axis=0; otherwise go through apply
+        return self.apply(lambda x: np.minimum.accumulate(x, axis))
+
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummax(self, axis=0):
+        """Cumulative max for each group"""
+        if axis == 0:
+            return self._cython_transform('cummax')
+        # the Cython kernel only handles axis=0; otherwise go through apply
+        return self.apply(lambda x: np.maximum.accumulate(x, axis))
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
         'transform': {
             'cumprod': 'group_cumprod',
             'cumsum': 'group_cumsum',
+            'cummin': 'group_cummin',
+            'cummax': 'group_cummax',
         }
     }
 
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
@@ -667,6 +667,72 @@ def group_cumsum(numeric[:, :] out,
                     out[i, j] = accum[lab, j]
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummin(numeric[:, :] out,
+                 numeric[:, :] values,
+                 int64_t[:] labels,
+                 numeric[:, :] accum):
+    """
+    Only transforms on axis=0: per-group running min of each column.
+    Rows with a negative label and NaN values are skipped (out untouched).
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        numeric val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum = np.empty_like(accum)
+    accum[:] = _int64_max  # NOTE(review): int64 sentinel even for floats
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN != NaN, so this skips missing values
+                    # update only this (group, column) running min
+                    if val < accum[lab, j]:
+                        accum[lab, j] = val
+                    out[i, j] = accum[lab, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummax(numeric[:, :] out,
+                 numeric[:, :] values,
+                 int64_t[:] labels,
+                 numeric[:, :] accum):
+    """
+    Only transforms on axis=0: per-group running max of each column.
+    Rows with a negative label and NaN values are skipped (out untouched).
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        numeric val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum = np.empty_like(accum)
+    accum[:] = -_int64_max  # NOTE(review): int64 sentinel even for floats
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN != NaN, so this skips missing values
+                    # update only this (group, column) running max
+                    if val > accum[lab, j]:
+                        accum[lab, j] = val
+                    out[i, j] = accum[lab, j]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5777,6 +5769,19 @@ def test_agg_over_numpy_arrays(self):
 
         assert_frame_equal(result, expected)
 
+    def test_cummin_cummax(self):
+        # NOTE(review): contiguous groups + monotonic values only
+        keys = [1, 1, 1, 2, 2, 2]
+        df = pd.DataFrame({'A': keys, 'B': range(3, 9)})
+        result = df.groupby('A').cummin()
+        expected = pd.DataFrame({'B': [3, 3, 3, 6, 6, 6]})
+        tm.assert_frame_equal(result, expected)
+
+        df = pd.DataFrame({'A': keys, 'B': range(8, 2, -1)})
+        result = df.groupby('A').cummax()
+        expected = pd.DataFrame({'B': [8, 8, 8, 5, 5, 5]})
+        tm.assert_frame_equal(result, expected)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()