PERF: Cythonize Groupby.cummin/cummax (#15048)

mroeschke · mroeschke · commit 2fc025bdeb06 · 2017-01-08T22:51:09.000-08:00
pep8, removed args, kwargs

add whatsnew + test + changed logic

small error in algo

Use a more obvious test

Fixed algo & test passed

Add dtypes test

Add additional tests

handle nan case

Adapt max/min for different dtypes + tests

remove unnecessary comments
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -690,6 +690,3 @@ def time_shift(self):
     def time_transform_dataframe(self):
         # GH 12737
         self.df_nans.groupby('key').transform('first')
-
-
-
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -281,7 +281,7 @@ Performance Improvements
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
-
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
 
 
 .. _whatsnew_0200.bug_fixes:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -75,7 +75,7 @@
     'last', 'first',
     'head', 'tail', 'median',
     'mean', 'sum', 'min', 'max',
-    'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
+    'cumcount',
     'resample',
     'describe',
     'rank', 'quantile',
@@ -97,7 +97,8 @@
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
 
-_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
+_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
+                                'cummin', 'cummax'])
 
 
 def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
 
         return self._cython_transform('cumsum')
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummin(self, axis=0):
+        """Cumulative min for each group"""
+        if axis != 0:  # no Cython path off axis 0; accumulate with NumPy
+            return self.apply(lambda x: np.minimum.accumulate(x, axis))
+
+        return self._cython_transform('cummin')  # mapped to 'group_cummin'
+
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def cummax(self, axis=0):
+        """Cumulative max for each group"""
+        if axis != 0:  # no Cython path off axis 0; accumulate with NumPy
+            return self.apply(lambda x: np.maximum.accumulate(x, axis))
+
+        return self._cython_transform('cummax')  # mapped to 'group_cummax'
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
         'transform': {
             'cumprod': 'group_cumprod',
             'cumsum': 'group_cumsum',
+            'cummin': 'group_cummin',
+            'cummax': 'group_cummax',
         }
     }
 
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
@@ -568,6 +568,78 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = minx[i, j]
 
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                          ndarray[{{dest_type2}}, ndim=2] values,
+                          ndarray[int64_t] labels,
+                          ndarray[{{dest_type2}}, ndim=2] accum):
+    """
+    Cumulative min of `values` per `labels` group, written into `out`.
+    Only transforms on axis=0; rows with a negative label are skipped.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        {{dest_type2}} val, min_val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum = np.empty_like(accum)
+    accum.fill({{inf_val}})
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:
+                    # reload this cell's minimum; never reuse another cell's
+                    min_val = accum[lab, j]
+                    if val < min_val:
+                        accum[lab, j] = min_val = val
+                    out[i, j] = min_val
+                else:  # val is NaN (val != val)
+                    out[i, j] = {{nan_val}}
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                          ndarray[{{dest_type2}}, ndim=2] values,
+                          ndarray[int64_t] labels,
+                          ndarray[{{dest_type2}}, ndim=2] accum):
+    """
+    Cumulative max of `values` per `labels` group, written into `out`.
+    Only transforms on axis=0; rows with a negative label are skipped.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        {{dest_type2}} val, max_val
+        int64_t lab
+
+    N, K = (<object> values).shape
+    accum = np.empty_like(accum)
+    accum.fill(-{{inf_val}})
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:
+                    # reload this cell's maximum; never reuse another cell's
+                    max_val = accum[lab, j]
+                    if val > max_val:
+                        accum[lab, j] = max_val = val
+                    out[i, j] = max_val
+                else:  # val is NaN (val != val)
+                    out[i, j] = {{nan_val}}
+
 {{endfor}}
 
 #----------------------------------------------------------------------
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
             'max',
             'head',
             'tail',
-            'cumsum',
-            'cumprod',
-            'cummin',
-            'cummax',
             'cumcount',
             'resample',
             'describe',
@@ -5777,6 +5769,81 @@ def test_agg_over_numpy_arrays(self):
 
         assert_frame_equal(result, expected)
 
+    def test_cummin_cummax(self):
+        # GH 15048 -- NOTE(review): only contiguous group keys are covered
+        # Each dtype: check groupby result against a literal and apply()
+        for dtype in [np.int32, np.float32, np.float64]:
+            df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
+                               'B': range(3, 9)}).astype(dtype)
+            result = df.groupby('A').cummin()
+            expected = pd.DataFrame({'B': [3, 3, 3, 6, 6, 6]}).astype(dtype)
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+            tm.assert_frame_equal(result, expected)
+            # Put the dtype's minimum mid-group; it must carry forward
+            try:
+                min_val = np.iinfo(dtype).min
+            except ValueError:
+                min_val = np.finfo(dtype).min
+            df.loc[[1, 4], 'B'] = min_val
+            result = df.groupby('A').cummin()
+            expected = pd.DataFrame({'B': [3, min_val, min_val,
+                                           6, min_val, min_val]}).astype(dtype)
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+            df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
+                               'B': range(8, 2, -1)}).astype(dtype)
+            result = df.groupby('A').cummax()
+            expected = pd.DataFrame({'B': [8, 8, 8, 5, 5, 5]}).astype(dtype)
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+            tm.assert_frame_equal(result, expected)
+            # Put the dtype's maximum mid-group; it must carry forward
+            try:
+                max_val = np.iinfo(dtype).max
+            except ValueError:
+                max_val = np.finfo(dtype).max
+            df.loc[[1, 4], 'B'] = max_val
+            result = df.groupby('A').cummax()
+            expected = pd.DataFrame({'B': [8, max_val, max_val,
+                                           5, max_val, max_val]}).astype(dtype)
+            tm.assert_frame_equal(result, expected)
+            expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+            tm.assert_frame_equal(result, expected)
+
+        # Some entries are NaN: output is NaN there, running value resumes
+        df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
+                           'B': range(3, 9)})
+        df.loc[[1, 4], 'B'] = np.nan
+        result = df.groupby('A').cummin()
+        expected = pd.DataFrame({'B': [3, np.nan, 3, 6, np.nan, 6]})
+        tm.assert_frame_equal(result, expected)
+        expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+        tm.assert_frame_equal(result, expected)
+
+        df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
+                           'B': range(8, 2, -1)})
+        df.loc[[1, 4], 'B'] = np.nan
+        result = df.groupby('A').cummax()
+        expected = pd.DataFrame({'B': [8, np.nan, 8, 5, np.nan, 5]})
+        tm.assert_frame_equal(result, expected)
+        expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+        tm.assert_frame_equal(result, expected)
+
+        # Entire column is NaN: every output row is NaN for both transforms
+        df['B'] = np.nan
+        expected = pd.DataFrame({'B': [np.nan] * 6})
+        result = df.groupby('A').cummin()
+        tm.assert_frame_equal(expected, result)
+        result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+        tm.assert_frame_equal(expected, result)
+        result = df.groupby('A').cummax()
+        tm.assert_frame_equal(expected, result)
+        result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+        tm.assert_frame_equal(expected, result)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()