BUG: Incorrect value updating for groupby.cummin/max (#15635)

mroeschke · jreback · commit 4ce9c0c9b9ef · 2017-03-10T06:33:22.000-05:00
Closes #15635. Author: Matt Roeschke <emailformattr@gmail.com>. Closes #15642 from mroeschke/fix_15635 and squashes the following commit: b92b81a [Matt Roeschke] BUG: Incorrect value updating for groupby.cummin/max (#15635)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -716,7 +716,7 @@ Performance Improvements
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``x_compat=True``) (:issue:`15073`).
-- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`)
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`, :issue:`15561`, :issue:`15635`)
 - Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
 - Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
diff --git a/pandas/_libs/algos_groupby_helper.pxi.in b/pandas/_libs/algos_groupby_helper.pxi.in
@@ -603,7 +603,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, min_val = 0
+        {{dest_type2}} val, mval
         ndarray[{{dest_type2}}, ndim=2] accum
         int64_t lab
 
@@ -628,10 +628,10 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 {{else}}
                 if val == val:
                 {{endif}}
-                    if val < accum[lab, j]:
-                        min_val = val
-                    accum[lab, j] = min_val
-                    out[i, j] = accum[lab, j]
+                    mval = accum[lab, j]
+                    if val < mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval
 
 
 @cython.boundscheck(False)
@@ -645,7 +645,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, max_val = 0
+        {{dest_type2}} val, mval
         ndarray[{{dest_type2}}, ndim=2] accum
         int64_t lab
 
@@ -669,10 +669,10 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 {{else}}
                 if val == val:
                 {{endif}}
-                    if val > accum[lab, j]:
-                        max_val = val
-                    accum[lab, j] = max_val
-                    out[i, j] = accum[lab, j]
+                    mval = accum[lab, j]
+                    if val > mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval
 
 {{endfor}}
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -4303,6 +4303,17 @@ def test_cummin_cummax(self):
             result = getattr(df.groupby('a')['b'], method)()
             tm.assert_series_equal(expected, result)
 
+        # GH 15635
+        df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
+        result = df.groupby('a').b.cummax()
+        expected = pd.Series([2, 1, 2], name='b')
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
+        result = df.groupby('a').b.cummin()
+        expected = pd.Series([1, 2, 1], name='b')
+        tm.assert_series_equal(result, expected)
+
 
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)