
Commit dae8645

abull authored and Adam committed

BUG: errors and segfaults in groupby cython transforms (#16771)

1 parent 872a23b commit dae8645

File tree: 5 files changed, +36 -14 lines changed

  doc/source/whatsnew/v0.25.0.rst
  pandas/_libs/groupby.pyx
  pandas/_libs/groupby_helper.pxi.in
  pandas/core/groupby/ops.py
  pandas/tests/groupby/test_transform.py

doc/source/whatsnew/v0.25.0.rst (+1)

@@ -388,6 +388,7 @@ Groupby/Resample/Rolling
 - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
 - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
 - Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :func:`cumsum`, :func:`cumprod`, :func:`cummin` and :func:`cummax` on :meth:`DataFrame.groupby` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
 
 Reshaping
 ^^^^^^^^^
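
A minimal reproduction of the entry above, mirroring the new test added in this commit; the pre-fix behavior (wrong results or a segfault) is taken from the bug report rather than re-verified here:

    import pandas as pd

    # categorical grouping key with an unused category "b", so there are more
    # groups (2) than rows (1)
    df = pd.DataFrame({
        "x": pd.Categorical(["a"], categories=["a", "b"]),
        "y": [1],
    })

    # before this fix the cython cumulative transforms could read and write out
    # of bounds for such data; with the fix this returns df["y"] unchanged
    result = df["y"].groupby(df["x"]).cumsum()
    print(result)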

pandas/_libs/groupby.pyx (+4 -2)

@@ -143,6 +143,7 @@ def group_cumprod_float64(float64_t[:, :] out,
                           const float64_t[:, :] values,
                           const int64_t[:] labels,
                           bint is_datetimelike,
+                          int ngroups,
                           bint skipna=True):
     """
     Only transforms on axis=0
@@ -154,7 +155,7 @@ def group_cumprod_float64(float64_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.ones_like(values)
+    accum = np.ones((ngroups, K), np.float64)
 
     with nogil:
         for i in range(N):
@@ -180,6 +181,7 @@ def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  const int64_t[:] labels,
                  is_datetimelike,
+                 int ngroups,
                  bint skipna=True):
     """
     Only transforms on axis=0
@@ -191,7 +193,7 @@ def group_cumsum(numeric[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.zeros_like(values)
+    accum = np.zeros((ngroups, K), np.array(values).dtype)
 
     with nogil:
         for i in range(N):
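
The heart of the fix is visible in both hunks: the per-group accumulator is now sized by the number of groups rather than by the number of rows, because group labels index into it and a categorical key with absent categories can yield more groups than rows. A minimal pure-Python sketch of the accumulator pattern (illustrative only; the real cython kernel runs under nogil and also handles skipna):

    import numpy as np

    def group_cumsum_sketch(values, labels, ngroups):
        # values: (N, K) float array; labels: group id per row; ngroups: group count
        N, K = values.shape
        out = np.empty_like(values)
        # sized by ngroups, not by N -- with np.zeros_like(values) a label >= N
        # would index past the end of the accumulator
        accum = np.zeros((ngroups, K), dtype=values.dtype)
        for i in range(N):
            lab = labels[i]
            if lab < 0:          # rows that belong to no group are skipped
                out[i] = np.nan
                continue
            accum[lab] += values[i]
            out[i] = accum[lab]
        return out

    # two rows that both fall into group 2 out of three groups
    vals = np.array([[1.0], [2.0]])
    labs = np.array([2, 2], dtype=np.int64)
    print(group_cumsum_sketch(vals, labs, ngroups=3))  # [[1.] [3.]]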

pandas/_libs/groupby_helper.pxi.in (+6 -4)

@@ -474,7 +474,8 @@ def group_min(groupby_t[:, :] out,
 def group_cummin(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 int ngroups):
     """
     Only transforms on axis=0
     """
@@ -485,7 +486,7 @@ def group_cummin(groupby_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.array(values).dtype)
     if groupby_t is int64_t:
         accum[:] = _int64_max
     else:
@@ -522,7 +523,8 @@ def group_cummin(groupby_t[:, :] out,
 def group_cummax(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 int ngroups):
     """
     Only transforms on axis=0
     """
@@ -533,7 +535,7 @@ def group_cummax(groupby_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.array(values).dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max
     else:
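
group_cummin and group_cummax follow the same accumulator pattern, except that each group's slot is seeded with a sentinel before any row is seen (the dtype's maximum for cummin, minimum for cummax, as the _int64_max seeding above shows). A float-only sketch of that idea, under the simplifying assumption of no missing values; the actual template also covers integer and datetime-like dtypes:

    import numpy as np

    def group_cummin_sketch(values, labels, ngroups):
        N, K = values.shape
        out = np.empty_like(values)
        # one accumulator row per group, seeded "above" every possible value
        accum = np.full((ngroups, K), np.inf, dtype=values.dtype)
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                out[i] = np.nan
                continue
            accum[lab] = np.minimum(accum[lab], values[i])
            out[i] = accum[lab]
        return out

    vals = np.array([[3.0], [1.0], [2.0]])
    labs = np.array([0, 0, 0], dtype=np.int64)
    print(group_cummin_sketch(vals, labs, ngroups=1))  # [[3.] [1.] [1.]]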

pandas/core/groupby/ops.py (+4 -3)

@@ -361,7 +361,7 @@ def get_group_levels(self):
             'cummax': 'group_cummax',
             'rank': {
                 'name': 'group_rank',
-                'f': lambda func, a, b, c, d, **kwargs: func(
+                'f': lambda func, a, b, c, d, e, **kwargs: func(
                     a, b, c, d,
                     kwargs.get('ties_method', 'average'),
                     kwargs.get('ascending', True),
@@ -600,9 +600,10 @@ def _transform(self, result, values, comp_ids, transform_func,
             for i, chunk in enumerate(values.transpose(2, 0, 1)):
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, is_datetimelike, **kwargs)
+                               comp_ids, is_datetimelike, ngroups, **kwargs)
         else:
-            transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
+            transform_func(result, values, comp_ids, is_datetimelike, ngroups,
+                           **kwargs)
 
         return result
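
The two hunks belong together: _transform now passes ngroups positionally to every transform kernel, so the 'rank' wrapper must accept the extra argument (e) even though group_rank itself did not change. A hypothetical stand-alone sketch of that adapter pattern (names here are illustrative, not from pandas):

    # stand-in for a kernel that, like group_rank, does not take ngroups
    def rank_kernel(out, values, labels, is_datetimelike, ties_method, ascending):
        print(ties_method, ascending)

    # the wrapper accepts five positional arguments (the fifth, e, is ngroups)
    # but forwards only the four the kernel expects
    wrap = lambda func, a, b, c, d, e, **kwargs: func(
        a, b, c, d,
        kwargs.get('ties_method', 'average'),
        kwargs.get('ascending', True))

    wrap(rank_kernel, None, None, None, False, 3)  # prints: average True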

pandas/tests/groupby/test_transform.py (+21 -5)

@@ -9,7 +9,8 @@
 from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
 
 import pandas as pd
-from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
+from pandas import (
+    Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range)
 from pandas.core.groupby.groupby import DataError
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -470,7 +471,8 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
     ans = np.zeros_like(data)
 
     labels = np.array([0, 0, 0, 0], dtype=np.int64)
-    pd_op(ans, data, labels, is_datetimelike)
+    ngroups = 1
+    pd_op(ans, data, labels, is_datetimelike, ngroups)
 
     tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                 check_dtype=False)
@@ -496,17 +498,19 @@ def test_cython_group_transform_algos():
 
     # with nans
     labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+    ngroups = 1
 
     data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike,
+                                  ngroups)
     expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumsum(actual, data, labels, is_datetimelike)
+    groupby.group_cumsum(actual, data, labels, is_datetimelike, ngroups)
     expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
@@ -515,7 +519,7 @@ def test_cython_group_transform_algos():
     data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
     actual = np.zeros_like(data, dtype='int64')
     groupby.group_cumsum(actual, data.view('int64'), labels,
-                         is_datetimelike)
+                         is_datetimelike, ngroups)
     expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
         2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
         np.timedelta64(5, 'ns')])
@@ -863,3 +867,15 @@ def test_groupby_transform_with_datetimes(func, values):
                       index=dates, name="price")
 
     tm.assert_series_equal(result, expected)
+
+
+def test_transform_absent_categories():
+    # GH 16771
+    # cython transforms with more groups than rows
+    x_vals = [1]
+    x_cats = range(2)
+    y = [1]
+    df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
+    result = df.y.groupby(df.x).cumsum()
+    expected = df.y
+    assert_series_equal(result, expected)
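
The new regression test can be run on its own with pytest; the invocation below is a typical usage example, not taken from the commit:

    pytest pandas/tests/groupby/test_transform.py::test_transform_absent_categories -v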
