
Commit dae8645

abull authored and Adam committed

BUG: errors and segfaults in groupby cython transforms (#16771)

1 parent 872a23b commit dae8645

File tree: 5 files changed, +36 -14 lines changed

  doc/source/whatsnew/v0.25.0.rst
  pandas/_libs/groupby.pyx
  pandas/_libs/groupby_helper.pxi.in
  pandas/core/groupby/ops.py
  pandas/tests/groupby/test_transform.py

doc/source/whatsnew/v0.25.0.rst (+1)

@@ -388,6 +388,7 @@ Groupby/Resample/Rolling
 - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
 - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
 - Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :func:`cumsum`, :func:`cumprod`, :func:`cummin` and :func:`cummax` on :meth:`DataFrame.groupby` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
 
 Reshaping
 ^^^^^^^^^
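
A minimal reproduction of the entry above, mirroring the new test added in this commit; the pre-fix behavior (wrong results or a segfault) is taken from the bug report rather than re-verified here:

    import pandas as pd

    # categorical grouping key with an unused category "b", so there are more
    # groups (2) than rows (1)
    df = pd.DataFrame({
        "x": pd.Categorical(["a"], categories=["a", "b"]),
        "y": [1],
    })

    # before this fix the cython cumulative transforms could read and write out
    # of bounds for such data; with the fix this returns df["y"] unchanged
    result = df["y"].groupby(df["x"]).cumsum()
    print(result)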

pandas/_libs/groupby.pyx (+4 -2)

@@ -143,6 +143,7 @@ def group_cumprod_float64(float64_t[:, :] out,
                           const float64_t[:, :] values,
                           const int64_t[:] labels,
                           bint is_datetimelike,
+                          int ngroups,
                           bint skipna=True):
     """
     Only transforms on axis=0
@@ -154,7 +155,7 @@ def group_cumprod_float64(float64_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.ones_like(values)
+    accum = np.ones((ngroups, K), np.float64)
 
     with nogil:
         for i in range(N):
@@ -180,6 +181,7 @@ def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  const int64_t[:] labels,
                  is_datetimelike,
+                 int ngroups,
                  bint skipna=True):
     """
     Only transforms on axis=0
@@ -191,7 +193,7 @@ def group_cumsum(numeric[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.zeros_like(values)
+    accum = np.zeros((ngroups, K), np.array(values).dtype)
 
     with nogil:
         for i in range(N):
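
The heart of the fix is visible in both hunks: the per-group accumulator is now sized by the number of groups rather than by the number of rows, because group labels index into it and a categorical key with absent categories can yield more groups than rows. A minimal pure-Python sketch of the accumulator pattern (illustrative only; the real cython kernel runs under nogil and also handles skipna):

    import numpy as np

    def group_cumsum_sketch(values, labels, ngroups):
        # values: (N, K) float array; labels: group id per row; ngroups: group count
        N, K = values.shape
        out = np.empty_like(values)
        # sized by ngroups, not by N -- with np.zeros_like(values) a label >= N
        # would index past the end of the accumulator
        accum = np.zeros((ngroups, K), dtype=values.dtype)
        for i in range(N):
            lab = labels[i]
            if lab < 0:          # rows that belong to no group are skipped
                out[i] = np.nan
                continue
            accum[lab] += values[i]
            out[i] = accum[lab]
        return out

    # two rows that both fall into group 2 out of three groups
    vals = np.array([[1.0], [2.0]])
    labs = np.array([2, 2], dtype=np.int64)
    print(group_cumsum_sketch(vals, labs, ngroups=3))  # [[1.] [3.]]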

pandas/_libs/groupby_helper.pxi.in (+6 -4)

@@ -474,7 +474,8 @@ def group_min(groupby_t[:, :] out,
 def group_cummin(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 int ngroups):
     """
     Only transforms on axis=0
     """
@@ -485,7 +486,7 @@ def group_cummin(groupby_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.array(values).dtype)
     if groupby_t is int64_t:
         accum[:] = _int64_max
     else:
@@ -522,7 +523,8 @@ def group_cummin(groupby_t[:, :] out,
 def group_cummax(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
-                 bint is_datetimelike):
+                 bint is_datetimelike,
+                 int ngroups):
     """
     Only transforms on axis=0
     """
@@ -533,7 +535,7 @@ def group_cummax(groupby_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.array(values).dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max
     else:
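
group_cummin and group_cummax follow the same accumulator pattern, except that each group's slot is seeded with a sentinel before any row is seen (the dtype's maximum for cummin, minimum for cummax, as the _int64_max seeding above shows). A float-only sketch of that idea, under the simplifying assumption of no missing values; the actual template also covers integer and datetime-like dtypes:

    import numpy as np

    def group_cummin_sketch(values, labels, ngroups):
        N, K = values.shape
        out = np.empty_like(values)
        # one accumulator row per group, seeded "above" every possible value
        accum = np.full((ngroups, K), np.inf, dtype=values.dtype)
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                out[i] = np.nan
                continue
            accum[lab] = np.minimum(accum[lab], values[i])
            out[i] = accum[lab]
        return out

    vals = np.array([[3.0], [1.0], [2.0]])
    labs = np.array([0, 0, 0], dtype=np.int64)
    print(group_cummin_sketch(vals, labs, ngroups=1))  # [[3.] [1.] [1.]]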

pandas/core/groupby/ops.py (+4 -3)

@@ -361,7 +361,7 @@ def get_group_levels(self):
             'cummax': 'group_cummax',
             'rank': {
                 'name': 'group_rank',
-                'f': lambda func, a, b, c, d, **kwargs: func(
+                'f': lambda func, a, b, c, d, e, **kwargs: func(
                     a, b, c, d,
                     kwargs.get('ties_method', 'average'),
                     kwargs.get('ascending', True),
@@ -600,9 +600,10 @@ def _transform(self, result, values, comp_ids, transform_func,
             for i, chunk in enumerate(values.transpose(2, 0, 1)):
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, is_datetimelike, **kwargs)
+                               comp_ids, is_datetimelike, ngroups, **kwargs)
         else:
-            transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
+            transform_func(result, values, comp_ids, is_datetimelike, ngroups,
+                           **kwargs)
 
         return result
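
The two hunks belong together: _transform now passes ngroups positionally to every transform kernel, so the 'rank' wrapper must accept the extra argument (e) even though group_rank itself did not change. A hypothetical stand-alone sketch of that adapter pattern (names here are illustrative, not from pandas):

    # stand-in for a kernel that, like group_rank, does not take ngroups
    def rank_kernel(out, values, labels, is_datetimelike, ties_method, ascending):
        print(ties_method, ascending)

    # the wrapper accepts five positional arguments (the fifth, e, is ngroups)
    # but forwards only the four the kernel expects
    wrap = lambda func, a, b, c, d, e, **kwargs: func(
        a, b, c, d,
        kwargs.get('ties_method', 'average'),
        kwargs.get('ascending', True))

    wrap(rank_kernel, None, None, None, False, 3)  # prints: average True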

pandas/tests/groupby/test_transform.py (+21 -5)

@@ -9,7 +9,8 @@
 from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
 
 import pandas as pd
-from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
+from pandas import (
+    Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range)
 from pandas.core.groupby.groupby import DataError
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -470,7 +471,8 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
     ans = np.zeros_like(data)
 
     labels = np.array([0, 0, 0, 0], dtype=np.int64)
-    pd_op(ans, data, labels, is_datetimelike)
+    ngroups = 1
+    pd_op(ans, data, labels, is_datetimelike, ngroups)
 
     tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                 check_dtype=False)
@@ -496,17 +498,19 @@ def test_cython_group_transform_algos():
 
     # with nans
     labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+    ngroups = 1
 
     data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike,
+                                  ngroups)
     expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumsum(actual, data, labels, is_datetimelike)
+    groupby.group_cumsum(actual, data, labels, is_datetimelike, ngroups)
     expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
@@ -515,7 +519,7 @@ def test_cython_group_transform_algos():
     data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
     actual = np.zeros_like(data, dtype='int64')
     groupby.group_cumsum(actual, data.view('int64'), labels,
-                         is_datetimelike)
+                         is_datetimelike, ngroups)
     expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
         2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
         np.timedelta64(5, 'ns')])
@@ -863,3 +867,15 @@ def test_groupby_transform_with_datetimes(func, values):
                       index=dates, name="price")
 
     tm.assert_series_equal(result, expected)
+
+
+def test_transform_absent_categories():
+    # GH 16771
+    # cython transforms with more groups than rows
+    x_vals = [1]
+    x_cats = range(2)
+    y = [1]
+    df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
+    result = df.y.groupby(df.x).cumsum()
+    expected = df.y
+    assert_series_equal(result, expected)
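
The new regression test can be run on its own with pytest; the invocation below is a typical usage example, not taken from the commit:

    pytest pandas/tests/groupby/test_transform.py::test_transform_absent_categories -v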
