Skip to content

Commit c8ca6ad

Browse files
abull authored and Adam committed
BUG: errors and segfaults in groupby cython transforms (pandas-dev#16771)
1 parent 872a23b commit c8ca6ad

File tree

5 files changed

+117
-22
lines changed

5 files changed

+117
-22
lines changed

doc/source/whatsnew/v0.25.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,8 @@ Groupby/Resample/Rolling
387387
- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
388388
- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
389389
- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
390-
- Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
390+
- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
391+
- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
391392

392393
Reshaping
393394
^^^^^^^^^

pandas/_libs/groupby.pyx

+46-6
Original file line numberDiff line numberDiff line change
@@ -142,19 +142,39 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
142142
def group_cumprod_float64(float64_t[:, :] out,
143143
const float64_t[:, :] values,
144144
const int64_t[:] labels,
145+
int ngroups,
145146
bint is_datetimelike,
146147
bint skipna=True):
148+
"""Cumulative product of columns of `values`, in row groups `labels`.
149+
150+
Parameters
151+
----------
152+
out : float64 array
153+
Array to store cumprod in.
154+
values : float64 array
155+
Values to take cumprod of.
156+
labels : int64 array
157+
Labels to group by.
158+
ngroups : int
159+
Number of groups, larger than all entries of `labels`.
160+
is_datetimelike : bool
161+
Always false, `values` is never datetime-like.
162+
skipna : bool
163+
If true, ignore nans in `values`.
164+
165+
Notes
166+
-----
167+
This method modifies the `out` parameter, rather than returning an object.
147168
"""
148-
Only transforms on axis=0
149-
"""
169+
150170
cdef:
151171
Py_ssize_t i, j, N, K, size
152172
float64_t val
153173
float64_t[:, :] accum
154174
int64_t lab
155175

156176
N, K = (<object>values).shape
157-
accum = np.ones_like(values)
177+
accum = np.ones((ngroups, K), np.float64)
158178

159179
with nogil:
160180
for i in range(N):
@@ -179,19 +199,39 @@ def group_cumprod_float64(float64_t[:, :] out,
179199
def group_cumsum(numeric[:, :] out,
180200
numeric[:, :] values,
181201
const int64_t[:] labels,
202+
int ngroups,
182203
is_datetimelike,
183204
bint skipna=True):
205+
"""Cumulative sum of columns of `values`, in row groups `labels`.
206+
207+
Parameters
208+
----------
209+
out : array
210+
Array to store cumsum in.
211+
values : array
212+
Values to take cumsum of.
213+
labels : int64 array
214+
Labels to group by.
215+
ngroups : int
216+
Number of groups, larger than all entries of `labels`.
217+
is_datetimelike : bool
218+
True if `values` contains datetime-like entries.
219+
skipna : bool
220+
If true, ignore nans in `values`.
221+
222+
Notes
223+
-----
224+
This method modifies the `out` parameter, rather than returning an object.
184225
"""
185-
Only transforms on axis=0
186-
"""
226+
187227
cdef:
188228
Py_ssize_t i, j, N, K, size
189229
numeric val
190230
numeric[:, :] accum
191231
int64_t lab
192232

193233
N, K = (<object>values).shape
194-
accum = np.zeros_like(values)
234+
accum = np.zeros((ngroups, K), np.asarray(values).dtype)
195235

196236
with nogil:
197237
for i in range(N):

pandas/_libs/groupby_helper.pxi.in

+42-6
Original file line numberDiff line numberDiff line change
@@ -474,18 +474,36 @@ def group_min(groupby_t[:, :] out,
474474
def group_cummin(groupby_t[:, :] out,
475475
groupby_t[:, :] values,
476476
const int64_t[:] labels,
477+
int ngroups,
477478
bint is_datetimelike):
479+
"""Cumulative minimum of columns of `values`, in row groups `labels`.
480+
481+
Parameters
482+
----------
483+
out : array
484+
Array to store cummin in.
485+
values : array
486+
Values to take cummin of.
487+
labels : int64 array
488+
Labels to group by.
489+
ngroups : int
490+
Number of groups, larger than all entries of `labels`.
491+
is_datetimelike : bool
492+
True if `values` contains datetime-like entries.
493+
494+
Notes
495+
-----
496+
This method modifies the `out` parameter, rather than returning an object.
478497
"""
479-
Only transforms on axis=0
480-
"""
498+
481499
cdef:
482500
Py_ssize_t i, j, N, K, size
483501
groupby_t val, mval
484502
ndarray[groupby_t, ndim=2] accum
485503
int64_t lab
486504

487505
N, K = (<object>values).shape
488-
accum = np.empty_like(values)
506+
accum = np.empty((ngroups, K), np.asarray(values).dtype)
489507
if groupby_t is int64_t:
490508
accum[:] = _int64_max
491509
else:
@@ -522,18 +540,36 @@ def group_cummin(groupby_t[:, :] out,
522540
def group_cummax(groupby_t[:, :] out,
523541
groupby_t[:, :] values,
524542
const int64_t[:] labels,
543+
int ngroups,
525544
bint is_datetimelike):
545+
"""Cumulative maximum of columns of `values`, in row groups `labels`.
546+
547+
Parameters
548+
----------
549+
out : array
550+
Array to store cummax in.
551+
values : array
552+
Values to take cummax of.
553+
labels : int64 array
554+
Labels to group by.
555+
ngroups : int
556+
Number of groups, larger than all entries of `labels`.
557+
is_datetimelike : bool
558+
True if `values` contains datetime-like entries.
559+
560+
Notes
561+
-----
562+
This method modifies the `out` parameter, rather than returning an object.
526563
"""
527-
Only transforms on axis=0
528-
"""
564+
529565
cdef:
530566
Py_ssize_t i, j, N, K, size
531567
groupby_t val, mval
532568
ndarray[groupby_t, ndim=2] accum
533569
int64_t lab
534570

535571
N, K = (<object>values).shape
536-
accum = np.empty_like(values)
572+
accum = np.empty((ngroups, K), np.asarray(values).dtype)
537573
if groupby_t is int64_t:
538574
accum[:] = -_int64_max
539575
else:

pandas/core/groupby/ops.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -361,8 +361,8 @@ def get_group_levels(self):
361361
'cummax': 'group_cummax',
362362
'rank': {
363363
'name': 'group_rank',
364-
'f': lambda func, a, b, c, d, **kwargs: func(
365-
a, b, c, d,
364+
'f': lambda func, a, b, c, d, e, **kwargs: func(
365+
a, b, c, e,
366366
kwargs.get('ties_method', 'average'),
367367
kwargs.get('ascending', True),
368368
kwargs.get('pct', False),
@@ -600,9 +600,10 @@ def _transform(self, result, values, comp_ids, transform_func,
600600
for i, chunk in enumerate(values.transpose(2, 0, 1)):
601601

602602
transform_func(result[:, :, i], values,
603-
comp_ids, is_datetimelike, **kwargs)
603+
comp_ids, ngroups, is_datetimelike, **kwargs)
604604
else:
605-
transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
605+
transform_func(result, values, comp_ids, ngroups, is_datetimelike,
606+
**kwargs)
606607

607608
return result
608609

pandas/tests/groupby/test_transform.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
1010

1111
import pandas as pd
12-
from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
12+
from pandas import (
13+
Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range)
1314
from pandas.core.groupby.groupby import DataError
1415
from pandas.util import testing as tm
1516
from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -470,7 +471,8 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
470471
ans = np.zeros_like(data)
471472

472473
labels = np.array([0, 0, 0, 0], dtype=np.int64)
473-
pd_op(ans, data, labels, is_datetimelike)
474+
ngroups = 1
475+
pd_op(ans, data, labels, ngroups, is_datetimelike)
474476

475477
tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
476478
check_dtype=False)
@@ -496,17 +498,19 @@ def test_cython_group_transform_algos():
496498

497499
# with nans
498500
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
501+
ngroups = 1
499502

500503
data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
501504
actual = np.zeros_like(data)
502505
actual.fill(np.nan)
503-
groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
506+
groupby.group_cumprod_float64(actual, data, labels, ngroups,
507+
is_datetimelike)
504508
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
505509
tm.assert_numpy_array_equal(actual[:, 0], expected)
506510

507511
actual = np.zeros_like(data)
508512
actual.fill(np.nan)
509-
groupby.group_cumsum(actual, data, labels, is_datetimelike)
513+
groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike)
510514
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
511515
tm.assert_numpy_array_equal(actual[:, 0], expected)
512516

@@ -515,7 +519,7 @@ def test_cython_group_transform_algos():
515519
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
516520
actual = np.zeros_like(data, dtype='int64')
517521
groupby.group_cumsum(actual, data.view('int64'), labels,
518-
is_datetimelike)
522+
ngroups, is_datetimelike)
519523
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
520524
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
521525
np.timedelta64(5, 'ns')])
@@ -863,3 +867,16 @@ def test_groupby_transform_with_datetimes(func, values):
863867
index=dates, name="price")
864868

865869
tm.assert_series_equal(result, expected)
870+
871+
872+
@pytest.mark.parametrize('func', ['cumsum', 'cumprod', 'cummin', 'cummax'])
873+
def test_transform_absent_categories(func):
874+
# GH 16771
875+
# cython transforms with more groups than rows
876+
x_vals = [1]
877+
x_cats = range(2)
878+
y = [1]
879+
df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
880+
result = getattr(df.y.groupby(df.x), func)()
881+
expected = df.y
882+
assert_series_equal(result, expected)

0 commit comments

Comments
 (0)