Commit fa5d600

abull authored and Adam committed

BUG: errors and segfaults in groupby cython transforms (pandas-dev#16771)

1 parent 872a23b

File tree: 5 files changed, +78 -22 lines changed


doc/source/whatsnew/v0.25.0.rst (+2 -1)

@@ -387,7 +387,8 @@ Groupby/Resample/Rolling
 - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
 - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
 - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
-- Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
 
 Reshaping
 ^^^^^^^^^

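For context, the failure mode behind GH 16771, as a hedged sketch drawn from the new regression test further down (not part of the commit itself): grouping on a Categorical with unobserved categories yields more group labels than rows, and the pre-fix cumulative transforms indexed a row-sized accumulator by group label.

import pandas as pd

# Two categories but one row: the lone value maps to group label 1,
# while the pre-fix accumulator (np.zeros_like(values)) had only one
# row -- an out-of-bounds write, hence wrong results or a segfault.
df = pd.DataFrame({'x': pd.Categorical([1], categories=range(2)),
                   'y': [1]})
df.y.groupby(df.x).cumsum()  # errored or segfaulted before this commit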
pandas/_libs/groupby.pyx (+26 -6)

@@ -142,19 +142,29 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           const float64_t[:, :] values,
                           const int64_t[:] labels,
+                          int ngroups,
                           bint is_datetimelike,
                           bint skipna=True):
+    """Cumulative product of columns of `values`, in row groups `labels`.
+
+    Arguments:
+    `out`: array to store cumprod in
+    `values`: values to take cumprod of
+    `labels`: labels to group by
+    `ngroups`: number of groups, larger than all entries of `labels`
+    `is_datetimelike`: always false, `values` is never datetime-like
+    `skipna`: if true, ignore nans in `values`
+
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         float64_t val
         float64_t[:, :] accum
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.ones_like(values)
+    accum = np.ones((ngroups, K), np.float64)
 
     with nogil:
         for i in range(N):
@@ -179,19 +189,29 @@ def group_cumprod_float64(float64_t[:, :] out,
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  is_datetimelike,
                  bint skipna=True):
+    """Cumulative sum of columns of `values`, in row groups `labels`.
+
+    Arguments:
+    `out`: array to store cumsum in
+    `values`: values to take cumsum of
+    `labels`: labels to group by
+    `ngroups`: number of groups, larger than all entries of `labels`
+    `is_datetimelike`: true if `values` contains datetime-like entries
+    `skipna`: if true, ignore nans in `values`
+
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         numeric val
         numeric[:, :] accum
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.zeros_like(values)
+    accum = np.zeros((ngroups, K), np.asarray(values).dtype)
 
     with nogil:
         for i in range(N):

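The allocation change above is the heart of the fix: `accum` is indexed as `accum[lab, j]`, so it needs one row per group, and `np.ones_like(values)` / `np.zeros_like(values)` were only safe while every label stayed below the row count. A rough pure-NumPy rendering of the cumsum loop, simplified from the Cython above for illustration only:

import numpy as np

def group_cumsum_sketch(values, labels, ngroups, skipna=True):
    # Slow Python equivalent of the cython group_cumsum above.
    N, K = values.shape
    out = np.full((N, K), np.nan)
    # One accumulator row per *group*; absent categories can make
    # ngroups exceed N, which broke the old row-shaped allocation.
    accum = np.zeros((ngroups, K), dtype=np.float64)
    for i in range(N):
        lab = labels[i]
        if lab < 0:                  # -1 marks rows outside any group
            continue
        for j in range(K):
            val = values[i, j]
            if val == val:           # val is not NaN
                accum[lab, j] += val
                out[i, j] = accum[lab, j]
            elif not skipna:
                accum[lab, j] = np.nan   # poison the running sum
    return out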
pandas/_libs/groupby_helper.pxi.in (+24 -6)

@@ -474,18 +474,27 @@ def group_min(groupby_t[:, :] out,
 def group_cummin(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  bint is_datetimelike):
+    """Cumulative minimum of columns of `values`, in row groups `labels`.
+
+    Arguments:
+    `out`: array to store cummin in
+    `values`: values to take cummin of
+    `labels`: labels to group by
+    `ngroups`: number of groups, larger than all entries of `labels`
+    `is_datetimelike`: true if `values` contains datetime-like entries
+
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = _int64_max
     else:
@@ -522,18 +531,27 @@ def group_cummin(groupby_t[:, :] out,
 def group_cummax(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  bint is_datetimelike):
+    """Cumulative maximum of columns of `values`, in row groups `labels`.
+
+    Arguments:
+    `out`: array to store cummax in
+    `values`: values to take cummax of
+    `labels`: labels to group by
+    `ngroups`: number of groups, larger than all entries of `labels`
+    `is_datetimelike`: true if `values` contains datetime-like entries
+
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max
     else:

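The cummin/cummax kernels get the same shape fix; their only wrinkle is that the accumulator starts at a sentinel rather than zero, so the first observed value in each group always wins the comparison. A hedged sketch of that initialization (illustrative helper, not pandas API):

import numpy as np

_int64_max = np.iinfo(np.int64).max

def init_cummin_accum(values, ngroups):
    # One row per group, filled with the largest representable value
    # so any real entry becomes the group's first running minimum.
    K = values.shape[1]
    accum = np.empty((ngroups, K), dtype=values.dtype)
    if values.dtype == np.int64:
        accum[:] = _int64_max    # int64 has no inf; use the max value
    else:
        accum[:] = np.inf        # floats: +inf loses every comparison
    return accum                 # group_cummax mirrors this, negated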
pandas/core/groupby/ops.py (+5 -4)

@@ -361,8 +361,8 @@ def get_group_levels(self):
         'cummax': 'group_cummax',
         'rank': {
             'name': 'group_rank',
-            'f': lambda func, a, b, c, d, **kwargs: func(
-                a, b, c, d,
+            'f': lambda func, a, b, c, d, e, **kwargs: func(
+                a, b, c, e,
                 kwargs.get('ties_method', 'average'),
                 kwargs.get('ascending', True),
                 kwargs.get('pct', False),
@@ -600,9 +600,10 @@ def _transform(self, result, values, comp_ids, transform_func,
             for i, chunk in enumerate(values.transpose(2, 0, 1)):
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, is_datetimelike, **kwargs)
+                               comp_ids, ngroups, is_datetimelike, **kwargs)
         else:
-            transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
+            transform_func(result, values, comp_ids, ngroups, is_datetimelike,
+                           **kwargs)
 
         return result

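After this change, _transform invokes every cython kernel with the uniform positional order (out, values, labels, ngroups, is_datetimelike, ...). group_rank did not grow an ngroups argument, which is why its adapter lambda above gains a fifth positional ('e') and now forwards 'e' while dropping 'd' (ngroups). A hedged sketch of that pattern with invented names:

def drop_ngroups(func):
    # Wrap a kernel that predates the ngroups convention so it can be
    # called like the others; ngroups is accepted, then discarded.
    def adapter(out, values, labels, ngroups, is_datetimelike, **kwargs):
        return func(out, values, labels, is_datetimelike, **kwargs)
    return adapter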
pandas/tests/groupby/test_transform.py (+21 -5)

@@ -9,7 +9,8 @@
 from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
 
 import pandas as pd
-from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
+from pandas import (
+    Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range)
 from pandas.core.groupby.groupby import DataError
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal, assert_series_equal
@@ -470,7 +471,8 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
     ans = np.zeros_like(data)
 
     labels = np.array([0, 0, 0, 0], dtype=np.int64)
-    pd_op(ans, data, labels, is_datetimelike)
+    ngroups = 1
+    pd_op(ans, data, labels, ngroups, is_datetimelike)
 
     tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                 check_dtype=False)
@@ -496,17 +498,19 @@ def test_cython_group_transform_algos():
 
     # with nans
     labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+    ngroups = 1
 
     data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+    groupby.group_cumprod_float64(actual, data, labels, ngroups,
+                                  is_datetimelike)
     expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    groupby.group_cumsum(actual, data, labels, is_datetimelike)
+    groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike)
     expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
     tm.assert_numpy_array_equal(actual[:, 0], expected)
 
@@ -515,7 +519,7 @@ def test_cython_group_transform_algos():
     data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
     actual = np.zeros_like(data, dtype='int64')
     groupby.group_cumsum(actual, data.view('int64'), labels,
-                         is_datetimelike)
+                         ngroups, is_datetimelike)
     expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
         2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
         np.timedelta64(5, 'ns')])
@@ -863,3 +867,15 @@ def test_groupby_transform_with_datetimes(func, values):
                            index=dates, name="price")
 
     tm.assert_series_equal(result, expected)
+
+
+def test_transform_absent_categories():
+    # GH 16771
+    # cython transforms with more groups than rows
+    x_vals = [1]
+    x_cats = range(2)
+    y = [1]
+    df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
+    result = df.y.groupby(df.x).cumsum()
+    expected = df.y
+    assert_series_equal(result, expected)

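The new regression test can be run on its own with `pytest pandas/tests/groupby/test_transform.py::test_transform_absent_categories`; before this commit the single-row cumsum raised or crashed the interpreter, and with the fix it returns the input series unchanged.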