From 41befd18fa55af800275fdb9d59cf1055c785542 Mon Sep 17 00:00:00 2001
From: abull
Date: Thu, 18 Apr 2019 13:40:41 +0100
Subject: [PATCH] BUG: errors and segfaults in groupby cython transforms
 (#16771)

---
 doc/source/whatsnew/v0.25.0.rst        |  3 +-
 pandas/_libs/groupby.pyx               | 52 +++++++++++++++++++++++---
 pandas/_libs/groupby_helper.pxi.in     | 48 +++++++++++++++++++++---
 pandas/core/groupby/ops.py             |  9 +++--
 pandas/tests/groupby/test_transform.py | 27 ++++++++++---
 5 files changed, 117 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 20d4f46348be6..353af98f5b64d 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -387,7 +387,8 @@ Groupby/Resample/Rolling
 - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
 - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
 - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
-- Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index c1fc0062dff09..2498445e78fc3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -142,11 +142,31 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           const float64_t[:, :] values,
                           const int64_t[:] labels,
+                          int ngroups,
                           bint is_datetimelike,
                           bint skipna=True):
+    """Cumulative product of columns of `values`, in row groups `labels`.
+
+    Parameters
+    ----------
+    out : float64 array
+        Array to store cumprod in.
+    values : float64 array
+        Values to take cumprod of.
+    labels : int64 array
+        Labels to group by.
+    ngroups : int
+        Number of groups, larger than all entries of `labels`.
+    is_datetimelike : bool
+        Always false, `values` is never datetime-like.
+    skipna : bool
+        If true, ignore nans in `values`.
+
+    Notes
+    -----
+    This method modifies the `out` parameter, rather than returning an object.
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         float64_t val
@@ -154,7 +174,7 @@ def group_cumprod_float64(float64_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.ones_like(values)
+    accum = np.ones((ngroups, K), dtype=np.float64)
 
     with nogil:
         for i in range(N):
@@ -179,11 +199,31 @@ def group_cumprod_float64(float64_t[:, :] out,
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  is_datetimelike,
                  bint skipna=True):
+    """Cumulative sum of columns of `values`, in row groups `labels`.
+
+    Parameters
+    ----------
+    out : array
+        Array to store cumsum in.
+    values : array
+        Values to take cumsum of.
+    labels : int64 array
+        Labels to group by.
+    ngroups : int
+        Number of groups, larger than all entries of `labels`.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+    skipna : bool
+        If true, ignore nans in `values`.
+
+    Notes
+    -----
+    This method modifies the `out` parameter, rather than returning an object.
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         numeric val
@@ -191,7 +231,7 @@ def group_cumsum(numeric[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.zeros_like(values)
+    accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
     with nogil:
         for i in range(N):
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 63cd4d6ac6ff2..8e351244b7f43 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -474,10 +474,28 @@ def group_min(groupby_t[:, :] out,
 def group_cummin(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  bint is_datetimelike):
+    """Cumulative minimum of columns of `values`, in row groups `labels`.
+
+    Parameters
+    ----------
+    out : array
+        Array to store cummin in.
+    values : array
+        Values to take cummin of.
+    labels : int64 array
+        Labels to group by.
+    ngroups : int
+        Number of groups, larger than all entries of `labels`.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+
+    Notes
+    -----
+    This method modifies the `out` parameter, rather than returning an object.
     """
-    Only transforms on axis=0
-    """
+
     cdef:
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
@@ -485,7 +503,7 @@ def group_cummin(groupby_t[:, :] out,
         int64_t lab
 
     N, K = (<object>values).shape
-    accum = np.empty_like(values)
+    accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = _int64_max
     else:
@@ -522,10 +540,28 @@ def group_cummin(groupby_t[:, :] out,
 def group_cummax(groupby_t[:, :] out,
                  groupby_t[:, :] values,
                  const int64_t[:] labels,
+                 int ngroups,
                  bint is_datetimelike):
+    """Cumulative maximum of columns of `values`, in row groups `labels`.
+
+    Parameters
+    ----------
+    out : array
+        Array to store cummax in.
+    values : array
+        Values to take cummax of.
+    labels : int64 array
+        Labels to group by.
+    ngroups : int
+        Number of groups, larger than all entries of `labels`.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+
+    Notes
+    -----
+    This method modifies the `out` parameter, rather than returning an object.
""" - Only transforms on axis=0 - """ + cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval @@ -533,7 +569,7 @@ def group_cummax(groupby_t[:, :] out, int64_t lab N, K = (values).shape - accum = np.empty_like(values) + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = -_int64_max else: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 82b9d6c1269f9..d402ae3682f0e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -361,8 +361,8 @@ def get_group_levels(self): 'cummax': 'group_cummax', 'rank': { 'name': 'group_rank', - 'f': lambda func, a, b, c, d, **kwargs: func( - a, b, c, d, + 'f': lambda func, a, b, c, d, e, **kwargs: func( + a, b, c, e, kwargs.get('ties_method', 'average'), kwargs.get('ascending', True), kwargs.get('pct', False), @@ -600,9 +600,10 @@ def _transform(self, result, values, comp_ids, transform_func, for i, chunk in enumerate(values.transpose(2, 0, 1)): transform_func(result[:, :, i], values, - comp_ids, is_datetimelike, **kwargs) + comp_ids, ngroups, is_datetimelike, **kwargs) else: - transform_func(result, values, comp_ids, is_datetimelike, **kwargs) + transform_func(result, values, comp_ids, ngroups, is_datetimelike, + **kwargs) return result diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index e865dc35c71b0..e330329644269 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -9,7 +9,8 @@ from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype import pandas as pd -from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range +from pandas import ( + Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range) from pandas.core.groupby.groupby import DataError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -470,7 +471,8 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): ans = np.zeros_like(data) labels = np.array([0, 0, 0, 0], dtype=np.int64) - pd_op(ans, data, labels, is_datetimelike) + ngroups = 1 + pd_op(ans, data, labels, ngroups, is_datetimelike) tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) @@ -496,17 +498,19 @@ def test_cython_group_transform_algos(): # with nans labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + ngroups = 1 data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumprod_float64(actual, data, labels, is_datetimelike) + groupby.group_cumprod_float64(actual, data, labels, ngroups, + is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') tm.assert_numpy_array_equal(actual[:, 0], expected) actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumsum(actual, data, labels, is_datetimelike) + groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike) expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') tm.assert_numpy_array_equal(actual[:, 0], expected) @@ -515,7 +519,7 @@ def test_cython_group_transform_algos(): data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] actual = np.zeros_like(data, dtype='int64') groupby.group_cumsum(actual, data.view('int64'), labels, - is_datetimelike) + ngroups, is_datetimelike) expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), np.timedelta64(5, 'ns')]) @@ -863,3 
                          index=dates, name="price")
 
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('func', ['cumsum', 'cumprod', 'cummin', 'cummax'])
+def test_transform_absent_categories(func):
+    # GH 16771
+    # cython transforms with more groups than rows
+    x_vals = [1]
+    x_cats = range(2)
+    y = [1]
+    df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
+    result = getattr(df.y.groupby(df.x), func)()
+    expected = df.y
+    assert_series_equal(result, expected)
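
Illustrative note (editor's sketch, not part of the patch): the cumulative
transforms keep a per-group accumulator that is indexed by group label,
`accum[lab, j]`. Before this patch the accumulator was allocated with the
shape of `values` (`np.ones_like` / `np.zeros_like` / `np.empty_like`), which
is only safe while every label is smaller than the number of rows. A
categorical grouper with absent categories breaks that assumption: `ngroups`
can exceed `len(values)`, so `accum[lab, j]` touched memory out of bounds,
yielding wrong results or a segfault (GH 16771). The fix allocates
`(ngroups, K)` instead. A minimal pure-NumPy sketch of the same loop
structure, assuming `skipna=True` and float input; `group_cumsum_sketch` is a
hypothetical name, the real implementation being the Cython `group_cumsum`
above:

    import numpy as np

    def group_cumsum_sketch(values, labels, ngroups):
        # out[i, j] is the running sum of values[:, j] within group labels[i].
        N, K = values.shape
        out = np.full((N, K), np.nan)
        # The heart of the fix: one accumulator row per *group*. With absent
        # categories ngroups > N, so a values-shaped accumulator (the
        # pre-patch np.zeros_like(values)) would be too small.
        accum = np.zeros((ngroups, K))
        for i in range(N):
            lab = labels[i]
            if lab < 0:            # negative label: row belongs to no group
                continue
            for j in range(K):
                val = values[i, j]
                if np.isnan(val):  # skipna=True: NaN passes through
                    continue
                accum[lab, j] += val
                out[i, j] = accum[lab, j]
        return out

    # GH 16771 shape: one row but two categories, so ngroups (2) exceeds N (1).
    vals = np.array([[1.0]])
    labels = np.array([0])
    print(group_cumsum_sketch(vals, labels, ngroups=2))  # [[1.]]

The new `test_transform_absent_categories` test exercises exactly this shape:
a single row grouped by a two-category categorical, which segfaulted before
the accumulator was resized.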