From 7cd4dc3cc0f7e6bffe80cc1d97293638da66d638 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:06:19 -0400 Subject: [PATCH 01/32] wip --- asv_bench/benchmarks/groupby.py | 23 ++++++++++++++++++++ pandas/_libs/groupby.pyx | 22 +++++++++----------- pandas/core/groupby/groupby.py | 11 ++++++---- pandas/core/groupby/ops.py | 37 +++++++++++++++++++-------------- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 9930c61e34b15..d825c78cc5e5b 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -493,6 +493,29 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) +class CumminMax: + """ + Benchmarks specifically targetting our cython aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). + """ + + param_names = ["dtype", "method"] + params = [ + ["float64", "int64", "Float64", "Int64"], + ["cummin", "cummax"], + ] + + def setup(self, dtype, method): + N = 1_000_000 + df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij")) + df["key"] = np.random.randint(0, 100, size=N) + self.df = df + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method) + + class RankWithTies: # GH 21237 param_names = ["dtype", "tie_method"] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 545d6a10232ab..3baf99837bc95 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1251,9 +1251,9 @@ def group_min(groupby_t[:, ::1] out, @cython.wraparound(False) def group_cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, const int64_t[:] labels, int ngroups, - bint is_datetimelike, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1268,8 +1268,6 @@ def group_cummin_max(groupby_t[:, ::1] out, Labels to group by. ngroups : int Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1281,11 +1279,11 @@ def group_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - ndarray[groupby_t, ndim=2] accum + groupby_t[:, ::1] accum int64_t lab N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C") if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: @@ -1302,7 +1300,7 @@ def group_cummin_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - if _treat_as_na(val, is_datetimelike): + if mask[i, j]: out[i, j] = val else: mval = accum[lab, j] @@ -1319,19 +1317,19 @@ def group_cummin_max(groupby_t[:, ::1] out, @cython.wraparound(False) def group_cummin(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, const int64_t[:] labels, - int ngroups, - bint is_datetimelike): + int ngroups): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False) + group_cummin_max(out, values, mask, labels, ngroups, compute_max=False) @cython.boundscheck(False) @cython.wraparound(False) def group_cummax(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, const int64_t[:] labels, - int ngroups, - bint is_datetimelike): + int ngroups): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True) + group_cummin_max(out, values, mask, labels, ngroups, compute_max=True) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f33833193e4e0..22f363970df3b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1024,7 +1024,7 @@ def _cumcount_array(self, ascending: bool = True): @final def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs + self, how: str, numeric_only: bool = True, axis: int = 0, needs_mask: bool = False, **kwargs ): output: Dict[base.OutputKey, np.ndarray] = {} @@ -1034,9 +1034,12 @@ def _cython_transform( if numeric_only and not is_numeric: continue + mask = None + if needs_mask: + mask = np.require(isna(obj._values).view(np.uint8), requirements='C') try: result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs + "transform", obj._values, how, axis, mask=mask, **kwargs ) except NotImplementedError: continue @@ -2588,7 +2591,7 @@ def cummin(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False) + return self._cython_transform("cummin", numeric_only=False, needs_mask=True) @final @Substitution(name="groupby") @@ -2604,7 +2607,7 @@ def cummax(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False) + return self._cython_transform("cummax", numeric_only=False, needs_mask=True) @final def _get_cythonized_result( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c5d36d1588a5f..bb06ba629fdd0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -524,7 +524,7 @@ def _disallow_invalid_ops( @final def _ea_wrap_cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs ) -> np.ndarray: """ If we have an ExtensionArray, unwrap, call _cython_operation, and @@ -538,7 +538,7 @@ def _ea_wrap_cython_operation( # operate on the tz-naive equivalents values = values.view("M8[ns]") res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs + kind, values, how, axis, min_count, mask=mask, **kwargs ) if how in ["rank"]: # preserve float64 dtype @@ -550,14 +550,15 @@ def _ea_wrap_cython_operation( elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): # IntegerArray or BooleanArray - values = ensure_int_or_float(values) + values = ensure_int_or_float(values._data) res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs + kind, values, how, axis, min_count, mask=mask, **kwargs ) dtype = maybe_cast_result_dtype(orig_values.dtype, how) if isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() - return cls._from_sequence(res_values, dtype=dtype) + return cls(res_values, mask.astype(bool, copy=True)) + # return cls._from_sequence(res_values, dtype=dtype) return res_values @@ -565,10 +566,10 @@ def _ea_wrap_cython_operation( # FloatingArray values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs + kind, values, how, axis, min_count, mask=mask, **kwargs ) - result = type(orig_values)._from_sequence(res_values) - return result + # result = type(orig_values)._from_sequence(res_values) + return type(orig_values)(res_values, mask.astype(bool, copy=True)) raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" @@ -576,7 +577,7 @@ def _ea_wrap_cython_operation( @final def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs ) -> np.ndarray: """ Returns the values of a cython operation. @@ -600,7 +601,7 @@ def _cython_operation( if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation( - kind, values, how, axis, min_count, **kwargs + kind, values, how, axis, min_count, mask=mask, **kwargs ) is_datetimelike = needs_i8_conversion(dtype) @@ -628,6 +629,8 @@ def _cython_operation( swapped = False if vdim == 1: values = values[:, None] + if mask is not None: + mask = mask[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: @@ -650,9 +653,8 @@ def _cython_operation( else: out_dtype = "object" - codes, _, _ = self.group_info - if kind == "aggregate": + codes, _, _ = self.group_info result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) @@ -661,7 +663,7 @@ def _cython_operation( # TODO: min_count result = self._transform( - result, values, codes, func, is_datetimelike, **kwargs + result, values, func, is_datetimelike, mask=mask, **kwargs ) if is_integer_dtype(result.dtype) and not is_datetimelike: @@ -704,11 +706,14 @@ def _aggregate( @final def _transform( - self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs - ): + self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, mask: Optional[np.ndarray] = None, **kwargs + ) -> np.ndarray: comp_ids, _, ngroups = self.group_info - transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) + if mask is not None: + transform_func(result, values, mask, comp_ids, ngroups, **kwargs) + else: + transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) return result From 91984dc90663d276f66b22135afa327e3e6872c8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:15:53 -0400 Subject: [PATCH 02/32] wip --- pandas/_libs/groupby.pyx | 22 ++++++++++++---------- pandas/tests/groupby/test_function.py | 6 ++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3baf99837bc95..3981661c62105 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1300,17 +1300,19 @@ def group_cummin_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - if mask[i, j]: - out[i, j] = val + # if mask[i, j]: + # out[i, j] = val + # if True: + # + # else: + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val else: - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val - else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval @cython.boundscheck(False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 515774eae009b..cd25eddb84826 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -846,6 +846,12 @@ def test_cummax_all_nan_column(): tm.assert_frame_equal(expected, result) +# @pytest.mark.parametrize("method", ["cummin", "cummax"]) +# def test_cummin_max_nullable_ints_no_float_cast(method): + + + + @pytest.mark.parametrize( "in_vals, out_vals", [ From b371cc5a96f52b30d39876c183c7cd1c0abb0d95 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:28:06 -0400 Subject: [PATCH 03/32] wip --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 22f363970df3b..31147179ea70c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1036,7 +1036,8 @@ def _cython_transform( mask = None if needs_mask: - mask = np.require(isna(obj._values).view(np.uint8), requirements='C') + mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values))) + # mask = np.require(isna(obj._values).view(np.uint8), requirements='C') try: result = self.grouper._cython_operation( "transform", obj._values, how, axis, mask=mask, **kwargs From 69cce960b9b7bf65ccd6c74793f7d926b307d8bf Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:32:41 -0400 Subject: [PATCH 04/32] wip --- pandas/_libs/groupby.pyx | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3981661c62105..3baf99837bc95 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1300,19 +1300,17 @@ def group_cummin_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - # if mask[i, j]: - # out[i, j] = val - # if True: - # - # else: - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val + if mask[i, j]: + out[i, j] = val else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval @cython.boundscheck(False) From dd7f324fde0e265f7ec88dccd404feeadb61aaac Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:41:44 -0400 Subject: [PATCH 05/32] wip --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3baf99837bc95..ce7a8b853e523 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1279,7 +1279,7 @@ def group_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - groupby_t[:, ::1] accum + ndarray[groupby_t, ndim=2] accum int64_t lab N, K = (values).shape From 64680d4c68a157c63e1796c31ef1e01f2eb77fd7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 14:51:24 -0400 Subject: [PATCH 06/32] wip --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 31147179ea70c..1175fa345cfe2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1036,8 +1036,8 @@ def _cython_transform( mask = None if needs_mask: - mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values))) - # mask = np.require(isna(obj._values).view(np.uint8), requirements='C') + # mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values))) + mask = np.require(isna(obj._values).view(np.uint8), requirements='C') try: result = self.grouper._cython_operation( "transform", obj._values, how, axis, mask=mask, **kwargs From be16f65378b58ae2822cc6eea461ff29b65e82a8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 15:00:02 -0400 Subject: [PATCH 07/32] wip --- pandas/core/groupby/ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bb06ba629fdd0..6f67153d89c66 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -550,7 +550,7 @@ def _ea_wrap_cython_operation( elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): # IntegerArray or BooleanArray - values = ensure_int_or_float(values._data) + values = values._data res_values = self._cython_operation( kind, values, how, axis, min_count, mask=mask, **kwargs ) @@ -564,7 +564,8 @@ def _ea_wrap_cython_operation( elif is_float_dtype(values.dtype): # FloatingArray - values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) + values = values._data + # values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) res_values = self._cython_operation( kind, values, how, axis, min_count, mask=mask, **kwargs ) From 9442846bc8561cb466c699e11f15e390c47e99e0 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 15:06:41 -0400 Subject: [PATCH 08/32] wip --- pandas/core/groupby/ops.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6f67153d89c66..3240646933ef0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -613,12 +613,13 @@ def _cython_operation( elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): + pass # we use iNaT for the missing value on ints # so pre-convert to guard this condition - if (values == iNaT).any(): - values = ensure_float64(values) - else: - values = ensure_int_or_float(values) + # if (values == iNaT).any(): + # values = ensure_float64(values) + # else: + # values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(dtype): values = ensure_float64(values) else: From f089175a58300909435f40c333bd15b69b11a03b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 17:11:40 -0400 Subject: [PATCH 09/32] wip --- asv_bench/benchmarks/groupby.py | 6 +- pandas/_libs/groupby.pyx | 47 ++++++++++++--- pandas/core/groupby/groupby.py | 15 ++--- pandas/core/groupby/ops.py | 103 +++++++++++++++++++++++--------- 4 files changed, 123 insertions(+), 48 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d825c78cc5e5b..dc4f7dcfabd7f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -508,12 +508,12 @@ class CumminMax: def setup(self, dtype, method): N = 1_000_000 - df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij")) + df = DataFrame(np.random.randint(0, 1000, (N, 10)), columns=list("abcdefghij"), dtype=dtype) df["key"] = np.random.randint(0, 100, size=N) self.df = df - def time_frame_agg(self, dtype, method): - self.df.groupby("key").agg(method) + def time_frame_transform(self, dtype, method): + self.df.groupby("key").transform(method) class RankWithTies: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ce7a8b853e523..68ba284a3f4cc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1254,6 +1254,8 @@ def group_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const int64_t[:] labels, int ngroups, + bint is_datetimelike, + bint use_mask, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1264,10 +1266,19 @@ def group_cummin_max(groupby_t[:, ::1] out, Array to store cummin/max in. values : array Values to take cummin/max of. + mask : array[uint8_t] + If `use_mask`, then indices represent missing values, + otherwise will be passed as a zeroed array labels : int64 array Labels to group by. ngroups : int Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + use_mask : bool + True if the mask should be used (otherwise we continue + as if it is not a masked algorithm). Avoids the cost + of checking for a completely zeroed mask. compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1281,6 +1292,7 @@ def group_cummin_max(groupby_t[:, ::1] out, groupby_t val, mval ndarray[groupby_t, ndim=2] accum int64_t lab + bint val_is_nan N, K = (values).shape accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C") @@ -1298,11 +1310,26 @@ def group_cummin_max(groupby_t[:, ::1] out, if lab < 0: continue for j in range(K): - val = values[i, j] - - if mask[i, j]: - out[i, j] = val + val_is_nan = False + + # If using the mask, we can avoid grabbing the + # value unless necessary + if use_mask: + if mask[i, j]: + # `out` does not need to be set since it + # will be masked anyway + val_is_nan = True + val = values[i, j] + + # Otherwise, `out` must be set accordingly if the + # value is missing else: + val = values[i, j] + if _treat_as_na(val, is_datetimelike): + val_is_nan = True + out[i, j] = val + + if not val_is_nan: mval = accum[lab, j] if compute_max: if val > mval: @@ -1319,9 +1346,11 @@ def group_cummin(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, uint8_t[:, ::1] mask, const int64_t[:] labels, - int ngroups): + int ngroups, + bint is_datetimelike, + bint use_mask): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, mask, labels, ngroups, compute_max=False) + group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=False) @cython.boundscheck(False) @@ -1330,6 +1359,8 @@ def group_cummax(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, uint8_t[:, ::1] mask, const int64_t[:] labels, - int ngroups): + int ngroups, + bint is_datetimelike, + bint use_mask): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, mask, labels, ngroups, compute_max=True) + group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=True) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1175fa345cfe2..9896086b69033 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -86,6 +86,7 @@ class providing the base-class of operations. Categorical, DatetimeArray, ExtensionArray, + BaseMaskedArray ) from pandas.core.base import ( DataError, @@ -105,6 +106,7 @@ class providing the base-class of operations. Index, MultiIndex, ) +from pandas.core.groupby.ops import does_cython_function_use_mask from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -1024,23 +1026,18 @@ def _cumcount_array(self, ascending: bool = True): @final def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, needs_mask: bool = False, **kwargs + self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs ): output: Dict[base.OutputKey, np.ndarray] = {} - for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - mask = None - if needs_mask: - # mask = np.zeros(dtype=np.uint8, order="C", shape=(len(obj._values))) - mask = np.require(isna(obj._values).view(np.uint8), requirements='C') try: result = self.grouper._cython_operation( - "transform", obj._values, how, axis, mask=mask, **kwargs + "transform", obj._values, how, axis, **kwargs ) except NotImplementedError: continue @@ -2592,7 +2589,7 @@ def cummin(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False, needs_mask=True) + return self._cython_transform("cummin", numeric_only=False) @final @Substitution(name="groupby") @@ -2608,7 +2605,7 @@ def cummax(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False, needs_mask=True) + return self._cython_transform("cummax", numeric_only=False) @final def _get_cythonized_result( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3240646933ef0..b3b7191e533c9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -20,6 +20,10 @@ Tuple, Type, ) +from pandas.core.arrays.masked import ( + BaseMaskedArray, + BaseMaskedDtype, +) import numpy as np @@ -36,6 +40,7 @@ FrameOrSeries, Shape, final, + ArrayLike ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -70,6 +75,9 @@ isna, maybe_fill, ) +from pandas.core.arrays import ( + BaseMaskedArray +) from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -115,6 +123,7 @@ "cummax": "group_cummax", "rank": "group_rank", }, + "needs_mask": {"cummin", "cummax"} } @@ -154,6 +163,10 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool) return func +def does_cython_function_use_mask(kind: str) -> bool: + return kind in _CYTHON_FUNCTIONS["needs_mask"] + + class BaseGrouper: """ This is an internal Grouper class, which actually holds @@ -471,6 +484,7 @@ def _get_cython_func_and_vals( ------- func : callable values : np.ndarray + needs_mask : True if a mask must be passed """ try: func = _get_cython_function(kind, how, values.dtype, is_numeric) @@ -486,7 +500,8 @@ def _get_cython_func_and_vals( func = _get_cython_function(kind, how, values.dtype, is_numeric) else: raise - return func, values + needs_mask = how in _CYTHON_FUNCTIONS["needs_mask"] + return func, values, needs_mask @final def _disallow_invalid_ops( @@ -524,8 +539,8 @@ def _disallow_invalid_ops( @final def _ea_wrap_cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs - ) -> np.ndarray: + self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + ) -> ArrayLike: """ If we have an ExtensionArray, unwrap, call _cython_operation, and re-wrap if appropriate. @@ -538,7 +553,7 @@ def _ea_wrap_cython_operation( # operate on the tz-naive equivalents values = values.view("M8[ns]") res_values = self._cython_operation( - kind, values, how, axis, min_count, mask=mask, **kwargs + kind, values, how, axis, min_count, **kwargs ) if how in ["rank"]: # preserve float64 dtype @@ -550,36 +565,60 @@ def _ea_wrap_cython_operation( elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): # IntegerArray or BooleanArray - values = values._data + values = ensure_int_or_float(values) res_values = self._cython_operation( - kind, values, how, axis, min_count, mask=mask, **kwargs + kind, values, how, axis, min_count, **kwargs ) dtype = maybe_cast_result_dtype(orig_values.dtype, how) if isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() - return cls(res_values, mask.astype(bool, copy=True)) - # return cls._from_sequence(res_values, dtype=dtype) + return cls._from_sequence(res_values, dtype=dtype) return res_values elif is_float_dtype(values.dtype): # FloatingArray - values = values._data - # values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) + values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) res_values = self._cython_operation( - kind, values, how, axis, min_count, mask=mask, **kwargs + kind, values, how, axis, min_count, **kwargs ) - # result = type(orig_values)._from_sequence(res_values) - return type(orig_values)(res_values, mask.astype(bool, copy=True)) + result = type(orig_values)._from_sequence(res_values) + return result raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" ) + @final + def _masked_ea_wrap_cython_operation( + self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + ) -> ArrayLike: + """ + Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's + and cython algorithms which accept a mask. + """ + orig_values = values + + mask = isna(values).copy() + values = values._data + + if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): + # IntegerArray or BooleanArray + values = ensure_int_or_float(values) + + res_values = self._cython_operation( + kind, values, how, axis, min_count, mask=mask, **kwargs + ) + dtype = maybe_cast_result_dtype(orig_values.dtype, how) + cls = dtype.construct_array_type() + + return cls(res_values, mask.astype(bool, copy=True)) + + @final def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs - ) -> np.ndarray: + ) -> ArrayLike: """ Returns the values of a cython operation. """ @@ -601,9 +640,14 @@ def _cython_operation( self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): - return self._ea_wrap_cython_operation( - kind, values, how, axis, min_count, mask=mask, **kwargs - ) + if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how): + return self._masked_ea_wrap_cython_operation( + kind, values, how, axis, min_count, **kwargs + ) + else: + return self._ea_wrap_cython_operation( + kind, values, how, axis, min_count, **kwargs + ) is_datetimelike = needs_i8_conversion(dtype) @@ -613,13 +657,12 @@ def _cython_operation( elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): - pass # we use iNaT for the missing value on ints # so pre-convert to guard this condition - # if (values == iNaT).any(): - # values = ensure_float64(values) - # else: - # values = ensure_int_or_float(values) + if mask is None and (values == iNaT).any(): + values = ensure_float64(values) + else: + values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(dtype): values = ensure_float64(values) else: @@ -633,6 +676,7 @@ def _cython_operation( values = values[:, None] if mask is not None: mask = mask[:, None] + out_shape = (self.ngroups, arity) else: if axis > 0: @@ -645,7 +689,11 @@ def _cython_operation( ) out_shape = (self.ngroups,) + values.shape[1:] - func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) + func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric) + use_mask = mask is not None + if needs_mask: + if mask is None: + mask = np.zeros_like(values, dtype=np.uint8, order="C") if how == "rank": out_dtype = "float" @@ -662,13 +710,12 @@ def _cython_operation( result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) - # TODO: min_count result = self._transform( - result, values, func, is_datetimelike, mask=mask, **kwargs + result, values, func, is_datetimelike, use_mask, mask, **kwargs ) - if is_integer_dtype(result.dtype) and not is_datetimelike: + if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") @@ -708,12 +755,12 @@ def _aggregate( @final def _transform( - self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, mask: Optional[np.ndarray] = None, **kwargs + self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs ) -> np.ndarray: comp_ids, _, ngroups = self.group_info if mask is not None: - transform_func(result, values, mask, comp_ids, ngroups, **kwargs) + transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs) else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) From 31409f8fdf745be2f298c6de4dd483c62abbff35 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 17:41:42 -0400 Subject: [PATCH 10/32] wip --- asv_bench/benchmarks/groupby.py | 11 ++++++++++- pandas/_libs/groupby.pyx | 1 + pandas/core/groupby/ops.py | 1 - pandas/tests/groupby/test_function.py | 14 ++++++++++++-- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index dc4f7dcfabd7f..d96f9f3b5fef2 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -508,13 +508,22 @@ class CumminMax: def setup(self, dtype, method): N = 1_000_000 - df = DataFrame(np.random.randint(0, 1000, (N, 10)), columns=list("abcdefghij"), dtype=dtype) + vals = np.random.randint(0, 1000, (N, 10)) + null_vals = vals.copy() + null_vals[::2, :] = np.nan + df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype) + null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype) df["key"] = np.random.randint(0, 100, size=N) + null_df["key"] = np.random.randint(0, 100, size=N) self.df = df + self.null_df = null_df def time_frame_transform(self, dtype, method): self.df.groupby("key").transform(method) + def time_frame_transform_many_nulls(self, dtype, method): + self.null_df.groupby("key").transform(method) + class RankWithTies: # GH 21237 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 68ba284a3f4cc..8c899c780b8ee 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1319,6 +1319,7 @@ def group_cummin_max(groupby_t[:, ::1] out, # `out` does not need to be set since it # will be masked anyway val_is_nan = True + else: val = values[i, j] # Otherwise, `out` must be set accordingly if the diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b3b7191e533c9..b5c3fd8ed065a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -614,7 +614,6 @@ def _masked_ea_wrap_cython_operation( return cls(res_values, mask.astype(bool, copy=True)) - @final def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cd25eddb84826..4b523662679e4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -846,10 +846,20 @@ def test_cummax_all_nan_column(): tm.assert_frame_equal(expected, result) -# @pytest.mark.parametrize("method", ["cummin", "cummax"]) -# def test_cummin_max_nullable_ints_no_float_cast(method): +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2 ** 53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = pd.DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") + result = grouped.transform(method) + expected = pd.DataFrame({"b": data}, dtype=dtype) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 0c05f74dfbf25068f32cbb4244796b68251f2e92 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 19:21:08 -0400 Subject: [PATCH 11/32] wip --- asv_bench/benchmarks/groupby.py | 43 ++------------------- doc/source/whatsnew/v1.3.0.rst | 2 + pandas/core/groupby/ops.py | 5 ++- pandas/tests/groupby/test_function.py | 54 ++++++++++++++------------- 4 files changed, 37 insertions(+), 67 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d96f9f3b5fef2..49a478e2c024d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -395,40 +395,10 @@ class GroupByMethods: params = [ ["int", "float", "object", "datetime"], [ - "all", - "any", - "bfill", - "count", - "cumcount", + "cummax", "cummin", - "cumprod", - "cumsum", - "describe", - "ffill", - "first", - "head", - "last", - "mad", - "max", - "min", - "median", - "mean", - "nunique", - "pct_change", - "prod", - "quantile", - "rank", - "sem", - "shift", - "size", - "skew", - "std", - "sum", - "tail", - "unique", - "value_counts", - "var", + ], ["direct", "transformation"], ] @@ -494,12 +464,6 @@ def time_frame_agg(self, dtype, method): class CumminMax: - """ - Benchmarks specifically targetting our cython aggregation algorithms - (using a big enough dataframe with simple key, so a large part of the - time is actually spent in the grouped aggregation). - """ - param_names = ["dtype", "method"] params = [ ["float64", "int64", "Float64", "Int64"], @@ -509,8 +473,9 @@ class CumminMax: def setup(self, dtype, method): N = 1_000_000 vals = np.random.randint(0, 1000, (N, 10)) - null_vals = vals.copy() + null_vals = vals.astype(float, copy=True) null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype) null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype) df["key"] = np.random.randint(0, 100, size=N) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 512e6e6cbb391..1ded085610f96 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -447,6 +447,8 @@ Performance improvements - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) +- Performance improvement in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` when using methods ``cummin`` and ``cummax`` with nullable data types (:issue:`37493`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b5c3fd8ed065a..06f7db5a49247 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -21,7 +21,6 @@ Type, ) from pandas.core.arrays.masked import ( - BaseMaskedArray, BaseMaskedDtype, ) @@ -599,6 +598,8 @@ def _masked_ea_wrap_cython_operation( """ orig_values = values + # isna just directly returns self._mask, so copy here to prevent + # modifying the original mask = isna(values).copy() values = values._data @@ -612,7 +613,7 @@ def _masked_ea_wrap_cython_operation( dtype = maybe_cast_result_dtype(orig_values.dtype, how) cls = dtype.construct_array_type() - return cls(res_values, mask.astype(bool, copy=True)) + return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)) @final def _cython_operation( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4b523662679e4..9703c11a21ca8 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -22,20 +22,27 @@ @pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64], - ids=["np.int32", "np.int64", "np.float32", "np.float64"], + params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], ) -def numpy_dtypes_for_minmax(request): +def dtypes_for_minmax(request): """ - Fixture of numpy dtypes with min and max values used for testing + Fixture of dtypes with min and max values used for testing cummin and cummax """ dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + min_val = ( - np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min + np.iinfo(np_type).min if np.dtype(np_type).kind == "i" else np.finfo(np_type).min ) max_val = ( - np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max + np.iinfo(np_type).max if np.dtype(np_type).kind == "i" else np.finfo(np_type).max ) return (dtype, min_val, max_val) @@ -734,9 +741,9 @@ def test_numpy_compat(func): getattr(g, func)(foo=1) -def test_cummin(numpy_dtypes_for_minmax): - dtype = numpy_dtypes_for_minmax[0] - min_val = numpy_dtypes_for_minmax[1] +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -780,19 +787,24 @@ def test_cummin(numpy_dtypes_for_minmax): tm.assert_series_equal(result, expected) -def test_cummin_all_nan_column(): +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, dtype): base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") - expected = DataFrame({"B": [np.nan] * 8}) - result = base_df.groupby("A").cummin() + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + + result = getattr(grouped["B"], method)().to_frame() tm.assert_frame_equal(expected, result) -def test_cummax(numpy_dtypes_for_minmax): - dtype = numpy_dtypes_for_minmax[0] - max_val = numpy_dtypes_for_minmax[2] +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -836,16 +848,6 @@ def test_cummax(numpy_dtypes_for_minmax): tm.assert_series_equal(result, expected) -def test_cummax_all_nan_column(): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - - expected = DataFrame({"B": [np.nan] * 8}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(expected, result) - - @td.skip_if_32bit @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize( From 5c60a1f35888920853a324353c9ccad90cf62e11 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 19:48:36 -0400 Subject: [PATCH 12/32] wip --- asv_bench/benchmarks/groupby.py | 2 - pandas/_libs/groupby.pyx | 32 ++++++++--- pandas/core/groupby/groupby.py | 2 - pandas/core/groupby/ops.py | 83 +++++++++++++++++++-------- pandas/tests/groupby/test_function.py | 12 ++-- 5 files changed, 91 insertions(+), 40 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 49a478e2c024d..803658d101b89 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -395,10 +395,8 @@ class GroupByMethods: params = [ ["int", "float", "object", "datetime"], [ - "cummax", "cummin", - ], ["direct", "transformation"], ] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index fa363425041e9..80f06a980bb9c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1270,14 +1270,10 @@ def group_cummin_max(groupby_t[:, ::1] out, Array to store cummin/max in. values : array Values to take cummin/max of. -<<<<<<< HEAD mask : array[uint8_t] If `use_mask`, then indices represent missing values, otherwise will be passed as a zeroed array - labels : int64 array -======= labels : np.ndarray[np.intp] ->>>>>>> origin/master Labels to group by. ngroups : int Number of groups, larger than all entries of `labels`. @@ -1320,14 +1316,16 @@ def group_cummin_max(groupby_t[:, ::1] out, for j in range(K): val_is_nan = False - # If using the mask, we can avoid grabbing the - # value unless necessary if use_mask: if mask[i, j]: + # `out` does not need to be set since it # will be masked anyway val_is_nan = True else: + + # If using the mask, we can avoid grabbing the + # value unless necessary val = values[i, j] # Otherwise, `out` must be set accordingly if the @@ -1359,7 +1357,16 @@ def group_cummin(groupby_t[:, ::1] out, bint is_datetimelike, bint use_mask): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=False) + group_cummin_max( + out, + values, + mask, + labels, + ngroups, + is_datetimelike, + use_mask, + compute_max=False + ) @cython.boundscheck(False) @@ -1372,4 +1379,13 @@ def group_cummax(groupby_t[:, ::1] out, bint is_datetimelike, bint use_mask): """See group_cummin_max.__doc__""" - group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=True) + group_cummin_max( + out, + values, + mask, + labels, + ngroups, + is_datetimelike, + use_mask, + compute_max=True + ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 69399b5969451..1ee38834c5758 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -85,7 +85,6 @@ class providing the base-class of operations. from pandas.core.arrays import ( Categorical, ExtensionArray, - BaseMaskedArray ) from pandas.core.base import ( DataError, @@ -105,7 +104,6 @@ class providing the base-class of operations. Index, MultiIndex, ) -from pandas.core.groupby.ops import does_cython_function_use_mask from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import NUMBA_FUNC_CACHE diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f8e5f0d95a777..250eaa25a5cda 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -20,9 +20,6 @@ Tuple, Type, ) -from pandas.core.arrays.masked import ( - BaseMaskedDtype, -) import numpy as np @@ -40,7 +37,6 @@ FrameOrSeries, Shape, final, - ArrayLike ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -75,10 +71,11 @@ isna, maybe_fill, ) -from pandas.core.arrays import ( - BaseMaskedArray -) +from pandas.core.arrays.masked import ( + BaseMaskedArray, + BaseMaskedDtype, +) from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -123,7 +120,11 @@ "cummax": "group_cummax", "rank": "group_rank", }, - "needs_mask": {"cummin", "cummax"} +} + +_CYTHON_MASKED_FUNCTIONS = { + "cummin", + "cummax", } @@ -163,8 +164,8 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool) return func -def does_cython_function_use_mask(kind: str) -> bool: - return kind in _CYTHON_FUNCTIONS["needs_mask"] +def cython_function_uses_mask(kind: str) -> bool: + return kind in _CYTHON_MASKED_FUNCTIONS class BaseGrouper: @@ -590,8 +591,14 @@ def _ea_wrap_cython_operation( @final def _masked_ea_wrap_cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs - ) -> ArrayLike: + self, + kind: str, + values: BaseMaskedArray, + how: str, + axis: int, + min_count: int = -1, + **kwargs, + ) -> BaseMaskedArray: """ Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's and cython algorithms which accept a mask. @@ -601,23 +608,33 @@ def _masked_ea_wrap_cython_operation( # isna just directly returns self._mask, so copy here to prevent # modifying the original mask = isna(values).copy() - values = values._data + arr = values._data if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): # IntegerArray or BooleanArray - values = ensure_int_or_float(values) + arr = ensure_int_or_float(arr) res_values = self._cython_operation( - kind, values, how, axis, min_count, mask=mask, **kwargs + kind, arr, how, axis, min_count, mask=mask, **kwargs ) dtype = maybe_cast_result_dtype(orig_values.dtype, how) + assert isinstance(dtype, BaseMaskedDtype) cls = dtype.construct_array_type() - return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)) + return cls( + res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True) + ) @final def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs + self, + kind: str, + values, + how: str, + axis: int, + min_count: int = -1, + mask: np.ndarray | None = None, + **kwargs, ) -> ArrayLike: """ Returns the values of a cython operation. @@ -640,7 +657,7 @@ def _cython_operation( self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): - if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how): + if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how): return self._masked_ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) @@ -689,7 +706,9 @@ def _cython_operation( ) out_shape = (self.ngroups,) + values.shape[1:] - func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric) + func, values, needs_mask = self._get_cython_func_and_vals( + kind, how, values, is_numeric + ) use_mask = mask is not None if needs_mask: if mask is None: @@ -716,10 +735,10 @@ def _cython_operation( ) if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike: - mask = result == iNaT - if mask.any(): + result_mask = result == iNaT + if result_mask.any(): result = result.astype("float64") - result[mask] = np.nan + result[result_mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 @@ -755,12 +774,28 @@ def _aggregate( @final def _transform( - self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs + self, + result: np.ndarray, + values: np.ndarray, + transform_func, + is_datetimelike: bool, + use_mask: bool, + mask: np.ndarray | None, + **kwargs, ) -> np.ndarray: comp_ids, _, ngroups = self.group_info if mask is not None: - transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs) + transform_func( + result, + values, + mask, + comp_ids, + ngroups, + is_datetimelike, + use_mask, + **kwargs, + ) else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9703c11a21ca8..12a8808d750d7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -39,10 +39,14 @@ def dtypes_for_minmax(request): np_type = np.float64 min_val = ( - np.iinfo(np_type).min if np.dtype(np_type).kind == "i" else np.finfo(np_type).min + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min ) max_val = ( - np.iinfo(np_type).max if np.dtype(np_type).kind == "i" else np.finfo(np_type).max + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max ) return (dtype, min_val, max_val) @@ -855,11 +859,11 @@ def test_cummax(dtypes_for_minmax): ) def test_nullable_int_not_cast_as_float(method, dtype, val): data = [val, pd.NA] - df = pd.DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) grouped = df.groupby("grp") result = grouped.transform(method) - expected = pd.DataFrame({"b": data}, dtype=dtype) + expected = DataFrame({"b": data}, dtype=dtype) tm.assert_frame_equal(result, expected) From f0c27ce09a1158391015569128d0fafacd85a06d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 20:02:38 -0400 Subject: [PATCH 13/32] PERF: use masked algo in groupby cummin and cummax --- asv_bench/benchmarks/groupby.py | 32 ++++++++++++++++++++++++++++++++ pandas/_libs/groupby.pyx | 2 +- pandas/core/groupby/ops.py | 13 +++++-------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 803658d101b89..d7e73b3cfcca3 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -395,8 +395,40 @@ class GroupByMethods: params = [ ["int", "float", "object", "datetime"], [ + "all", + "any", + "bfill", + "count", + "cumcount", "cummax", "cummin", + "cumprod", + "cumsum", + "describe", + "ffill", + "first", + "head", + "last", + "mad", + "max", + "min", + "median", + "mean", + "nunique", + "pct_change", + "prod", + "quantile", + "rank", + "sem", + "shift", + "size", + "skew", + "std", + "sum", + "tail", + "unique", + "value_counts", + "var", ], ["direct", "transformation"], ] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 80f06a980bb9c..3395fefcbaad3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1299,7 +1299,7 @@ def group_cummin_max(groupby_t[:, ::1] out, bint val_is_nan N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order="C") + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 250eaa25a5cda..50df42f555f53 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -484,7 +484,6 @@ def _get_cython_func_and_vals( ------- func : callable values : np.ndarray - needs_mask : True if a mask must be passed """ try: func = _get_cython_function(kind, how, values.dtype, is_numeric) @@ -500,8 +499,7 @@ def _get_cython_func_and_vals( func = _get_cython_function(kind, how, values.dtype, is_numeric) else: raise - needs_mask = how in _CYTHON_FUNCTIONS["needs_mask"] - return func, values, needs_mask + return func, values @final def _disallow_invalid_ops( @@ -656,8 +654,9 @@ def _cython_operation( # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) + func_uses_mask = cython_function_uses_mask(how) if is_extension_array_dtype(dtype): - if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how): + if isinstance(values, BaseMaskedArray) and func_uses_mask: return self._masked_ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) @@ -706,11 +705,9 @@ def _cython_operation( ) out_shape = (self.ngroups,) + values.shape[1:] - func, values, needs_mask = self._get_cython_func_and_vals( - kind, how, values, is_numeric - ) + func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) use_mask = mask is not None - if needs_mask: + if func_uses_mask: if mask is None: mask = np.zeros_like(values, dtype=np.uint8, order="C") From 2fa80ad0338861ec09f6874ae01a81f7b4bc81bd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 20:07:18 -0400 Subject: [PATCH 14/32] Avoid mask copy --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 50df42f555f53..145973f6a12f9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -620,7 +620,7 @@ def _masked_ea_wrap_cython_operation( cls = dtype.construct_array_type() return cls( - res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True) + res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=False) ) @final From 280c7e5359166535938221d637db72c8126c4768 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 20:31:43 -0400 Subject: [PATCH 15/32] Update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c2b3e58440e64..e9c9e8a77f637 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -635,6 +635,8 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) +- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` computing wrong result for methods ``cummin`` and ``cummax`` with nullable data types too large to roundtrip when casting to float (:issue:`37493`) +- Reshaping ^^^^^^^^^ From 7e2fbe0dfbe3e9fd670a245bf9477451e68635c2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 31 Mar 2021 20:14:32 -0400 Subject: [PATCH 16/32] Merge fixup --- pandas/core/groupby/ops.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9a32a08a11d12..b0e1acd07ac89 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -72,14 +72,11 @@ maybe_fill, ) -<<<<<<< HEAD from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, ) -======= from pandas.core.arrays import ExtensionArray ->>>>>>> origin/master from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame From 0ebb97ad570a386d6fb8fb0e85d584f5f48af9db Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 31 Mar 2021 20:51:44 -0400 Subject: [PATCH 17/32] Follow transpose --- pandas/core/groupby/ops.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b0e1acd07ac89..0a239a77f14b9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -72,11 +72,11 @@ maybe_fill, ) +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, ) -from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -652,7 +652,6 @@ def _cython_operation( # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) - func_uses_mask = cython_function_uses_mask(how) if is_extension_array_dtype(dtype): if isinstance(values, BaseMaskedArray) and func_uses_mask: @@ -667,8 +666,6 @@ def _cython_operation( elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] - if mask is not None: - mask = mask[None, :] res = self._cython_operation( kind=kind, values=values2d, @@ -705,9 +702,10 @@ def _cython_operation( arity = self._cython_arity.get(how, 1) ngroups = self.ngroups - assert axis == 1 values = values.T + if mask is not None: + mask = mask.reshape(values.shape, order="C") if how == "ohlc": out_shape = (ngroups, 4) elif arity > 1: @@ -733,7 +731,6 @@ def _cython_operation( else: out_dtype = "object" - codes, _, _ = self.group_info result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) From 0009dfd367d6bfb74381a72a3c3e4314b39edfcc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 31 Mar 2021 22:53:25 -0400 Subject: [PATCH 18/32] Compute mask usage inside algo --- pandas/_libs/groupby.pyx | 17 +++++------------ pandas/core/groupby/ops.py | 14 +++++--------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7ef47d461cc6d..5133e40cae66e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1236,7 +1236,6 @@ def group_cummin_max(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - bint use_mask, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1256,10 +1255,6 @@ def group_cummin_max(groupby_t[:, ::1] out, Number of groups, larger than all entries of `labels`. is_datetimelike : bool True if `values` contains datetime-like entries. - use_mask : bool - True if the mask should be used (otherwise we continue - as if it is not a masked algorithm). Avoids the cost - of checking for a completely zeroed mask. compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1273,7 +1268,9 @@ def group_cummin_max(groupby_t[:, ::1] out, groupby_t val, mval ndarray[groupby_t, ndim=2] accum intp_t lab - bint val_is_nan + bint val_is_nan, use_mask + + use_mask = mask is not None N, K = (values).shape accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) @@ -1331,8 +1328,7 @@ def group_cummin(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, int ngroups, - bint is_datetimelike, - bint use_mask): + bint is_datetimelike): """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1341,7 +1337,6 @@ def group_cummin(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, - use_mask, compute_max=False ) @@ -1353,8 +1348,7 @@ def group_cummax(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, int ngroups, - bint is_datetimelike, - bint use_mask): + bint is_datetimelike): """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1363,6 +1357,5 @@ def group_cummax(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, - use_mask, compute_max=True ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0a239a77f14b9..708e127796776 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -652,6 +652,7 @@ def _cython_operation( # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) + func_uses_mask = cython_function_uses_mask(how) if is_extension_array_dtype(dtype): if isinstance(values, BaseMaskedArray) and func_uses_mask: @@ -718,10 +719,6 @@ def _cython_operation( out_shape = (ngroups,) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) - use_mask = mask is not None - if func_uses_mask: - if mask is None: - mask = np.zeros_like(values, dtype=np.uint8, order="C") if how == "rank": out_dtype = "float" @@ -740,10 +737,10 @@ def _cython_operation( elif kind == "transform": # TODO: min_count result = self._transform( - result, values, func, is_datetimelike, use_mask, mask, **kwargs + result, values, func, is_datetimelike, mask, func_uses_mask, **kwargs ) - if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike: + if mask is None and is_integer_dtype(result.dtype) and not is_datetimelike: result_mask = result == iNaT if result_mask.any(): result = result.astype("float64") @@ -784,13 +781,13 @@ def _transform( values: np.ndarray, transform_func, is_datetimelike: bool, - use_mask: bool, mask: np.ndarray | None, + needs_mask: bool = False, **kwargs, ) -> np.ndarray: comp_ids, _, ngroups = self.group_info - if mask is not None: + if needs_mask: transform_func( result, values, @@ -798,7 +795,6 @@ def _transform( comp_ids, ngroups, is_datetimelike, - use_mask, **kwargs, ) else: From 66638324a7e0ddb9c29749114bfa8a8fb0ee6810 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 31 Mar 2021 23:56:29 -0400 Subject: [PATCH 19/32] try optional --- pandas/_libs/groupby.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5133e40cae66e..bbde73426a8c1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1230,7 +1230,7 @@ def group_min(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin_max(groupby_t[:, ::1] out, +cdef group_cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[:] labels, @@ -1268,9 +1268,11 @@ def group_cummin_max(groupby_t[:, ::1] out, groupby_t val, mval ndarray[groupby_t, ndim=2] accum intp_t lab + uint8_t[:, ::1] mask_ bint val_is_nan, use_mask use_mask = mask is not None + mask_ = mask if use_mask else np.zeros_like(values) N, K = (values).shape accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) From 8247f827ed043344de0a5c8fc3302573d864d3e3 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 1 Apr 2021 00:09:12 -0400 Subject: [PATCH 20/32] WIP --- pandas/_libs/groupby.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index bbde73426a8c1..97863735335f3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1272,7 +1272,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, bint val_is_nan, use_mask use_mask = mask is not None - mask_ = mask if use_mask else np.zeros_like(values) + mask_ = mask if use_mask else np.zeros_like(values, dtype=np.uint8) N, K = (values).shape accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) @@ -1293,7 +1293,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, val_is_nan = False if use_mask: - if mask[i, j]: + if mask_[i, j]: # `out` does not need to be set since it # will be masked anyway From 71e1c4fa5d77b596411468f7ab4459f86e760372 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 1 Apr 2021 11:56:26 -0400 Subject: [PATCH 21/32] Use more contiguity --- pandas/_libs/groupby.pyx | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 97863735335f3..bafb555731aa2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1231,12 +1231,12 @@ def group_min(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) cdef group_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, - uint8_t[:, ::1] mask, - const intp_t[:] labels, - int ngroups, - bint is_datetimelike, - bint compute_max): + ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[:] labels, + int ngroups, + bint is_datetimelike, + bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1246,9 +1246,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Array to store cummin/max in. values : array Values to take cummin/max of. - mask : array[uint8_t] - If `use_mask`, then indices represent missing values, - otherwise will be passed as a zeroed array + mask : array[uint8_t] or None + If not None, indices represent missing values, + otherwise the mask will not be used labels : np.ndarray[np.intp] Labels to group by. ngroups : int @@ -1266,16 +1266,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - ndarray[groupby_t, ndim=2] accum + groupby_t[:, ::1] accum intp_t lab - uint8_t[:, ::1] mask_ bint val_is_nan, use_mask use_mask = mask is not None - mask_ = mask if use_mask else np.zeros_like(values, dtype=np.uint8) N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order='C') if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: @@ -1293,7 +1291,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, val_is_nan = False if use_mask: - if mask_[i, j]: + if mask[i, j]: # `out` does not need to be set since it # will be masked anyway From c6cf9ee54b77e5b18e89f272e5cf4d129ea2d0fd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 1 Apr 2021 12:30:32 -0400 Subject: [PATCH 22/32] Shrink benchmark --- asv_bench/benchmarks/groupby.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d7e73b3cfcca3..9eee54ac6e317 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -501,15 +501,21 @@ class CumminMax: ] def setup(self, dtype, method): - N = 1_000_000 - vals = np.random.randint(0, 1000, (N, 10)) + N = 500_000 + rg = np.random.default_rng(seed=0) + vals = rg.integers( + 0, + 1000, + (N, 5), + ) null_vals = vals.astype(float, copy=True) null_vals[::2, :] = np.nan null_vals[::3, :] = np.nan - df = DataFrame(vals, columns=list("abcdefghij"), dtype=dtype) - null_df = DataFrame(null_vals, columns=list("abcdefghij"), dtype=dtype) - df["key"] = np.random.randint(0, 100, size=N) - null_df["key"] = np.random.randint(0, 100, size=N) + df = DataFrame(vals, columns=list("abcde"), dtype=dtype) + null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + keys = rg.integers(0, 100, size=N) + df["key"] = keys + null_df["key"] = keys self.df = df self.null_df = null_df From 293dc6eadc1d4cb89e58e67c740a1d490ed27cbc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 1 Apr 2021 22:30:55 -0400 Subject: [PATCH 23/32] Revert unrelated --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 180917855a9cc..43b39a902452f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -758,7 +758,7 @@ def _cython_operation( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": - counts = np.zeros(self.ngroups, dtype=np.int64) + counts = np.zeros(ngroups, dtype=np.int64) func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count From 1bb344e83d8bd9d9aff8c5108a57e3284c6b2b8a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 12 Apr 2021 11:22:44 -0400 Subject: [PATCH 24/32] Remove merge conflict relic --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d5dc26b0dac43..7573f73cb395c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -74,7 +74,6 @@ BaseMaskedArray, BaseMaskedDtype, ) -from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame From 97d9eea32eb76d4f6d242baf297fab3a55a845f8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Tue, 13 Apr 2021 11:09:28 -0400 Subject: [PATCH 25/32] Update doc/source/whatsnew/v1.3.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 30182878b6a6b..d43e6b3b0941c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -585,7 +585,7 @@ Performance improvements - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) <<<<<<< HEAD -- Performance improvement in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` when using methods ``cummin`` and ``cummax`` with nullable data types (:issue:`37493`) +- Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) - ======= - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) From 892a92a1a32458827c80fddb257bc4bab8098869 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Tue, 13 Apr 2021 11:09:37 -0400 Subject: [PATCH 26/32] Update doc/source/whatsnew/v1.3.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d43e6b3b0941c..71181f5740377 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -787,7 +787,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`) - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`) -- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` computing wrong result for methods ``cummin`` and ``cummax`` with nullable data types too large to roundtrip when casting to float (:issue:`37493`) +- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`) Reshaping From a239a68b90e2e9d9f0faea568886341ac2be6ecd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Tue, 13 Apr 2021 11:30:41 -0400 Subject: [PATCH 27/32] Update pandas/core/groupby/ops.py Co-authored-by: Joris Van den Bossche --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7573f73cb395c..177b2e85a1193 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -639,7 +639,7 @@ def _masked_ea_wrap_cython_operation( mask = isna(values).copy() arr = values._data - if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): + if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype): # IntegerArray or BooleanArray arr = ensure_int_or_float(arr) From a1422ba54663f083e98f33532c0b7993740f737a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 13 Apr 2021 12:02:16 -0400 Subject: [PATCH 28/32] Address comments --- doc/source/whatsnew/v1.3.0.rst | 5 +---- pandas/_libs/groupby.pyx | 4 ++-- pandas/core/groupby/ops.py | 9 +++------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fcb5759e9460c..19dcd2b44edae 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -589,12 +589,9 @@ Performance improvements - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) -<<<<<<< HEAD +- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) - -======= -- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) ->>>>>>> origin/master .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 895629a30b2e8..5b6390fa1896a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1254,7 +1254,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Array to store cummin/max in. values : np.ndarray[groupby_t, ndim=2] Values to take cummin/max of. - mask : array[uint8_t] or None + mask : np.ndarray[bool] or None If not None, indices represent missing values, otherwise the mask will not be used labels : np.ndarray[np.intp] @@ -1281,7 +1281,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, use_mask = mask is not None N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype, order='C') + accum = np.empty((ngroups, K), dtype=values.dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 177b2e85a1193..efff3d8e07976 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -634,9 +634,8 @@ def _masked_ea_wrap_cython_operation( """ orig_values = values - # isna just directly returns self._mask, so copy here to prevent - # modifying the original - mask = isna(values).copy() + # Copy to ensure input and result masks don't end up shared + mask = values._mask.copy() arr = values._data if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype): @@ -650,9 +649,7 @@ def _masked_ea_wrap_cython_operation( assert isinstance(dtype, BaseMaskedDtype) cls = dtype.construct_array_type() - return cls( - res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=False) - ) + return cls(res_values.astype(dtype.type, copy=False), mask) @final def _cython_operation( From 482a209cacedf4ce758ae6a94d0b4dd4e88bbfdd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 13 Apr 2021 12:31:22 -0400 Subject: [PATCH 29/32] Change random generation style --- asv_bench/benchmarks/groupby.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 9eee54ac6e317..3094d38f20fa0 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -502,18 +502,13 @@ class CumminMax: def setup(self, dtype, method): N = 500_000 - rg = np.random.default_rng(seed=0) - vals = rg.integers( - 0, - 1000, - (N, 5), - ) + vals = np.random.randint(-10, 10, (N, 5)) null_vals = vals.astype(float, copy=True) null_vals[::2, :] = np.nan null_vals[::3, :] = np.nan df = DataFrame(vals, columns=list("abcde"), dtype=dtype) null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) - keys = rg.integers(0, 100, size=N) + keys = np.random.randint(0, 100, size=N) df["key"] = keys null_df["key"] = keys self.df = df From 251c02af59360632e5f3b5aa9ba8f1afe1466dbf Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 17 Apr 2021 20:51:41 -0400 Subject: [PATCH 30/32] Use conditional instead of partial --- pandas/core/groupby/ops.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 473bf4ba5c0bb..f4a8809e54c30 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -9,7 +9,6 @@ import collections import functools -from functools import partial from typing import ( Generic, Hashable, @@ -735,8 +734,6 @@ def _cython_operation( out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) - if func_uses_mask: - func = partial(func, mask=mask) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) @@ -755,7 +752,18 @@ def _cython_operation( func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count - func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) + if func_uses_mask: + func( + result, + values, + comp_ids, + ngroups, + is_datetimelike, + mask=mask, + **kwargs, + ) + else: + func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if kind == "aggregate": # i.e. counts is defined. Locations where count Date: Sun, 18 Apr 2021 00:41:19 -0400 Subject: [PATCH 31/32] Remove ensure_int_or_float --- pandas/core/groupby/ops.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f4a8809e54c30..1438e2cc590bf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -635,9 +635,8 @@ def _masked_ea_wrap_cython_operation( mask = values._mask.copy() arr = values._data - if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype): - # IntegerArray or BooleanArray - arr = ensure_int_or_float(arr) + if is_bool_dtype(arr.dtype): + arr = arr.astype("int64") res_values = self._cython_operation( kind, arr, how, axis, min_count, mask=mask, **kwargs From 237f86feb08e35841473a63ec0c74066235632d1 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 18 Apr 2021 00:55:01 -0400 Subject: [PATCH 32/32] Remove unnecessary condition --- pandas/core/groupby/ops.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1438e2cc590bf..6fa124bc5558f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -635,9 +635,6 @@ def _masked_ea_wrap_cython_operation( mask = values._mask.copy() arr = values._data - if is_bool_dtype(arr.dtype): - arr = arr.astype("int64") - res_values = self._cython_operation( kind, arr, how, axis, min_count, mask=mask, **kwargs )