groupby.sum: widen small integer dtypes to 64-bit results (GH#37493)

* _libs/groupby.pyx: `group_sum` takes a second fused type, `sum_out_t`, for
  `out`, so the accumulator can be wider than `values`. The leading if/elif
  chain raises NotImplementedError for every mismatched pairing (float,
  complex and object must match exactly; signed values cannot go into a
  uint64 output, nor unsigned values into an int64 output); the summation
  itself runs in the final `else`.
* The float-like NaN handling now tests complex128_t as well as complex64_t
  (the previous condition listed complex64_t twice).
* core/groupby/ops.py: `_get_out_dtype` requests an 8-byte integer of the same
  kind for integer sums, and `_get_cython_function` also accepts the
  "object|object" signature key.
* tests: int8 inputs 111 + 111 must give 222.

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 3ec37718eb652..814cd223eb445 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -51,7 +51,7 @@ def group_any_all(
     skipna: bool,
 ) -> None: ...
 def group_sum(
-    out: np.ndarray,  # complexfloatingintuint_t[:, ::1]
+    out: np.ndarray,  # complexfloatingint64uint64_t[:, ::1]
     counts: np.ndarray,  # int64_t[::1]
     values: np.ndarray,  # ndarray[complexfloatingintuint_t, ndim=2]
     labels: np.ndarray,  # const intp_t[:]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index e4314edecfa7e..f24e0d8faffdc 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -524,11 +524,16 @@ ctypedef fused sum_t:
     uint64_t
     object
 
+ctypedef fused sum_out_t:
+    mean_t
+    int64_t
+    uint64_t
+    object
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_sum(
-    sum_t[:, ::1] out,
+    sum_out_t[:, ::1] out,
     int64_t[::1] counts,
     ndarray[sum_t, ndim=2] values,
     const intp_t[::1] labels,
@@ -542,55 +547,54 @@ def group_sum(
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        sum_t val, t, y
-        sum_t[:, ::1] sumx, compensation
+        sum_out_t val, t, y
+        sum_out_t[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
         bint uses_mask = mask is not None
         bint isna_entry
 
-    if len_values != len_labels:
-        raise ValueError("len(index) != len(labels)")
+    if (sum_out_t is float32_t and not sum_t is float32_t or
+            sum_t is float32_t and not sum_out_t is float32_t):
+        raise NotImplementedError  # pragma: no cover
 
-    nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    # the below is equivalent to `np.zeros_like(out)` but faster
-    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    elif (sum_out_t is float64_t and not sum_t is float64_t or
+            sum_t is float64_t and not sum_out_t is float64_t):
+        raise NotImplementedError  # pragma: no cover
 
-    N, K = (<object>values).shape
+    elif (sum_out_t is complex64_t and not sum_t is complex64_t or
+            sum_t is complex64_t and not sum_out_t is complex64_t):
+        raise NotImplementedError  # pragma: no cover
 
-    if sum_t is object:
-        # NB: this does not use 'compensation' like the non-object track does.
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
+    elif (sum_out_t is complex128_t and not sum_t is complex128_t or
+            sum_t is complex128_t and not sum_out_t is complex128_t):
+        raise NotImplementedError  # pragma: no cover
 
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
+    elif (sum_out_t is object and not sum_t is object or
+            sum_t is object and not sum_out_t is object):
+        raise NotImplementedError  # pragma: no cover
 
-                # not nan
-                if not checknull(val):
-                    nobs[lab, j] += 1
+    elif (sum_out_t is uint64_t and (
+            sum_t is int8_t or sum_t is int16_t or sum_t is int32_t or sum_t is int64_t)
+            or sum_out_t is int64_t and (
+            sum_t is uint8_t or sum_t is uint16_t or sum_t is uint32_t
+            or sum_t is uint64_t)):
+        raise NotImplementedError  # pragma: no cover
 
-                    if nobs[lab, j] == 1:
-                        # i.e. we haven't added anything yet; avoid TypeError
-                        #  if e.g. val is a str and sumx[lab, j] is 0
-                        t = val
-                    else:
-                        t = sumx[lab, j] + val
-                    sumx[lab, j] = t
+    else:
 
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] < min_count:
-                    out[i, j] = None
+        if len_values != len_labels:
+            raise ValueError("len(index) != len(labels)")
 
-                else:
-                    out[i, j] = sumx[i, j]
-    else:
-        with nogil:
+        nobs = np.zeros((<object>out).shape, dtype=np.int64)
+        # the below is equivalent to `np.zeros_like(out)` but faster
+        sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+        compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+
+        N, K = (<object>values).shape
+
+        if sum_t is object:
+            # NB: this does not use 'compensation' like the non-object track does.
             for i in range(N):
                 lab = labels[i]
                 if lab < 0:
@@ -601,49 +605,79 @@ def group_sum(
                     val = values[i, j]
 
                     # not nan
-                    # With dt64/td64 values, values have been cast to float64
-                    #  instead if int64 for group_sum, but the logic
-                    #  is otherwise the same as in _treat_as_na
-                    if uses_mask:
-                        isna_entry = mask[i, j]
-                    elif (sum_t is float32_t or sum_t is float64_t
-                        or sum_t is complex64_t or sum_t is complex64_t):
-                        # avoid warnings because of equality comparison
-                        isna_entry = not val == val
-                    elif sum_t is int64_t and is_datetimelike and val == NPY_NAT:
-                        isna_entry = True
-                    else:
-                        isna_entry = False
-
-                    if not isna_entry:
+                    if not checknull(val):
                         nobs[lab, j] += 1
-                        y = val - compensation[lab, j]
-                        t = sumx[lab, j] + y
-                        compensation[lab, j] = t - sumx[lab, j] - y
+
+                        if nobs[lab, j] == 1:
+                            # i.e. we haven't added anything yet; avoid TypeError
+                            #  if e.g. val is a str and sumx[lab, j] is 0
+                            t = val
+                        else:
+                            t = sumx[lab, j] + val
                         sumx[lab, j] = t
 
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
-                        # if we are integer dtype, not is_datetimelike, and
-                        #  not uses_mask, then getting here implies that
-                        #  counts[i] < min_count, which means we will
-                        #  be cast to float64 and masked at the end
-                        #  of WrappedCythonOp._call_cython_op. So we can safely
-                        #  set a placeholder value in out[i, j].
+                        out[i, j] = None
+
+                    else:
+                        out[i, j] = sumx[i, j]
+        else:
+            with nogil:
+                for i in range(N):
+                    lab = labels[i]
+                    if lab < 0:
+                        continue
+
+                    counts[lab] += 1
+                    for j in range(K):
+                        val = values[i, j]
+
+                        # not nan
+                        # With dt64/td64 values, values have been cast to float64
+                        #  instead of int64 for group_sum, but the logic
+                        #  is otherwise the same as in _treat_as_na
                         if uses_mask:
-                            result_mask[i, j] = True
+                            isna_entry = mask[i, j]
                         elif (sum_t is float32_t or sum_t is float64_t
-                            or sum_t is complex64_t or sum_t is complex64_t):
-                            out[i, j] = NAN
-                        elif sum_t is int64_t:
-                            out[i, j] = NPY_NAT
+                            or sum_t is complex64_t or sum_t is complex128_t):
+                            # avoid warnings because of equality comparison
+                            isna_entry = not val == val
+                        elif sum_t is int64_t and is_datetimelike and val == NPY_NAT:
+                            isna_entry = True
                         else:
-                            # placeholder, see above
-                            out[i, j] = 0
+                            isna_entry = False
+
+                        if not isna_entry:
+                            nobs[lab, j] += 1
+                            y = val - compensation[lab, j]
+                            t = sumx[lab, j] + y
+                            compensation[lab, j] = t - sumx[lab, j] - y
+                            sumx[lab, j] = t
+
+                for i in range(ncounts):
+                    for j in range(K):
+                        if nobs[i, j] < min_count:
+                            # if we are integer dtype, not is_datetimelike, and
+                            #  not uses_mask, then getting here implies that
+                            #  counts[i] < min_count, which means we will
+                            #  be cast to float64 and masked at the end
+                            #  of WrappedCythonOp._call_cython_op. So we can safely
+                            #  set a placeholder value in out[i, j].
+                            if uses_mask:
+                                result_mask[i, j] = True
+                            elif (sum_t is float32_t or sum_t is float64_t
+                                or sum_t is complex64_t or sum_t is complex128_t):
+                                out[i, j] = NAN
+                            elif sum_t is int64_t:
+                                out[i, j] = NPY_NAT
+                            else:
+                                # placeholder, see above
+                                out[i, j] = 0
 
-                    else:
-                        out[i, j] = sumx[i, j]
+                        else:
+                            out[i, j] = sumx[i, j]
 
 
 @cython.wraparound(False)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7617ca5074c9c..167ca65ff4d73 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -183,7 +183,10 @@ def _get_cython_function(
                     f"function is not implemented for this dtype: "
                     f"[how->{how},dtype->{dtype_str}]"
                 )
-            elif "object" not in f.__signatures__:
+            elif (
+                "object" not in f.__signatures__
+                and "object|object" not in f.__signatures__
+            ):
                 # raise NotImplementedError here rather than TypeError later
                 raise NotImplementedError(
                     f"function is not implemented for this dtype: "
@@ -293,6 +296,8 @@ def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
 
         if how == "rank":
             out_dtype = "float64"
+        elif how == "sum" and is_integer_dtype(dtype):
+            out_dtype = f"{dtype.kind}8"
         else:
             if is_numeric_dtype(dtype):
                 out_dtype = f"{dtype.kind}{dtype.itemsize}"
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index a7c5b85e365ae..ade2b1ebbecfe 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2829,3 +2829,11 @@ def test_groupby_sum_support_mask(any_numeric_ea_dtype):
         dtype=any_numeric_ea_dtype,
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_sum_int8_overflow():
+    # GH#37493
+    df = DataFrame({"a": [1, 2, 2], "b": [125, 111, 111]}, dtype="int8")
+    result = df.groupby("a").sum()
+    expected = DataFrame({"b": [125, 222]}, index=Index([1, 2], name="a"))
+    tm.assert_frame_equal(result, expected)