diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index 9ffcf25f6eacd..5a2005722c85c 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -132,6 +132,7 @@ def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ...
 def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ...
 def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ...
 def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ...
+def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ...
 def take_1d_int8_int8(
     values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
 ) -> None: ...
diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index 991566f9b7143..ce2e1ffbb5870 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -41,12 +41,12 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
           ('int16', 'INT16', 'int16'),
           ('int32', 'INT32', 'int32'),
           ('int64', 'INT64', 'int64'),
+          ('uint64', 'UINT64', 'uint64'),
           # Disabling uint and complex dtypes because we do not use them
-          # (and compiling them increases wheel size)
+          # (and compiling them increases wheel size) (except uint64)
           # ('uint8', 'UINT8', 'uint8'),
           # ('uint16', 'UINT16', 'uint16'),
           # ('uint32', 'UINT32', 'uint32'),
-          # ('uint64', 'UINT64', 'uint64'),
           # ('complex64', 'COMPLEX64', 'complex64'),
           # ('complex128', 'COMPLEX128', 'complex128')
 ]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index e4314edecfa7e..6e2b79a320dd7 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -513,14 +513,7 @@ ctypedef fused mean_t:
 
 ctypedef fused sum_t:
     mean_t
-    int8_t
-    int16_t
-    int32_t
     int64_t
-
-    uint8_t
-    uint16_t
-    uint32_t
     uint64_t
     object
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index be4d50af8a053..f0e4a54c3f05c 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -100,6 +100,7 @@ def ensure_float(arr):
 ensure_int8 = algos.ensure_int8
 ensure_platform_int = algos.ensure_platform_int
 ensure_object = algos.ensure_object
+ensure_uint64 = algos.ensure_uint64
 
 
 def ensure_str(value: bytes | Any) -> str:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7617ca5074c9c..caea70e03b6f3 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -46,6 +46,7 @@
     ensure_float64,
     ensure_int64,
     ensure_platform_int,
+    ensure_uint64,
     is_1d_only_ea_dtype,
     is_bool_dtype,
     is_complex_dtype,
@@ -224,6 +225,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
+            elif how == "sum":
+                # Avoid overflow during group op
+                if values.dtype.kind == "i":
+                    values = ensure_int64(values)
+                else:
+                    values = ensure_uint64(values)
+
         return values
 
     # TODO: general case implementation overridable by EAs.
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index a7c5b85e365ae..d290aada18293 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2829,3 +2829,16 @@ def test_groupby_sum_support_mask(any_numeric_ea_dtype):
         dtype=any_numeric_ea_dtype,
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
+def test_groupby_sum_overflow(val, dtype):
+    # GH#37493
+    df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
+    result = df.groupby("a").sum()
+    expected = DataFrame(
+        {"b": [val * 2]},
+        index=Index([1], name="a", dtype=f"{dtype}64"),
+        dtype=f"{dtype}64",
+    )
+    tm.assert_frame_equal(result, expected)