REGR: Avoid overflow with groupby sum (pandas-dev#48059)

phofl · noatamir · commit 1893b105e07f · 2022-11-09T22:58:14.000+01:00
* REGR: Avoid overflow with groupby sum

* Add comment
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
@@ -132,6 +132,7 @@ def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ...
 def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ...
 def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ...
 def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ...
+def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ...
 def take_1d_int8_int8(
     values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
 ) -> None: ...
diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
@@ -41,12 +41,12 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
           ('int16', 'INT16', 'int16'),
           ('int32', 'INT32', 'int32'),
           ('int64', 'INT64', 'int64'),
+          ('uint64', 'UINT64', 'uint64'),
           # Disabling uint and complex dtypes because we do not use them
-          #  (and compiling them increases wheel size)
+          #  (and compiling them increases wheel size); uint64 is the exception
           # ('uint8', 'UINT8', 'uint8'),
           # ('uint16', 'UINT16', 'uint16'),
           # ('uint32', 'UINT32', 'uint32'),
-          # ('uint64', 'UINT64', 'uint64'),
           # ('complex64', 'COMPLEX64', 'complex64'),
           # ('complex128', 'COMPLEX128', 'complex128')
 ]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -513,14 +513,7 @@ ctypedef fused mean_t:
 
 ctypedef fused sum_t:
     mean_t
-    int8_t
-    int16_t
-    int32_t
     int64_t
-
-    uint8_t
-    uint16_t
-    uint32_t
     uint64_t
     object
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -100,6 +100,7 @@ def ensure_float(arr):
 ensure_int8 = algos.ensure_int8
 ensure_platform_int = algos.ensure_platform_int
 ensure_object = algos.ensure_object
+ensure_uint64 = algos.ensure_uint64
 
 
 def ensure_str(value: bytes | Any) -> str:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -46,6 +46,7 @@
     ensure_float64,
     ensure_int64,
     ensure_platform_int,
+    ensure_uint64,
     is_1d_only_ea_dtype,
     is_bool_dtype,
     is_complex_dtype,
@@ -224,6 +225,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
+            elif how == "sum":
+                # Avoid overflow during group op
+                if values.dtype.kind == "i":
+                    values = ensure_int64(values)
+                else:
+                    values = ensure_uint64(values)
+
         return values
 
     # TODO: general case implementation overridable by EAs.
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2829,3 +2829,16 @@ def test_groupby_sum_support_mask(any_numeric_ea_dtype):
         dtype=any_numeric_ea_dtype,
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
+def test_groupby_sum_overflow(val, dtype):
+    # GH#37493 8-bit input whose sum overflows 8 bits; expect a 64-bit result
+    df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
+    result = df.groupby("a").sum()
+    expected = DataFrame(
+        {"b": [val * 2]},
+        index=Index([1], name="a", dtype=f"{dtype}64"),
+        dtype=f"{dtype}64",
+    )
+    tm.assert_frame_equal(result, expected)