Skip to content

Commit 79f7762

Browse files
troels authored and WillAyd committed
BUG SeriesGroupBy.mean() overflowed on some integer array (pandas-dev#22653)
1 parent 2670494 commit 79f7762

File tree

4 files changed

+39
-1
lines changed

4 files changed

+39
-1
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Groupby/Resample/Rolling
768768
- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`).
769769
- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`).
770770
- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
771+
- Bug in :meth:`SeriesGroupBy.mean` where integer values too large to fit inside of int64 overflowed instead of being averaged correctly (:issue:`22487`).
771772

772773
Sparse
773774
^^^^^^

pandas/core/dtypes/common.py

+27
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,33 @@ def ensure_categorical(arr):
9090
return arr
9191

9292

93+
def ensure_int64_or_float64(arr, copy=False):
    """
    Ensure that an array of some integer dtype has an int64 dtype if possible.

    If the conversion is not possible, potentially because of overflow,
    convert the array to float64 instead.

    Parameters
    ----------
    arr : array-like
        The array whose data type we want to enforce. It must provide an
        ``astype`` method that accepts the ``copy`` and ``casting`` keywords.
    copy : bool, default False
        Whether to copy the original array or reuse
        it in place, if possible.

    Returns
    -------
    out_arr : array-like
        The input array cast as int64 if possible without overflow.
        Otherwise the input array cast to float64.
    """
    try:
        # casting='safe' makes the conversion fail with TypeError rather than
        # silently wrap values that cannot be represented as int64.
        return arr.astype('int64', copy=copy, casting='safe')
    except TypeError:
        # Fall back to float64 so the values do not overflow.
        return arr.astype('float64', copy=copy)
118+
119+
93120
def is_object_dtype(arr_or_dtype):
94121
"""
95122
Check whether an array-like or dtype is of the object dtype.

pandas/core/groupby/ops.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ensure_float64,
2424
ensure_platform_int,
2525
ensure_int64,
26+
ensure_int64_or_float64,
2627
ensure_object,
2728
needs_i8_conversion,
2829
is_integer_dtype,
@@ -471,7 +472,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
471472
if (values == iNaT).any():
472473
values = ensure_float64(values)
473474
else:
474-
values = values.astype('int64', copy=False)
475+
values = ensure_int64_or_float64(values)
475476
elif is_numeric and not is_complex_dtype(values):
476477
values = ensure_float64(values)
477478
else:

pandas/tests/groupby/test_function.py

+9
Original file line numberDiff line numberDiff line change
@@ -1125,3 +1125,12 @@ def h(df, arg3):
11251125
expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
11261126

11271127
tm.assert_series_equal(result, expected)
1128+
1129+
1130+
def test_groupby_mean_no_overflow():
    # Regression test for (#22487)
    # The last value exceeds the int64 maximum, so computing the group mean
    # through an int64 cast would overflow.
    df = pd.DataFrame({
        "user": ["A", "A", "A", "A", "A"],
        "connections": [4970, 4749, 4719, 4704, 18446744073699999744]
    })
    assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840

0 commit comments

Comments
 (0)