diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9c424f70b1ee0..aeb0200bca3f2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -714,6 +714,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned instead of ``pd.NA`` when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) - Bug in :meth:`SeriesGroupBy.quantile` raising on nullable integers (:issue:`33136`) - Bug in :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` returning floats when applied to nullable Booleans (:issue:`33071`) - Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 71d7a07aadf7f..fe79812e60b6d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,6 +18,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, @@ -548,17 +549,6 @@ def _cython_operation( if mask.any(): result = result.astype("float64") result[mask] = np.nan - elif ( - how == "add" - and is_integer_dtype(orig_values.dtype) - and is_extension_array_dtype(orig_values.dtype) - ): - # We need this to ensure that Series[Int64Dtype].resample().sum() - # remains int64 dtype.
- # Two options for avoiding this special case - # 1. mask-aware ops and avoid casting to float with NaN above - # 2. specify the result dtype when calling this method - result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 @@ -582,6 +572,9 @@ def _cython_operation( elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) + if is_extension_array_dtype(orig_values.dtype): + result = maybe_cast_result(result=result, obj=orig_values, how=how) + return result, names def aggregate( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 93dd1bf23c308..fe4ab21b4b348 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1639,3 +1639,20 @@ def test_apply_to_nullable_integer_returns_float(values, function): result = groups.agg([function]) expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], dtype=object, name="a") + + result = grouped["b"].sum(min_count=2) + expected = pd.Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = pd.DataFrame( + {"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx + ) + tm.assert_frame_equal(result, expected)