diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a723983590650..f50041dcef466 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -846,6 +846,7 @@ ExtensionArray - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). - Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) +- Fixed bug where :class:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b1d318d4d9678..95bc334905891 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -93,6 +93,10 @@ def __repr__(self) -> str: def _is_boolean(self) -> bool: return True + @property + def _is_numeric(self) -> bool: + return True + def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] ) -> "BooleanArray": diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9865a7d28542d..4a8f7bbf0b414 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -312,12 +312,14 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: DtypeObj The desired dtype of the result. 
""" - d = { - (np.dtype(np.bool), "add"): np.dtype(np.int64), - (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), - (np.dtype(np.bool), "sum"): np.dtype(np.int64), - } - return d.get((dtype, how), dtype) + from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.integer import Int64Dtype + + if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(np.bool)): + return np.dtype(np.int64) + elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): + return Int64Dtype() + return dtype def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d7ec5c63e16b9..4a0fbf071f7f9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -485,7 +485,7 @@ def _cython_operation( values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): - values = ensure_float64(values) + values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 0b0bbd3a6dc48..725067951eeef 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -326,6 +326,23 @@ def test_in_numeric_groupby(self, data_for_grouping): tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 10]) + def test_groupby_sum_mincount(self, data_for_grouping, min_count): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = pd.DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = pd.DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + 
tm.assert_frame_equal(result, expected) + class TestNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, s, op_name, skipna):