From ac20e20a2e60a17bb08d9f5f654293ae74a38962 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 7 May 2020 14:07:02 -0500 Subject: [PATCH 1/5] BUG: Make nullable booleans numeric --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/boolean.py | 4 ++++ pandas/core/dtypes/cast.py | 14 ++++++++------ pandas/core/groupby/ops.py | 18 ++++++------------ pandas/tests/extension/test_boolean.py | 17 +++++++++++++++++ 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9c424f70b1ee0..703718d4a9c93 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -760,6 +760,7 @@ ExtensionArray - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). - Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) +- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) Other diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 685a9ec48228f..7fa15b5f333fe 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -101,6 +101,10 @@ def __repr__(self) -> str: def _is_boolean(self) -> bool: return True + @property + def _is_numeric(self) -> bool: + return True + def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] ) -> "BooleanArray": diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b91cfde45f079..a16b34a7009b4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -73,6 +73,8 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna +import pandas as pd + if TYPE_CHECKING: from pandas import Series from pandas.core.arrays import ExtensionArray # noqa: F401 @@ -313,12 +315,12 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: DtypeObj The desired dtype of the result. """ - d = { - (np.dtype(np.bool), "add"): np.dtype(np.int64), - (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), - (np.dtype(np.bool), "sum"): np.dtype(np.int64), - } - return d.get((dtype, how), dtype) + if how in ["add", "cumsum", "sum"]: + if dtype == np.dtype(np.bool): + return np.dtype(np.int64) + if isinstance(dtype, pd.BooleanDtype): + return pd.Int64Dtype() + return dtype def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 71d7a07aadf7f..5074815969677 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -11,6 +11,8 @@ import numpy as np +from pandas.core.dtypes.cast import maybe_cast_result + from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction @@ -484,7 +486,7 @@ def _cython_operation( values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): - values = ensure_float64(values) + values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition @@ -548,17 +550,6 @@ def _cython_operation( if mask.any(): result = result.astype("float64") result[mask] = np.nan - elif ( - how == "add" - and is_integer_dtype(orig_values.dtype) - and is_extension_array_dtype(orig_values.dtype) - ): - # We need this to ensure that Series[Int64Dtype].resample().sum() - # remains int64 dtype. - # Two options for avoiding this special case - # 1. mask-aware ops and avoid casting to float with NaN above - # 2. specify the result dtype when calling this method - result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 @@ -582,6 +573,9 @@ def _cython_operation( elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) + if is_extension_array_dtype(orig_values.dtype): + result = maybe_cast_result(result=result, obj=orig_values, how=how) + return result, names def aggregate( diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 0b0bbd3a6dc48..fe31cbdeae71d 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -326,6 +326,23 @@ def test_in_numeric_groupby(self, data_for_grouping): tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 10]) + def test_groupby_sum_mincount(self, data_for_grouping, min_count): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping,}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = pd.DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = pd.DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + class TestNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, s, op_name, skipna): From b68b57930ed1977741419ca93c3caa3f0b9d8029 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 7 May 2020 15:55:37 -0500 Subject: [PATCH 2/5] Fix --- pandas/tests/extension/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index fe31cbdeae71d..725067951eeef 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -328,7 +328,7 @@ def test_in_numeric_groupby(self, data_for_grouping): @pytest.mark.parametrize("min_count", [0, 10]) def test_groupby_sum_mincount(self, data_for_grouping, min_count): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping,}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("A").sum(min_count=min_count) if min_count == 0: expected = pd.DataFrame( From 1723f05c1c7003468694666e6825f0dbe636547e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 7 May 2020 16:18:20 -0500 Subject: [PATCH 3/5] Sort imports --- pandas/core/groupby/ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5074815969677..c3ddaa9246ed1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -11,8 +11,6 @@ import numpy as np -from pandas.core.dtypes.cast import maybe_cast_result - from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction @@ -20,6 +18,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, From 3a468ec1347299624ac10ef73a2847a147a24179 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 7 May 2020 18:36:15 -0500 Subject: [PATCH 4/5] Nit --- pandas/core/dtypes/cast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a16b34a7009b4..6211da763a605 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -315,11 +315,10 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: DtypeObj The desired dtype of the result. """ - if how in ["add", "cumsum", "sum"]: - if dtype == np.dtype(np.bool): - return np.dtype(np.int64) - if isinstance(dtype, pd.BooleanDtype): - return pd.Int64Dtype() + if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(np.bool)): + return np.dtype(np.int64) + if how in ["add", "cumsum", "sum"] and isinstance(dtype, pd.BooleanDtype): + return pd.Int64Dtype() return dtype From de7c3d49217160e61994a6992469719d2b6d32a1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 10 May 2020 10:59:16 -0500 Subject: [PATCH 5/5] Don't import pandas --- pandas/core/dtypes/cast.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 39c54b440cd4d..4a8f7bbf0b414 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -73,8 +73,6 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna -import pandas as pd - if TYPE_CHECKING: from pandas import Series from pandas.core.arrays import ExtensionArray # noqa: F401 @@ -314,10 +312,13 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: DtypeObj The desired dtype of the result. """ + from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.integer import Int64Dtype + if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(np.bool)): return np.dtype(np.int64) - if how in ["add", "cumsum", "sum"] and isinstance(dtype, pd.BooleanDtype): - return pd.Int64Dtype() + elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): + return Int64Dtype() return dtype