diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19e8acdaa7384..d232b68e3c450 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,7 +274,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). +- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 0fb2605b554c2..b3723340cefd6 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -45,3 +45,44 @@ def sum( return np.sum(values[~mask]) else: return np.sum(values, where=~mask) + + +def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): + """ + Reduction for 1D masked array. + + Parameters + ---------- + func : np.min or np.max + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + if values.size: + return func(values) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + else: + subset = values[~mask] + if subset.size: + return func(values[~mask]) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + + +def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.min, values=values, mask=mask, skipna=skipna) + + +def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.max, values=values, mask=mask, skipna=skipna) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 442d4ca8cef6d..e85534def6b97 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -696,8 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name == "sum": - return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + if name in {"sum", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed if self._hasna: @@ -715,9 +716,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if int_result == result: result = int_result - elif name in ["min", "max"] and notna(result): - result = np.bool_(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4f3c68aa03b16..541ecdeb45830 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -561,8 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name == "sum": - return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + if name in {"sum", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) @@ -581,7 +582,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["min", "max", "prod"]: + elif name == "prod": # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index ee1ec86745246..515013e95c717 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -34,7 +34,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op == "sum": + if op in {"sum", "min", "max"}: assert isinstance(result, np.int64) else: assert isinstance(result, int) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 962b105d1e8fc..8fb035e085d40 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -65,27 +65,58 @@ def test_ops(self, opname, obj): assert result.value == expected @pytest.mark.parametrize("opname", ["max", "min"]) - def test_nanops(self, opname, index_or_series): + @pytest.mark.parametrize( + "dtype, val", + [ + ("object", 2.0), + ("float64", 2.0), + ("datetime64[ns]", datetime(2011, 11, 1)), + ("Int64", 2), + ("boolean", True), + ], + ) + def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - arg_op = "arg" + opname if klass is Index else "idx" + opname - obj = klass([np.nan, 2.0]) - assert getattr(obj, opname)() == 2.0 + if dtype in ["Int64", "boolean"] and klass == pd.Index: + pytest.skip("EAs can't yet be stored in an index") - obj = klass([np.nan]) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + def check_missing(res): + if dtype == "datetime64[ns]": + return res is pd.NaT + elif dtype == "Int64": + return res is pd.NA + else: + return pd.isna(res) - obj = klass([], dtype=object) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + obj = klass([None], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) - obj = klass([pd.NaT, datetime(2011, 11, 1)]) - # check DatetimeIndex monotonic path - assert getattr(obj, opname)() == datetime(2011, 11, 1) - assert getattr(obj, opname)(skipna=False) is pd.NaT + obj = klass([], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) + + if dtype == "object": + # generic test with object only works for empty / all NaN + return + + obj = klass([None, val], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + obj = klass([None, val, None], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + + @pytest.mark.parametrize("opname", ["max", "min"]) + def test_nanargminmax(self, opname, index_or_series): + # GH#7261 + klass = index_or_series + arg_op = "arg" + opname if klass is Index else "idx" + opname + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: @@ -95,9 +126,6 @@ def test_nanops(self, opname, index_or_series): obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) # check DatetimeIndex non-monotonic path - assert getattr(obj, opname)(), datetime(2011, 11, 1) - assert getattr(obj, opname)(skipna=False) is pd.NaT - assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: