diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index bcc3edab4a349..32baec25bef2d 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -166,4 +166,18 @@ def time_division(self, fill_value): self.arr1 / self.arr2 +class MinMax: + + params = (["min", "max"], [0.0, np.nan]) + param_names = ["func", "fill_value"] + + def setup(self, func, fill_value): + N = 1_000_000 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_min_max(self, func, fill_value): + getattr(self.sp_arr, func)() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ff8f8b9977134..99bcc1f093a52 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -302,6 +302,7 @@ Performance improvements - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) +- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) - .. 
--------------------------------------------------------------------------- @@ -437,6 +438,7 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`) +- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`) - - diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 6ae216cd3263c..f715a4d47ab30 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1456,23 +1456,69 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def max(self, axis=0, *args, **kwargs): + def max(self, axis: int = 0, *args, **kwargs) -> Scalar: + """ + Max of non-NA/null values + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + *args, **kwargs + Not Used. NumPy compatibility. + + Returns + ------- + scalar + """ nv.validate_max(args, kwargs) + return self._min_max("max") - # This condition returns a nan if there are no valid values in the array. - if self.size > 0 and self._valid_sp_values.size == 0: - return self.fill_value - else: - return np.nanmax(self, axis) + def min(self, axis: int = 0, *args, **kwargs) -> Scalar: + """ + Min of non-NA/null values + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + *args, **kwargs + Not Used. NumPy compatibility. - def min(self, axis=0, *args, **kwargs): + Returns + ------- + scalar + """ nv.validate_min(args, kwargs) + return self._min_max("min") + + def _min_max(self, kind: Literal["min", "max"]) -> Scalar: + """ + Min/max of non-NA/null values - # This condition returns a nan if there are no valid values in the array. 
- if self.size > 0 and self._valid_sp_values.size == 0: + Parameters + ---------- + kind : {"min", "max"} + + Returns + ------- + scalar + """ + valid_vals = self._valid_sp_values + has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0 + if len(valid_vals) > 0: + sp_min_max = getattr(valid_vals, kind)() + + # If a non-null fill value is currently present, it might be the min/max + if has_nonnull_fill_vals: + func = max if kind == "max" else min + return func(sp_min_max, self.fill_value) + else: + return sp_min_max + elif has_nonnull_fill_vals: return self.fill_value else: - return np.nanmin(self, axis) + return na_value_for_dtype(self.dtype.subtype) # ------------------------------------------------------------------------ # Ufuncs diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c476afb97446e..8c64c5bb3a055 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1362,26 +1362,54 @@ def test_drop_duplicates_fill_value(): class TestMinMax: - plain_data = np.arange(5).astype(float) - data_neg = plain_data * (-1) - data_NaN = SparseArray(np.array([0, 1, 2, np.nan, 4])) - data_all_NaN = SparseArray(np.array([np.nan, np.nan, np.nan, np.nan, np.nan])) - data_NA_filled = SparseArray( - np.array([np.nan, np.nan, np.nan, np.nan, np.nan]), fill_value=5 - ) - @pytest.mark.parametrize( "raw_data,max_expected,min_expected", [ - (plain_data, [4], [0]), - (data_neg, [0], [-4]), - (data_NaN, [4], [0]), - (data_all_NaN, [np.nan], [np.nan]), - (data_NA_filled, [5], [5]), + (np.arange(5.0), [4], [0]), + (-np.arange(5.0), [0], [-4]), + (np.array([0, 1, 2, np.nan, 4]), [4], [0]), + (np.array([np.nan] * 5), [np.nan], [np.nan]), + (np.array([]), [np.nan], [np.nan]), ], ) - def test_maxmin(self, raw_data, max_expected, min_expected): + def test_nan_fill_value(self, raw_data, max_expected, min_expected): max_result = SparseArray(raw_data).max() min_result = 
SparseArray(raw_data).min() assert max_result in max_expected assert min_result in min_expected + + @pytest.mark.parametrize( + "fill_value,max_expected,min_expected", + [ + (100, 100, 0), + (-100, 1, -100), + ], + ) + def test_fill_value(self, fill_value, max_expected, min_expected): + arr = SparseArray( + np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value) + ) + max_result = arr.max() + assert max_result == max_expected + + min_result = arr.min() + assert min_result == min_expected + + @pytest.mark.parametrize("func", ["min", "max"]) + @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) + @pytest.mark.parametrize( + "dtype,expected", + [ + (SparseDtype(np.float64, np.nan), np.nan), + (SparseDtype(np.float64, 5.0), np.nan), + (SparseDtype("datetime64[ns]", pd.NaT), pd.NaT), + (SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT), + ], + ) + def test_na_value_if_no_valid_values(self, func, data, dtype, expected): + arr = SparseArray(data, dtype=dtype) + result = getattr(arr, func)() + if expected is pd.NaT: # NaT != NaT, so compare by identity + assert result is pd.NaT or np.isnat(result) + else: + assert np.isnan(result)