BUG/PERF: sparse min/max don't densify #43527

Merged: 7 commits, Sep 12, 2021
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/sparse.py
@@ -166,4 +166,18 @@ def time_division(self, fill_value):
self.arr1 / self.arr2


class MinMax:

params = (["min", "max"], [0.0, np.nan])
param_names = ["func", "fill_value"]

def setup(self, func, fill_value):
N = 1_000_000
arr = make_array(N, 1e-5, fill_value, np.float64)
self.sp_arr = SparseArray(arr, fill_value=fill_value)

def time_min_max(self, func, fill_value):
getattr(self.sp_arr, func)()


from .pandas_vb_common import setup # noqa: F401 isort:skip
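For a sense of what this benchmark exercises outside the asv harness, here is a rough standalone sketch (illustrative only; the density, sizes, and timing approach are assumptions, and make_array is the helper already defined in this benchmark file, approximated inline here):

import timeit

import numpy as np
from pandas.arrays import SparseArray

# Build a mostly-NaN array with roughly 1e-5 density of stored values, similar
# in spirit to make_array(N, 1e-5, np.nan, np.float64) used in setup() above.
N = 1_000_000
rng = np.random.default_rng(0)
arr = np.full(N, np.nan)
idx = rng.choice(N, size=int(N * 1e-5), replace=False)
arr[idx] = rng.random(len(idx))
sp_arr = SparseArray(arr, fill_value=np.nan)

# With this patch, min/max operate on the stored sparse values directly
# instead of densifying the full array first.
print(timeit.timeit(sp_arr.max, number=100))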
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -302,6 +302,7 @@ Performance improvements
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
-

.. ---------------------------------------------------------------------------
@@ -437,6 +438,7 @@ Reshaping
Sparse
^^^^^^
- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`)
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
-
-

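As an illustration of the Sparse entry above (a sketch, not part of this diff): reducing a SparseArray with zero non-null elements previously raised ValueError and now returns the NA value instead.

import numpy as np
from pandas.arrays import SparseArray

empty = SparseArray(np.array([]))
empty.max()  # nan (previously raised ValueError, see issue 43527)
empty.min()  # nan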
66 changes: 56 additions & 10 deletions pandas/core/arrays/sparse/array.py
@@ -1456,23 +1456,69 @@ def mean(self, axis=0, *args, **kwargs):
nsparse = self.sp_index.ngaps
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, axis: int = 0, *args, **kwargs) -> Scalar:
        """
        Max of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_max(args, kwargs)
        return self._min_max("max")

    def min(self, axis: int = 0, *args, **kwargs) -> Scalar:
        """
        Min of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_min(args, kwargs)
        return self._min_max("min")

    def _min_max(self, kind: Literal["min", "max"]) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            else:
                return sp_min_max
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype)

Inline review comment (Contributor): might as well just do elif/else
Reply (Member Author): thanks, updated

# ------------------------------------------------------------------------
# Ufuncs
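To make the fill-value handling in _min_max concrete, here is a small usage sketch (illustrative only, not part of the diff; it mirrors the test_fill_value case added below). When the fill value is non-null and the array has gaps, the fill value competes with the stored sparse values for the min/max, so no densification is needed:

import numpy as np
from pandas import SparseDtype
from pandas.arrays import SparseArray

arr = SparseArray(np.array([100, 0, 1]), dtype=SparseDtype("int", 100))
arr.max()  # 100 -- the non-null fill value beats the stored values 0 and 1
arr.min()  # 0   -- the smallest stored sparse value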
56 changes: 42 additions & 14 deletions pandas/tests/arrays/sparse/test_array.py
@@ -1362,26 +1362,54 @@ def test_drop_duplicates_fill_value():


class TestMinMax:
@pytest.mark.parametrize(
"raw_data,max_expected,min_expected",
[
(np.arange(5.0), [4], [0]),
(-np.arange(5.0), [0], [-4]),
(np.array([0, 1, 2, np.nan, 4]), [4], [0]),
(np.array([np.nan] * 5), [np.nan], [np.nan]),
(np.array([]), [np.nan], [np.nan]),
],
)
def test_nan_fill_value(self, raw_data, max_expected, min_expected):
max_result = SparseArray(raw_data).max()
min_result = SparseArray(raw_data).min()
assert max_result in max_expected
assert min_result in min_expected

@pytest.mark.parametrize(
"fill_value,max_expected,min_expected",
[
(100, 100, 0),
(-100, 1, -100),
],
)
def test_fill_value(self, fill_value, max_expected, min_expected):
arr = SparseArray(
np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
)
max_result = arr.max()
assert max_result == max_expected

min_result = arr.min()
assert min_result == min_expected

@pytest.mark.parametrize("func", ["min", "max"])
@pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
@pytest.mark.parametrize(
"dtype,expected",
[
(SparseDtype(np.float64, np.nan), np.nan),
(SparseDtype(np.float64, 5.0), np.nan),
(SparseDtype("datetime64[ns]", pd.NaT), pd.NaT),
(SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT),
],
)
def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
arr = SparseArray(data, dtype=dtype)
result = getattr(arr, func)()
        if expected is pd.NaT:
            assert result is pd.NaT
else:
assert np.isnan(result)
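As a brief interactive counterpart to test_na_value_if_no_valid_values (illustrative only, assuming the behavior added in this PR): with no valid values, the reduction returns the NA value for the subtype, so a datetime-backed sparse array yields NaT rather than nan.

import numpy as np
import pandas as pd
from pandas.arrays import SparseArray

dt_arr = SparseArray(np.array([]), dtype=pd.SparseDtype("datetime64[ns]", pd.NaT))
dt_arr.min()  # NaT

float_arr = SparseArray(np.array([np.nan, np.nan]), dtype=pd.SparseDtype(np.float64, np.nan))
float_arr.max()  # nan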