Skip to content

Commit a882546

Browse files
mzeitlin11 authored and AlexeyGy committed
BUG/PERF: sparse min/max don't densify (pandas-dev#43527)
1 parent 8cfda99 commit a882546

File tree

4 files changed

+114
-24
lines changed

4 files changed

+114
-24
lines changed

asv_bench/benchmarks/sparse.py

+14
Original file line number | Diff line number | Diff line change
@@ -166,4 +166,18 @@ def time_division(self, fill_value):
166166
self.arr1 / self.arr2
167167

168168

169+
class MinMax:
170+
171+
params = (["min", "max"], [0.0, np.nan])
172+
param_names = ["func", "fill_value"]
173+
174+
def setup(self, func, fill_value):
175+
N = 1_000_000
176+
arr = make_array(N, 1e-5, fill_value, np.float64)
177+
self.sp_arr = SparseArray(arr, fill_value=fill_value)
178+
179+
def time_min_max(self, func, fill_value):
180+
getattr(self.sp_arr, func)()
181+
182+
169183
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.4.0.rst

+2
Original file line number | Diff line number | Diff line change
@@ -302,6 +302,7 @@ Performance improvements
302302
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
303303
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
304304
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
305+
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
305306
-
306307

307308
.. ---------------------------------------------------------------------------
@@ -437,6 +438,7 @@ Reshaping
437438
Sparse
438439
^^^^^^
439440
- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`)
441+
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
440442
-
441443
-
442444

pandas/core/arrays/sparse/array.py

+56-10
Original file line number | Diff line number | Diff line change
@@ -1456,23 +1456,69 @@ def mean(self, axis=0, *args, **kwargs):
14561456
nsparse = self.sp_index.ngaps
14571457
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
14581458

1459-
def max(self, axis=0, *args, **kwargs):
1459+
def max(self, axis: int = 0, *args, **kwargs) -> Scalar:
1460+
"""
1461+
Max of non-NA/null values
1462+
1463+
Parameters
1464+
----------
1465+
axis : int, default 0
1466+
Not Used. NumPy compatibility.
1467+
*args, **kwargs
1468+
Not Used. NumPy compatibility.
1469+
1470+
Returns
1471+
-------
1472+
scalar
1473+
"""
14601474
nv.validate_max(args, kwargs)
1475+
return self._min_max("max")
14611476

1462-
# This condition returns a nan if there are no valid values in the array.
1463-
if self.size > 0 and self._valid_sp_values.size == 0:
1464-
return self.fill_value
1465-
else:
1466-
return np.nanmax(self, axis)
1477+
def min(self, axis: int = 0, *args, **kwargs) -> Scalar:
1478+
"""
1479+
Min of non-NA/null values
1480+
1481+
Parameters
1482+
----------
1483+
axis : int, default 0
1484+
Not Used. NumPy compatibility.
1485+
*args, **kwargs
1486+
Not Used. NumPy compatibility.
14671487
1468-
def min(self, axis=0, *args, **kwargs):
1488+
Returns
1489+
-------
1490+
scalar
1491+
"""
14691492
nv.validate_min(args, kwargs)
1493+
return self._min_max("min")
1494+
1495+
def _min_max(self, kind: Literal["min", "max"]) -> Scalar:
1496+
"""
1497+
Min/max of non-NA/null values
14701498
1471-
# This condition returns a nan if there are no valid values in the array.
1472-
if self.size > 0 and self._valid_sp_values.size == 0:
1499+
Parameters
1500+
----------
1501+
kind : {"min", "max"}
1502+
1503+
Returns
1504+
-------
1505+
scalar
1506+
"""
1507+
valid_vals = self._valid_sp_values
1508+
has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
1509+
if len(valid_vals) > 0:
1510+
sp_min_max = getattr(valid_vals, kind)()
1511+
1512+
# If a non-null fill value is currently present, it might be the min/max
1513+
if has_nonnull_fill_vals:
1514+
func = max if kind == "max" else min
1515+
return func(sp_min_max, self.fill_value)
1516+
else:
1517+
return sp_min_max
1518+
elif has_nonnull_fill_vals:
14731519
return self.fill_value
14741520
else:
1475-
return np.nanmin(self, axis)
1521+
return na_value_for_dtype(self.dtype.subtype)
14761522

14771523
# ------------------------------------------------------------------------
14781524
# Ufuncs

pandas/tests/arrays/sparse/test_array.py

+42-14
Original file line number | Diff line number | Diff line change
@@ -1362,26 +1362,54 @@ def test_drop_duplicates_fill_value():
13621362

13631363

13641364
class TestMinMax:
1365-
plain_data = np.arange(5).astype(float)
1366-
data_neg = plain_data * (-1)
1367-
data_NaN = SparseArray(np.array([0, 1, 2, np.nan, 4]))
1368-
data_all_NaN = SparseArray(np.array([np.nan, np.nan, np.nan, np.nan, np.nan]))
1369-
data_NA_filled = SparseArray(
1370-
np.array([np.nan, np.nan, np.nan, np.nan, np.nan]), fill_value=5
1371-
)
1372-
13731365
@pytest.mark.parametrize(
13741366
"raw_data,max_expected,min_expected",
13751367
[
1376-
(plain_data, [4], [0]),
1377-
(data_neg, [0], [-4]),
1378-
(data_NaN, [4], [0]),
1379-
(data_all_NaN, [np.nan], [np.nan]),
1380-
(data_NA_filled, [5], [5]),
1368+
(np.arange(5.0), [4], [0]),
1369+
(-np.arange(5.0), [0], [-4]),
1370+
(np.array([0, 1, 2, np.nan, 4]), [4], [0]),
1371+
(np.array([np.nan] * 5), [np.nan], [np.nan]),
1372+
(np.array([]), [np.nan], [np.nan]),
13811373
],
13821374
)
1383-
def test_maxmin(self, raw_data, max_expected, min_expected):
1375+
def test_nan_fill_value(self, raw_data, max_expected, min_expected):
13841376
max_result = SparseArray(raw_data).max()
13851377
min_result = SparseArray(raw_data).min()
13861378
assert max_result in max_expected
13871379
assert min_result in min_expected
1380+
1381+
@pytest.mark.parametrize(
1382+
"fill_value,max_expected,min_expected",
1383+
[
1384+
(100, 100, 0),
1385+
(-100, 1, -100),
1386+
],
1387+
)
1388+
def test_fill_value(self, fill_value, max_expected, min_expected):
1389+
arr = SparseArray(
1390+
np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
1391+
)
1392+
max_result = arr.max()
1393+
assert max_result == max_expected
1394+
1395+
min_result = arr.min()
1396+
assert min_result == min_expected
1397+
1398+
@pytest.mark.parametrize("func", ["min", "max"])
1399+
@pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
1400+
@pytest.mark.parametrize(
1401+
"dtype,expected",
1402+
[
1403+
(SparseDtype(np.float64, np.nan), np.nan),
1404+
(SparseDtype(np.float64, 5.0), np.nan),
1405+
(SparseDtype("datetime64[ns]", pd.NaT), pd.NaT),
1406+
(SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT),
1407+
],
1408+
)
1409+
def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
1410+
arr = SparseArray(data, dtype=dtype)
1411+
result = getattr(arr, func)()
1412+
if expected == pd.NaT:
1413+
assert result == pd.NaT
1414+
else:
1415+
assert np.isnan(result)

0 commit comments

Comments
 (0)