Skip to content

Commit 136a880

Browse files
committed
PERF: Faster SparseArray.__getitem__ for boolean masks (pandas-dev#23122)
BUG: unary operators for SparseArray don't recalculate indexes (pandas-dev#44956)
1 parent 7973c8b commit 136a880

File tree

6 files changed

+135
-51
lines changed

6 files changed

+135
-51
lines changed

asv_bench/benchmarks/sparse.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -196,16 +196,38 @@ def time_take(self, indices, allow_fill):
196196

197197

198198
class GetItem:
199-
def setup(self):
199+
def setup(self, fill_value):
200200
N = 1_000_000
201-
arr = make_array(N, 1e-5, np.nan, np.float64)
201+
d = 1e-5
202+
arr = make_array(N, d, np.nan, np.float64)
202203
self.sp_arr = SparseArray(arr)
203204

204-
def time_integer_indexing(self):
205+
def time_integer_indexing(self, fill_value):
205206
self.sp_arr[78]
206207

207-
def time_slice(self):
208+
def time_slice(self, fill_value):
208209
self.sp_arr[1:]
209210

210211

212+
class GetItemMask:
213+
214+
params = [True, False]
215+
param_names = ["fill_value"]
216+
217+
def setup(self, fill_value):
218+
N = 1_000_000
219+
d = 1e-5
220+
arr = make_array(N, d, np.nan, np.float64)
221+
self.sp_arr = SparseArray(arr)
222+
b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8)
223+
fv_inds = np.unique(
224+
np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32)
225+
)
226+
b_arr[fv_inds] = not fill_value
227+
self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value)
228+
229+
def time_mask(self, fill_value):
230+
self.sp_arr[self.sp_b_arr]
231+
232+
211233
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.4.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,7 @@ Other Deprecations
536536
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
537537
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)
538538
- A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`)
539+
- Deprecated passing a ``fill_value`` that is neither a boolean nor NaN to :class:`SparseDtype` when the dtype is boolean (:pull:`44955`)
539540
-
540541

541542
.. ---------------------------------------------------------------------------
@@ -589,6 +590,7 @@ Performance improvements
589590
- Performance improvement in :func:`merge` (:issue:`43332`)
590591
- Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
591592
- Performance improvement in :func:`concat` (:issue:`43354`)
593+
- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`)
592594
-
593595

594596
.. ---------------------------------------------------------------------------
@@ -807,6 +809,7 @@ Sparse
807809
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
808810
- Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
809811
- Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`)
812+
- Bug in :class:`SparseArray` unary methods and :meth:`SparseArray.isna` not recalculating indexes (:pull:`44955`)
810813
-
811814

812815
ExtensionArray

pandas/core/arrays/sparse/array.py

+20-14
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
)
5353
from pandas.core.dtypes.common import (
5454
is_array_like,
55-
is_bool,
5655
is_bool_dtype,
5756
is_datetime64_any_dtype,
5857
is_datetime64tz_dtype,
@@ -950,17 +949,19 @@ def __getitem__(
950949

951950
else:
952951
if isinstance(key, SparseArray):
952+
# NOTE: If we guarantee that SparseDtype(bool)
953+
# only has a fill_value of True, False or nan
954+
# (see GH PR 44955)
955+
# we can apply the mask very quickly:
953956
if is_bool_dtype(key):
954-
if is_bool(key.fill_value):
955-
msk = np.full(
956-
shape=len(self),
957-
fill_value=key.fill_value,
958-
dtype=np.bool8,
959-
)
960-
msk[key.sp_index.indices] = not key.fill_value
961-
return self.take(np.arange(len(self), dtype=np.int32)[msk])
962-
else:
963-
key = key.to_dense()
957+
if isna(key.fill_value):
958+
return self.take(key.sp_index.indices[key.sp_values])
959+
if not key.fill_value:
960+
return self.take(key.sp_index.indices)
961+
n = len(self)
962+
mask = np.full(n, True, dtype=np.bool8)
963+
mask[key.sp_index.indices] = False
964+
return self.take(np.arange(n)[mask])
964965
else:
965966
key = np.asarray(key)
966967

@@ -1691,9 +1692,14 @@ def _cmp_method(self, other, op) -> SparseArray:
16911692

16921693
def _unary_method(self, op) -> SparseArray:
16931694
fill_value = op(np.array(self.fill_value)).item()
1694-
values = op(self.sp_values)
1695-
dtype = SparseDtype(values.dtype, fill_value)
1696-
return type(self)._simple_new(values, self.sp_index, dtype)
1695+
dtype = SparseDtype(self.dtype.subtype, fill_value)
1696+
# NOTE: if fill_value doesn't change
1697+
# we just have to apply op to sp_values
1698+
if isna(self.fill_value) or fill_value == self.fill_value:
1699+
values = op(self.sp_values)
1700+
return type(self)._simple_new(values, self.sp_index, self.dtype)
1701+
# In the other case we have to recalc indexes
1702+
return type(self)(op(self.to_dense()), dtype=dtype)
16971703

16981704
def __pos__(self) -> SparseArray:
16991705
return self._unary_method(operator.pos)

pandas/core/arrays/sparse/dtype.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525
from pandas.core.dtypes.cast import astype_nansafe
2626
from pandas.core.dtypes.common import (
27+
is_bool,
2728
is_bool_dtype,
2829
is_object_dtype,
2930
is_scalar,
@@ -95,10 +96,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
9596
if fill_value is None:
9697
fill_value = na_value_for_dtype(dtype)
9798

98-
if not is_scalar(fill_value):
99-
raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
10099
self._dtype = dtype
101100
self._fill_value = fill_value
101+
self._check_fill_value()
102102

103103
def __hash__(self):
104104
# Python3 doesn't inherit __hash__ when a base class overrides
@@ -149,6 +149,19 @@ def fill_value(self):
149149
"""
150150
return self._fill_value
151151

152+
def _check_fill_value(self):
153+
if not is_scalar(self._fill_value):
154+
raise ValueError(
155+
f"fill_value must be a scalar. Got {self._fill_value} instead"
156+
)
157+
if self._is_boolean and not (
158+
is_bool(self._fill_value) or isna(self._fill_value)
159+
):
160+
raise ValueError(
161+
"fill_value must be True, False or nan "
162+
f"for boolean type. Got {self._fill_value} instead"
163+
)
164+
152165
@property
153166
def _is_na_fill_value(self) -> bool:
154167
return isna(self.fill_value)

pandas/tests/arrays/sparse/test_array.py

+66-17
Original file line numberDiff line numberDiff line change
@@ -249,14 +249,22 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
249249
assert exp.dtype == dtype
250250

251251
# GH 23122
252-
def test_get_item_bool_sparse_array(self):
252+
def test_getitem_bool_sparse_array(self):
253253
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
254254
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
255255
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
256256

257-
spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool8, fill_value=False)
257+
spar_bool = ~spar_bool
258+
res = self.arr[spar_bool]
258259
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
259-
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
260+
tm.assert_sp_array_equal(res, exp)
261+
262+
spar_bool = SparseArray(
263+
[False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
264+
)
265+
res = self.arr[spar_bool]
266+
exp = SparseArray([np.nan, 3, 5])
267+
tm.assert_sp_array_equal(res, exp)
260268

261269
def test_get_item(self):
262270

@@ -515,7 +523,9 @@ def test_astype(self):
515523
def test_astype_bool(self):
516524
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
517525
result = a.astype(bool)
518-
expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0))
526+
expected = SparseArray(
527+
[True, False, False, True], dtype=SparseDtype(bool, False)
528+
)
519529
tm.assert_sp_array_equal(result, expected)
520530

521531
# update fill value
@@ -615,10 +625,9 @@ def test_set_fill_value(self):
615625
assert arr.fill_value
616626

617627
# a fill_value that is neither bool nor nan is rejected for bool dtype
618-
# msg = "unable to set fill_value 0 to bool dtype"
619-
# with pytest.raises(ValueError, match=msg):
620-
arr.fill_value = 0
621-
assert arr.fill_value == 0
628+
msg = "fill_value must be True, False or nan"
629+
with pytest.raises(ValueError, match=msg):
630+
arr.fill_value = 0
622631

623632
# msg = "unable to set fill_value nan to bool dtype"
624633
# with pytest.raises(ValueError, match=msg):
@@ -747,6 +756,41 @@ def test_boolean_slice_empty(self):
747756
res = arr[[False, False, False]]
748757
assert res.dtype == arr.dtype
749758

759+
def test_neg_operator(self):
760+
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
761+
res = -arr
762+
exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
763+
tm.assert_sp_array_equal(exp, res)
764+
765+
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
766+
res = -arr
767+
exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
768+
tm.assert_sp_array_equal(exp, res)
769+
770+
def test_abs_operator(self):
771+
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
772+
res = abs(arr)
773+
exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
774+
tm.assert_sp_array_equal(exp, res)
775+
776+
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
777+
res = abs(arr)
778+
exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
779+
tm.assert_sp_array_equal(exp, res)
780+
781+
def test_invert_operator(self):
782+
arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
783+
res = ~arr
784+
exp = SparseArray(
785+
np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
786+
)
787+
res = ~arr
788+
tm.assert_sp_array_equal(exp, res)
789+
790+
arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
791+
res = ~arr
792+
exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
793+
750794
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
751795
def test_binary_operators(self, op):
752796
op = getattr(operator, op)
@@ -1015,13 +1059,9 @@ def test_sum(self):
10151059

10161060
@pytest.mark.parametrize(
10171061
"arr",
1018-
[
1019-
np.array([0, 1, np.nan, 1]),
1020-
np.array([0, 1, 1]),
1021-
np.array([True, True, False]),
1022-
],
1062+
[np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
10231063
)
1024-
@pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False])
1064+
@pytest.mark.parametrize("fill_value", [0, 1, np.nan])
10251065
@pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
10261066
def test_sum_min_count(self, arr, fill_value, min_count, expected):
10271067
# https://github.com/pandas-dev/pandas/issues/25777
@@ -1032,6 +1072,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected):
10321072
else:
10331073
assert result == expected
10341074

1075+
def test_bool_sum_min_count(self):
1076+
spar_bool = pd.arrays.SparseArray(
1077+
[False, True] * 5, dtype=np.bool8, fill_value=True
1078+
)
1079+
res = spar_bool.sum(min_count=1)
1080+
assert res == 5
1081+
res = spar_bool.sum(min_count=11)
1082+
assert isna(res)
1083+
10351084
def test_numpy_sum(self):
10361085
data = np.arange(10).astype(float)
10371086
out = np.sum(SparseArray(data))
@@ -1131,9 +1180,9 @@ def test_ufunc(self):
11311180
tm.assert_sp_array_equal(np.abs(sparse), result)
11321181

11331182
sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
1134-
result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
1135-
tm.assert_sp_array_equal(abs(sparse), result)
1136-
tm.assert_sp_array_equal(np.abs(sparse), result)
1183+
exp = SparseArray([1, 1, 2, 2], fill_value=1)
1184+
tm.assert_sp_array_equal(abs(sparse), exp)
1185+
tm.assert_sp_array_equal(np.abs(sparse), exp)
11371186

11381187
sparse = SparseArray([1, np.nan, 2, np.nan, -2])
11391188
result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))

pandas/tests/extension/test_sparse.py

+5-14
Original file line numberDiff line numberDiff line change
@@ -193,26 +193,17 @@ def test_reindex(self, data, na_value):
193193

194194
class TestMissing(BaseSparseTests, base.BaseMissingTests):
195195
def test_isna(self, data_missing):
196+
sarr = SparseArray(data_missing)
196197
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
197198
expected = SparseArray([True, False], dtype=expected_dtype)
198-
199-
result = pd.isna(data_missing)
200-
self.assert_equal(result, expected)
201-
202-
result = pd.Series(data_missing).isna()
203-
expected = pd.Series(expected)
204-
self.assert_series_equal(result, expected)
205-
206-
# GH 21189
207-
result = pd.Series(data_missing).drop([0, 1]).isna()
208-
expected = pd.Series([], dtype=expected_dtype)
209-
self.assert_series_equal(result, expected)
199+
result = sarr.isna()
200+
tm.assert_sp_array_equal(result, expected)
210201

211202
# test isna for arr without na
212-
data_missing = data_missing.fillna(0)
203+
sarr = sarr.fillna(0)
213204
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
214205
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
215-
self.assert_equal(pd.isna(data_missing), expected)
206+
self.assert_equal(sarr.isna(), expected)
216207

217208
def test_fillna_limit_pad(self, data_missing):
218209
with tm.assert_produces_warning(PerformanceWarning):

0 commit comments

Comments
 (0)