Skip to content

Commit f299229

Browse files
committed
PERF: Faster SparseArray.__getitem__ for boolean masks (pandas-dev#23122)
1 parent 7131268 commit f299229

File tree

6 files changed

+154
-39
lines changed

6 files changed

+154
-39
lines changed

asv_bench/benchmarks/sparse.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -196,16 +196,38 @@ def time_take(self, indices, allow_fill):
196196

197197

198198
class GetItem:
199-
def setup(self):
199+
def setup(self, fill_value):
200200
N = 1_000_000
201-
arr = make_array(N, 1e-5, np.nan, np.float64)
201+
d = 1e-5
202+
arr = make_array(N, d, np.nan, np.float64)
202203
self.sp_arr = SparseArray(arr)
203204

204-
def time_integer_indexing(self):
205+
def time_integer_indexing(self, fill_value):
205206
self.sp_arr[78]
206207

207-
def time_slice(self):
208+
def time_slice(self, fill_value):
208209
self.sp_arr[1:]
209210

210211

212+
class GetItemMask:
213+
214+
params = [True, False, np.nan]
215+
param_names = ["fill_value"]
216+
217+
def setup(self, fill_value):
218+
N = 1_000_000
219+
d = 1e-5
220+
arr = make_array(N, d, np.nan, np.float64)
221+
self.sp_arr = SparseArray(arr)
222+
b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8)
223+
fv_inds = np.unique(
224+
np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32)
225+
)
226+
b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value
227+
self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value)
228+
229+
def time_mask(self, fill_value):
230+
self.sp_arr[self.sp_b_arr]
231+
232+
211233
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,7 @@ Performance improvements
602602
- Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`)
603603
- Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
604604
- Performance improvement in :func:`concat` (:issue:`43354`)
605+
- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`)
605606
- Performance improvement in constructing a :class:`DataFrame` from array-like objects like a ``Pytorch`` tensor (:issue:`44616`)
606607
-
607608

@@ -847,6 +848,7 @@ Sparse
847848
- Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
848849
- Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`)
849850
- Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`)
851+
- Bug in :class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` not recalculating indexes (:pull:`44955`)
850852
-
851853

852854
ExtensionArray

pandas/core/arrays/sparse/array.py

+25-9
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,11 @@ def isna(self):
719719
# If null fill value, we want SparseDtype[bool, true]
720720
# to preserve the same memory usage.
721721
dtype = SparseDtype(bool, self._null_fill_value)
722-
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
722+
if self._null_fill_value:
723+
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
724+
mask = np.full(len(self), False, dtype=np.bool8)
725+
mask[self.sp_index.indices] = isna(self.sp_values)
726+
return type(self)(mask, fill_value=False, dtype=dtype)
723727

724728
def fillna(
725729
self: SparseArrayT,
@@ -963,13 +967,20 @@ def __getitem__(
963967
)
964968

965969
else:
966-
# TODO: I think we can avoid densifying when masking a
967-
# boolean SparseArray with another. Need to look at the
968-
# key's fill_value for True / False, and then do an intersection
969-
# on the indices of the sp_values.
970970
if isinstance(key, SparseArray):
971+
# NOTE: If we guarantee that SparseDType(bool)
972+
# has only fill_value - true, false or nan
973+
# (see GH PR 44955)
974+
# we can apply mask very fast:
971975
if is_bool_dtype(key):
972-
key = key.to_dense()
976+
if isna(key.fill_value):
977+
return self.take(key.sp_index.indices[key.sp_values])
978+
if not key.fill_value:
979+
return self.take(key.sp_index.indices)
980+
n = len(self)
981+
mask = np.full(n, True, dtype=np.bool8)
982+
mask[key.sp_index.indices] = False
983+
return self.take(np.arange(n)[mask])
973984
else:
974985
key = np.asarray(key)
975986

@@ -1684,9 +1695,14 @@ def _cmp_method(self, other, op) -> SparseArray:
16841695

16851696
def _unary_method(self, op) -> SparseArray:
16861697
fill_value = op(np.array(self.fill_value)).item()
1687-
values = op(self.sp_values)
1688-
dtype = SparseDtype(values.dtype, fill_value)
1689-
return type(self)._simple_new(values, self.sp_index, dtype)
1698+
dtype = SparseDtype(self.dtype.subtype, fill_value)
1699+
# NOTE: if fill_value doesn't change
1700+
# we just have to apply op to sp_values
1701+
if isna(self.fill_value) or fill_value == self.fill_value:
1702+
values = op(self.sp_values)
1703+
return type(self)._simple_new(values, self.sp_index, self.dtype)
1704+
# Otherwise we have to recalculate the indexes
1705+
return type(self)(op(self.to_dense()), dtype=dtype)
16901706

16911707
def __pos__(self) -> SparseArray:
16921708
return self._unary_method(operator.pos)

pandas/core/arrays/sparse/dtype.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
9595
if fill_value is None:
9696
fill_value = na_value_for_dtype(dtype)
9797

98-
if not is_scalar(fill_value):
99-
raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
10098
self._dtype = dtype
10199
self._fill_value = fill_value
100+
self._check_fill_value()
102101

103102
def __hash__(self):
104103
# Python3 doesn't inherit __hash__ when a base class overrides
@@ -149,6 +148,24 @@ def fill_value(self):
149148
"""
150149
return self._fill_value
151150

151+
def _check_fill_value(self):
152+
if not is_scalar(self._fill_value):
153+
raise ValueError(
154+
f"fill_value must be a scalar. Got {self._fill_value} instead"
155+
)
156+
# TODO: Right now we can use Sparse boolean array
157+
# with any fill_value. Here was an attempt
158+
# to allow only 3 value: True, False or nan
159+
# but plenty test has failed.
160+
# see pull 44955
161+
# if self._is_boolean and not (
162+
# is_bool(self._fill_value) or isna(self._fill_value)
163+
# ):
164+
# raise ValueError(
165+
# "fill_value must be True, False or nan "
166+
# f"for boolean type. Got {self._fill_value} instead"
167+
# )
168+
152169
@property
153170
def _is_na_fill_value(self) -> bool:
154171
return isna(self.fill_value)

pandas/tests/arrays/sparse/test_array.py

+74-13
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,24 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
248248
assert arr.dtype == dtype
249249
assert exp.dtype == dtype
250250

251+
# GH 23122
252+
def test_getitem_bool_sparse_array(self):
253+
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
254+
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
255+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
256+
257+
spar_bool = ~spar_bool
258+
res = self.arr[spar_bool]
259+
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
260+
tm.assert_sp_array_equal(res, exp)
261+
262+
spar_bool = SparseArray(
263+
[False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
264+
)
265+
res = self.arr[spar_bool]
266+
exp = SparseArray([np.nan, 3, 5])
267+
tm.assert_sp_array_equal(res, exp)
268+
251269
def test_get_item(self):
252270

253271
assert np.isnan(self.arr[1])
@@ -505,7 +523,9 @@ def test_astype(self):
505523
def test_astype_bool(self):
506524
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
507525
result = a.astype(bool)
508-
expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0))
526+
expected = SparseArray(
527+
[True, False, False, True], dtype=SparseDtype(bool, False)
528+
)
509529
tm.assert_sp_array_equal(result, expected)
510530

511531
# update fill value
@@ -605,10 +625,11 @@ def test_set_fill_value(self):
605625
assert arr.fill_value
606626

607627
# coerces to bool
608-
# msg = "unable to set fill_value 0 to bool dtype"
628+
# XXX: we can construct an sparse array of bool
629+
# type and use as fill_value any value
630+
# msg = "fill_value must be True, False or nan"
609631
# with pytest.raises(ValueError, match=msg):
610-
arr.fill_value = 0
611-
assert arr.fill_value == 0
632+
# arr.fill_value = 0
612633

613634
# msg = "unable to set fill_value nan to bool dtype"
614635
# with pytest.raises(ValueError, match=msg):
@@ -737,6 +758,41 @@ def test_boolean_slice_empty(self):
737758
res = arr[[False, False, False]]
738759
assert res.dtype == arr.dtype
739760

761+
def test_neg_operator(self):
762+
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
763+
res = -arr
764+
exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
765+
tm.assert_sp_array_equal(exp, res)
766+
767+
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
768+
res = -arr
769+
exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
770+
tm.assert_sp_array_equal(exp, res)
771+
772+
def test_abs_operator(self):
773+
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
774+
res = abs(arr)
775+
exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
776+
tm.assert_sp_array_equal(exp, res)
777+
778+
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
779+
res = abs(arr)
780+
exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
781+
tm.assert_sp_array_equal(exp, res)
782+
783+
def test_invert_operator(self):
784+
arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
785+
res = ~arr
786+
exp = SparseArray(
787+
np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
788+
)
789+
res = ~arr
790+
tm.assert_sp_array_equal(exp, res)
791+
792+
arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
793+
res = ~arr
794+
exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
795+
740796
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
741797
def test_binary_operators(self, op):
742798
op = getattr(operator, op)
@@ -1005,13 +1061,9 @@ def test_sum(self):
10051061

10061062
@pytest.mark.parametrize(
10071063
"arr",
1008-
[
1009-
np.array([0, 1, np.nan, 1]),
1010-
np.array([0, 1, 1]),
1011-
np.array([True, True, False]),
1012-
],
1064+
[np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
10131065
)
1014-
@pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False])
1066+
@pytest.mark.parametrize("fill_value", [0, 1, np.nan])
10151067
@pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
10161068
def test_sum_min_count(self, arr, fill_value, min_count, expected):
10171069
# https://github.com/pandas-dev/pandas/issues/25777
@@ -1022,6 +1074,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected):
10221074
else:
10231075
assert result == expected
10241076

1077+
def test_bool_sum_min_count(self):
1078+
spar_bool = pd.arrays.SparseArray(
1079+
[False, True] * 5, dtype=np.bool8, fill_value=True
1080+
)
1081+
res = spar_bool.sum(min_count=1)
1082+
assert res == 5
1083+
res = spar_bool.sum(min_count=11)
1084+
assert isna(res)
1085+
10251086
def test_numpy_sum(self):
10261087
data = np.arange(10).astype(float)
10271088
out = np.sum(SparseArray(data))
@@ -1121,9 +1182,9 @@ def test_ufunc(self):
11211182
tm.assert_sp_array_equal(np.abs(sparse), result)
11221183

11231184
sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
1124-
result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
1125-
tm.assert_sp_array_equal(abs(sparse), result)
1126-
tm.assert_sp_array_equal(np.abs(sparse), result)
1185+
exp = SparseArray([1, 1, 2, 2], fill_value=1)
1186+
tm.assert_sp_array_equal(abs(sparse), exp)
1187+
tm.assert_sp_array_equal(np.abs(sparse), exp)
11271188

11281189
sparse = SparseArray([1, np.nan, 2, np.nan, -2])
11291190
result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))

pandas/tests/extension/test_sparse.py

+8-11
Original file line numberDiff line numberDiff line change
@@ -193,20 +193,17 @@ def test_reindex(self, data, na_value):
193193

194194
class TestMissing(BaseSparseTests, base.BaseMissingTests):
195195
def test_isna(self, data_missing):
196+
sarr = SparseArray(data_missing)
196197
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
197198
expected = SparseArray([True, False], dtype=expected_dtype)
199+
result = sarr.isna()
200+
tm.assert_sp_array_equal(result, expected)
198201

199-
result = pd.isna(data_missing)
200-
self.assert_equal(result, expected)
201-
202-
result = pd.Series(data_missing).isna()
203-
expected = pd.Series(expected)
204-
self.assert_series_equal(result, expected)
205-
206-
# GH 21189
207-
result = pd.Series(data_missing).drop([0, 1]).isna()
208-
expected = pd.Series([], dtype=expected_dtype)
209-
self.assert_series_equal(result, expected)
202+
# test isna for arr without na
203+
sarr = sarr.fillna(0)
204+
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
205+
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
206+
self.assert_equal(sarr.isna(), expected)
210207

211208
def test_fillna_limit_pad(self, data_missing):
212209
with tm.assert_produces_warning(PerformanceWarning):

0 commit comments

Comments
 (0)