Skip to content

Commit a8d517a

Browse files
committed
PERF: Faster SparseArray.__getitem__ for boolean masks (pandas-dev#23122)
1 parent 097322f commit a8d517a

File tree

2 files changed

+21
-6
lines changed

2 files changed

+21
-6
lines changed

pandas/core/arrays/sparse/array.py

+11-6
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252
)
5353
from pandas.core.dtypes.common import (
5454
is_array_like,
55+
is_bool,
5556
is_bool_dtype,
5657
is_datetime64_any_dtype,
5758
is_datetime64tz_dtype,
@@ -181,7 +182,6 @@ def _sparse_array_op(
181182
ltype = SparseDtype(subtype, left.fill_value)
182183
rtype = SparseDtype(subtype, right.fill_value)
183184

184-
# TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
185185
left = left.astype(ltype)
186186
right = right.astype(rtype)
187187
dtype = ltype.subtype
@@ -945,13 +945,18 @@ def __getitem__(
945945
)
946946

947947
else:
948-
# TODO: I think we can avoid densifying when masking a
949-
# boolean SparseArray with another. Need to look at the
950-
# key's fill_value for True / False, and then do an intersection
951-
# on the indices of the sp_values.
952948
if isinstance(key, SparseArray):
953949
if is_bool_dtype(key):
954-
key = key.to_dense()
950+
if is_bool(key.fill_value):
951+
msk = np.full(
952+
shape=len(self),
953+
fill_value=key.fill_value,
954+
dtype=np.bool8,
955+
)
956+
msk[key.sp_index.indices] = not key.fill_value
957+
return self.take(np.arange(len(self), dtype=np.int32)[msk])
958+
else:
959+
key = key.to_dense()
955960
else:
956961
key = np.asarray(key)
957962

pandas/tests/arrays/sparse/test_array.py

+10
Original file line number | Diff line number | Diff line change
@@ -248,6 +248,16 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
248248
assert arr.dtype == dtype
249249
assert exp.dtype == dtype
250250

251+
# GH 23122
252+
def test_get_item_bool_sparse_array(self):
253+
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
254+
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
255+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
256+
257+
spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool8, fill_value=False)
258+
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
259+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
260+
251261
def test_get_item(self):
252262

253263
assert np.isnan(self.arr[1])

0 commit comments

Comments
 (0)