Skip to content

Commit 8b18286

Browse files
committed
PERF: Faster SparseArray.__getitem__ for boolean masks (#23122)
1 parent 097322f commit 8b18286

File tree

3 files changed

+32
-7
lines changed

3 files changed

+32
-7
lines changed

pandas/core/arrays/sparse/array.py

+16-7
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252
)
5353
from pandas.core.dtypes.common import (
5454
is_array_like,
55+
is_bool,
5556
is_bool_dtype,
5657
is_datetime64_any_dtype,
5758
is_datetime64tz_dtype,
@@ -181,7 +182,6 @@ def _sparse_array_op(
181182
ltype = SparseDtype(subtype, left.fill_value)
182183
rtype = SparseDtype(subtype, right.fill_value)
183184

184-
# TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
185185
left = left.astype(ltype)
186186
right = right.astype(rtype)
187187
dtype = ltype.subtype
@@ -701,7 +701,11 @@ def isna(self):
701701
# If null fill value, we want SparseDtype[bool, true]
702702
# to preserve the same memory usage.
703703
dtype = SparseDtype(bool, self._null_fill_value)
704-
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
704+
if self._null_fill_value:
705+
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
706+
mask = np.full(len(self), False, dtype=np.bool8)
707+
mask[self.sp_index.indices] = isna(self.sp_values)
708+
return type(self)(mask, fill_value=False, dtype=dtype)
705709

706710
def fillna(
707711
self: SparseArrayT,
@@ -945,13 +949,18 @@ def __getitem__(
945949
)
946950

947951
else:
948-
# TODO: I think we can avoid densifying when masking a
949-
# boolean SparseArray with another. Need to look at the
950-
# key's fill_value for True / False, and then do an intersection
951-
# on the indices of the sp_values.
952952
if isinstance(key, SparseArray):
953953
if is_bool_dtype(key):
954-
key = key.to_dense()
954+
if is_bool(key.fill_value):
955+
msk = np.full(
956+
shape=len(self),
957+
fill_value=key.fill_value,
958+
dtype=np.bool8,
959+
)
960+
msk[key.sp_index.indices] = not key.fill_value
961+
return self.take(np.arange(len(self), dtype=np.int32)[msk])
962+
else:
963+
key = key.to_dense()
955964
else:
956965
key = np.asarray(key)
957966

pandas/tests/arrays/sparse/test_array.py

+10
Original file line number | Diff line number | Diff line change
@@ -248,6 +248,16 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
248248
assert arr.dtype == dtype
249249
assert exp.dtype == dtype
250250

251+
# GH 23122
252+
def test_get_item_bool_sparse_array(self):
253+
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
254+
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
255+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
256+
257+
spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool8, fill_value=False)
258+
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
259+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
260+
251261
def test_get_item(self):
252262

253263
assert np.isnan(self.arr[1])

pandas/tests/extension/test_sparse.py

+6
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,12 @@ def test_isna(self, data_missing):
208208
expected = pd.Series([], dtype=expected_dtype)
209209
self.assert_series_equal(result, expected)
210210

211+
# Test isna for an array that contains no NA values
212+
data_missing = data_missing.fillna(0)
213+
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
214+
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
215+
self.assert_equal(pd.isna(data_missing), expected)
216+
211217
def test_fillna_limit_pad(self, data_missing):
212218
with tm.assert_produces_warning(PerformanceWarning):
213219
super().test_fillna_limit_pad(data_missing)

0 commit comments

Comments
 (0)