Skip to content

Commit ae4771d

Browse files
committed
PERF: Faster SparseArray.__get_item__ for boolean masks (pandas-dev#23122)
1 parent 2cc9ab3 commit ae4771d

File tree

3 files changed

+32
-6
lines changed

3 files changed

+32
-6
lines changed

pandas/core/arrays/sparse/array.py

+16-6
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252
)
5353
from pandas.core.dtypes.common import (
5454
is_array_like,
55+
is_bool,
5556
is_bool_dtype,
5657
is_datetime64_any_dtype,
5758
is_datetime64tz_dtype,
@@ -719,7 +720,11 @@ def isna(self):
719720
# If null fill value, we want SparseDtype[bool, true]
720721
# to preserve the same memory usage.
721722
dtype = SparseDtype(bool, self._null_fill_value)
722-
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
723+
if self._null_fill_value:
724+
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
725+
mask = np.full(len(self), False, dtype=np.bool8)
726+
mask[self.sp_index.indices] = isna(self.sp_values)
727+
return type(self)(mask, fill_value=False, dtype=dtype)
723728

724729
def fillna(
725730
self: SparseArrayT,
@@ -963,13 +968,18 @@ def __getitem__(
963968
)
964969

965970
else:
966-
# TODO: I think we can avoid densifying when masking a
967-
# boolean SparseArray with another. Need to look at the
968-
# key's fill_value for True / False, and then do an intersection
969-
# on the indices of the sp_values.
970971
if isinstance(key, SparseArray):
971972
if is_bool_dtype(key):
972-
key = key.to_dense()
973+
if is_bool(key.fill_value):
974+
msk = np.full(
975+
shape=len(self),
976+
fill_value=key.fill_value,
977+
dtype=np.bool8,
978+
)
979+
msk[key.sp_index.indices] = not key.fill_value
980+
return self.take(np.arange(len(self), dtype=np.int32)[msk])
981+
else:
982+
key = key.to_dense()
973983
else:
974984
key = np.asarray(key)
975985

pandas/tests/arrays/sparse/test_array.py

+10
Original file line number | Diff line number | Diff line change
@@ -248,6 +248,16 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
248248
assert arr.dtype == dtype
249249
assert exp.dtype == dtype
250250

251+
# GH 23122
252+
def test_get_item_bool_sparse_array(self):
253+
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
254+
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
255+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
256+
257+
spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool8, fill_value=False)
258+
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
259+
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
260+
251261
def test_get_item(self):
252262

253263
assert np.isnan(self.arr[1])

pandas/tests/extension/test_sparse.py

+6
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,12 @@ def test_isna(self, data_missing):
208208
expected = pd.Series([], dtype=expected_dtype)
209209
self.assert_series_equal(result, expected)
210210

211+
# test isna for arr without na
212+
data_missing = data_missing.fillna(0)
213+
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
214+
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
215+
self.assert_equal(pd.isna(data_missing), expected)
216+
211217
def test_fillna_limit_pad(self, data_missing):
212218
with tm.assert_produces_warning(PerformanceWarning):
213219
super().test_fillna_limit_pad(data_missing)

0 commit comments

Comments
 (0)