Skip to content

PERF: Faster SparseArray.__getitem__ for boolean masks (#23122) #44955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,8 @@ def time_take(self, indices, allow_fill):
class GetItem:
def setup(self):
N = 1_000_000
arr = make_array(N, 1e-5, np.nan, np.float64)
d = 1e-5
arr = make_array(N, d, np.nan, np.float64)
self.sp_arr = SparseArray(arr)

def time_integer_indexing(self):
Expand All @@ -208,4 +209,25 @@ def time_slice(self):
self.sp_arr[1:]


class GetItemMask:

params = [True, False, np.nan]
param_names = ["fill_value"]

def setup(self, fill_value):
N = 1_000_000
d = 1e-5
arr = make_array(N, d, np.nan, np.float64)
self.sp_arr = SparseArray(arr)
b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8)
fv_inds = np.unique(
np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32)
)
b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value
self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value)

def time_mask(self, fill_value):
self.sp_arr[self.sp_b_arr]


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ Performance improvements
- Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`)
- Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
- Performance improvement in :func:`concat` (:issue:`43354`)
- Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`)
- Performance improvement in constructing a :class:`DataFrame` from array-like objects like a ``Pytorch`` tensor (:issue:`44616`)
-

Expand Down Expand Up @@ -847,6 +848,7 @@ Sparse
- Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
- Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`)
- Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`)
- Bug in :class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` doesn't recalculate indexes (:issue:`44955`)
-

ExtensionArray
Expand Down
34 changes: 25 additions & 9 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,11 @@ def isna(self):
# If null fill value, we want SparseDtype[bool, true]
# to preserve the same memory usage.
dtype = SparseDtype(bool, self._null_fill_value)
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
if self._null_fill_value:
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
mask = np.full(len(self), False, dtype=np.bool8)
mask[self.sp_index.indices] = isna(self.sp_values)
return type(self)(mask, fill_value=False, dtype=dtype)

def fillna(
self: SparseArrayT,
Expand Down Expand Up @@ -963,13 +967,20 @@ def __getitem__(
)

else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
# key's fill_value for True / False, and then do an intersection
# on the indices of the sp_values.
if isinstance(key, SparseArray):
# NOTE: If we guarantee that SparseDType(bool)
# has only fill_value - true, false or nan
# (see GH PR 44955)
# we can apply mask very fast:
if is_bool_dtype(key):
key = key.to_dense()
if isna(key.fill_value):
return self.take(key.sp_index.indices[key.sp_values])
if not key.fill_value:
return self.take(key.sp_index.indices)
n = len(self)
mask = np.full(n, True, dtype=np.bool8)
mask[key.sp_index.indices] = False
return self.take(np.arange(n)[mask])
else:
key = np.asarray(key)

Expand Down Expand Up @@ -1684,9 +1695,14 @@ def _cmp_method(self, other, op) -> SparseArray:

def _unary_method(self, op) -> SparseArray:
fill_value = op(np.array(self.fill_value)).item()
values = op(self.sp_values)
dtype = SparseDtype(values.dtype, fill_value)
return type(self)._simple_new(values, self.sp_index, dtype)
dtype = SparseDtype(self.dtype.subtype, fill_value)
# NOTE: if fill_value doesn't change
# we just have to apply op to sp_values
if isna(self.fill_value) or fill_value == self.fill_value:
values = op(self.sp_values)
return type(self)._simple_new(values, self.sp_index, self.dtype)
# In the other case we have to recalc indexes
return type(self)(op(self.to_dense()), dtype=dtype)

def __pos__(self) -> SparseArray:
return self._unary_method(operator.pos)
Expand Down
21 changes: 19 additions & 2 deletions pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
if fill_value is None:
fill_value = na_value_for_dtype(dtype)

if not is_scalar(fill_value):
raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
self._dtype = dtype
self._fill_value = fill_value
self._check_fill_value()

def __hash__(self):
# Python3 doesn't inherit __hash__ when a base class overrides
Expand Down Expand Up @@ -149,6 +148,24 @@ def fill_value(self):
"""
return self._fill_value

def _check_fill_value(self):
if not is_scalar(self._fill_value):
raise ValueError(
f"fill_value must be a scalar. Got {self._fill_value} instead"
)
# TODO: Right now we can use Sparse boolean array
# with any fill_value. Here was an attempt
# to allow only 3 value: True, False or nan
# but plenty test has failed.
# see pull 44955
# if self._is_boolean and not (
# is_bool(self._fill_value) or isna(self._fill_value)
# ):
# raise ValueError(
# "fill_value must be True, False or nan "
# f"for boolean type. Got {self._fill_value} instead"
# )

@property
def _is_na_fill_value(self) -> bool:
return isna(self.fill_value)
Expand Down
87 changes: 74 additions & 13 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,24 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
assert arr.dtype == dtype
assert exp.dtype == dtype

# GH 23122
def test_getitem_bool_sparse_array(self):
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
tm.assert_sp_array_equal(self.arr[spar_bool], exp)

spar_bool = ~spar_bool
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
tm.assert_sp_array_equal(res, exp)

spar_bool = SparseArray(
[False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
)
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 3, 5])
tm.assert_sp_array_equal(res, exp)

def test_get_item(self):

assert np.isnan(self.arr[1])
Expand Down Expand Up @@ -505,7 +523,9 @@ def test_astype(self):
def test_astype_bool(self):
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
result = a.astype(bool)
expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0))
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
tm.assert_sp_array_equal(result, expected)

# update fill value
Expand Down Expand Up @@ -605,10 +625,11 @@ def test_set_fill_value(self):
assert arr.fill_value

# coerces to bool
# msg = "unable to set fill_value 0 to bool dtype"
# XXX: we can construct an sparse array of bool
# type and use as fill_value any value
# msg = "fill_value must be True, False or nan"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = 0
assert arr.fill_value == 0
# arr.fill_value = 0

# msg = "unable to set fill_value nan to bool dtype"
# with pytest.raises(ValueError, match=msg):
Expand Down Expand Up @@ -737,6 +758,41 @@ def test_boolean_slice_empty(self):
res = arr[[False, False, False]]
assert res.dtype == arr.dtype

def test_neg_operator(self):
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
res = -arr
exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)

arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
res = -arr
exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)

def test_abs_operator(self):
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
res = abs(arr)
exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)

arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
res = abs(arr)
exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)

def test_invert_operator(self):
arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
res = ~arr
exp = SparseArray(
np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
)
res = ~arr
tm.assert_sp_array_equal(exp, res)

arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
res = ~arr
exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)

@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
def test_binary_operators(self, op):
op = getattr(operator, op)
Expand Down Expand Up @@ -1005,13 +1061,9 @@ def test_sum(self):

@pytest.mark.parametrize(
"arr",
[
np.array([0, 1, np.nan, 1]),
np.array([0, 1, 1]),
np.array([True, True, False]),
],
[np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
)
@pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False])
@pytest.mark.parametrize("fill_value", [0, 1, np.nan])
@pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
def test_sum_min_count(self, arr, fill_value, min_count, expected):
# https://github.com/pandas-dev/pandas/issues/25777
Expand All @@ -1022,6 +1074,15 @@ def test_sum_min_count(self, arr, fill_value, min_count, expected):
else:
assert result == expected

def test_bool_sum_min_count(self):
spar_bool = pd.arrays.SparseArray(
[False, True] * 5, dtype=np.bool8, fill_value=True
)
res = spar_bool.sum(min_count=1)
assert res == 5
res = spar_bool.sum(min_count=11)
assert isna(res)

def test_numpy_sum(self):
data = np.arange(10).astype(float)
out = np.sum(SparseArray(data))
Expand Down Expand Up @@ -1121,9 +1182,9 @@ def test_ufunc(self):
tm.assert_sp_array_equal(np.abs(sparse), result)

sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
tm.assert_sp_array_equal(abs(sparse), result)
tm.assert_sp_array_equal(np.abs(sparse), result)
exp = SparseArray([1, 1, 2, 2], fill_value=1)
tm.assert_sp_array_equal(abs(sparse), exp)
tm.assert_sp_array_equal(np.abs(sparse), exp)

sparse = SparseArray([1, np.nan, 2, np.nan, -2])
result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
Expand Down
19 changes: 8 additions & 11 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,20 +193,17 @@ def test_reindex(self, data, na_value):

class TestMissing(BaseSparseTests, base.BaseMissingTests):
def test_isna(self, data_missing):
sarr = SparseArray(data_missing)
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
expected = SparseArray([True, False], dtype=expected_dtype)
result = sarr.isna()
tm.assert_sp_array_equal(result, expected)

result = pd.isna(data_missing)
self.assert_equal(result, expected)

result = pd.Series(data_missing).isna()
expected = pd.Series(expected)
self.assert_series_equal(result, expected)

# GH 21189
result = pd.Series(data_missing).drop([0, 1]).isna()
expected = pd.Series([], dtype=expected_dtype)
self.assert_series_equal(result, expected)
# test isna for arr without na
sarr = sarr.fillna(0)
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
self.assert_equal(sarr.isna(), expected)

def test_fillna_limit_pad(self, data_missing):
with tm.assert_produces_warning(PerformanceWarning):
Expand Down