Skip to content

PERF: materialize less on slice in sparse __getitem__ #43777

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill):
self.sp_arr.take(indices, allow_fill=allow_fill)


class GetItem:
def setup(self):
N = 1_000_000
arr = make_array(N, 1e-5, np.nan, np.float64)
self.sp_arr = SparseArray(arr)

def time_integer_indexing(self):
self.sp_arr[78]

def time_slice(self):
self.sp_arr[1:]


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,10 @@ Performance improvements
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
-

.. ---------------------------------------------------------------------------

Expand Down
44 changes: 35 additions & 9 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,13 +892,39 @@ def __getitem__(
elif isinstance(key, tuple):
data_slice = self.to_dense()[key]
elif isinstance(key, slice):
# special case to preserve dtypes
if key == slice(None):
return self.copy()
# TODO: this logic is surely elsewhere
# TODO: this could be more efficient
indices = np.arange(len(self), dtype=np.int32)[key]
return self.take(indices)

# Avoid densifying when handling contiguous slices
if key.step is None or key.step == 1:
start = 0 if key.start is None else key.start
if start < 0:
start += len(self)

end = len(self) if key.stop is None else key.stop
if end < 0:
end += len(self)

indices = self.sp_index.to_int_index().indices
keep_inds = np.flatnonzero((indices >= start) & (indices < end))
sp_vals = self.sp_values[keep_inds]

sp_index = indices[keep_inds].copy()

# If we've sliced to not include the start of the array, all our indices
# should be shifted. NB: here we are careful to also not shift by a
# negative value for a case like [0, 1][-100:] where the start index
# should be treated like 0
if start > 0:
sp_index -= start

# Length of our result should match applying this slice to a range
# of the length of our original array
new_len = len(range(len(self))[key])
new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
else:
indices = np.arange(len(self), dtype=np.int32)[key]
return self.take(indices)

else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
Expand Down Expand Up @@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntInde

def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
index: SparseIndex
if kind == "block" or isinstance(kind, BlockIndex):
if kind == "block":
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
elif kind == "integer" or isinstance(kind, IntIndex):
elif kind == "integer":
index = IntIndex(length, indices)
else: # pragma: no cover
raise ValueError("must be block or integer type")
Expand Down
48 changes: 31 additions & 17 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self):
expected = SparseArray([0, 2])
tm.assert_sp_array_equal(result, expected)

def test_getslice(self):
result = self.arr[:-3]
exp = SparseArray(self.arr.to_dense()[:-3])
tm.assert_sp_array_equal(result, exp)

result = self.arr[-4:]
exp = SparseArray(self.arr.to_dense()[-4:])
tm.assert_sp_array_equal(result, exp)

# two corner cases from Series
result = self.arr[-12:]
exp = SparseArray(self.arr)
tm.assert_sp_array_equal(result, exp)

result = self.arr[:-12]
exp = SparseArray(self.arr.to_dense()[:0])
tm.assert_sp_array_equal(result, exp)
@pytest.mark.parametrize(
"slc",
[
np.s_[:],
np.s_[1:10],
np.s_[1:100],
np.s_[10:1],
np.s_[:-3],
np.s_[-5:-4],
np.s_[:-12],
np.s_[-12:],
np.s_[2:],
np.s_[2::3],
np.s_[::2],
np.s_[::-1],
np.s_[::-2],
np.s_[1:6:2],
np.s_[:-6:-2],
],
)
@pytest.mark.parametrize(
"as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
)
def test_getslice(self, slc, as_dense):
as_dense = np.array(as_dense)
arr = SparseArray(as_dense)

result = arr[slc]
expected = SparseArray(as_dense[slc])

tm.assert_sp_array_equal(result, expected)

def test_getslice_tuple(self):
dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
Expand Down