
Commit d6d8e3b

PERF: materialize less on slice in sparse __getitem__ (pandas-dev#43777)

mzeitlin11 authored and gasparitiago committed
1 parent 2c897db · commit d6d8e3b

4 files changed: +81 -26 lines

asv_bench/benchmarks/sparse.py (+13)

@@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill):
         self.sp_arr.take(indices, allow_fill=allow_fill)


+class GetItem:
+    def setup(self):
+        N = 1_000_000
+        arr = make_array(N, 1e-5, np.nan, np.float64)
+        self.sp_arr = SparseArray(arr)
+
+    def time_integer_indexing(self):
+        self.sp_arr[78]
+
+    def time_slice(self):
+        self.sp_arr[1:]
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
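A quick way to exercise the same operations outside of asv is to time them directly. The following is a minimal sketch, assuming pandas and NumPy are installed; it only mirrors the GetItem benchmark above and is not part of the benchmark suite, and the absolute numbers are machine-dependent.

# Minimal timing sketch mirroring the GetItem benchmark above.
import timeit

import numpy as np
import pandas as pd

N = 1_000_000
dense = np.full(N, np.nan)
dense[::100_000] = 1.0  # a handful of non-fill values in an otherwise all-NaN array
sp_arr = pd.arrays.SparseArray(dense)

print("integer indexing:", timeit.timeit(lambda: sp_arr[78], number=1_000))
print("slice (step=1):  ", timeit.timeit(lambda: sp_arr[1:], number=1_000))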

doc/source/whatsnew/v1.4.0.rst (+2)

@@ -357,8 +357,10 @@ Performance improvements
 - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
 - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
 - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
+- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
 - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
+-

 .. ---------------------------------------------------------------------------

pandas/core/arrays/sparse/array.py (+35 -9)

@@ -892,13 +892,39 @@ def __getitem__(
         elif isinstance(key, tuple):
             data_slice = self.to_dense()[key]
         elif isinstance(key, slice):
-            # special case to preserve dtypes
-            if key == slice(None):
-                return self.copy()
-            # TODO: this logic is surely elsewhere
-            # TODO: this could be more efficient
-            indices = np.arange(len(self), dtype=np.int32)[key]
-            return self.take(indices)
+
+            # Avoid densifying when handling contiguous slices
+            if key.step is None or key.step == 1:
+                start = 0 if key.start is None else key.start
+                if start < 0:
+                    start += len(self)
+
+                end = len(self) if key.stop is None else key.stop
+                if end < 0:
+                    end += len(self)
+
+                indices = self.sp_index.to_int_index().indices
+                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
+                sp_vals = self.sp_values[keep_inds]
+
+                sp_index = indices[keep_inds].copy()
+
+                # If we've sliced to not include the start of the array, all our indices
+                # should be shifted. NB: here we are careful to also not shift by a
+                # negative value for a case like [0, 1][-100:] where the start index
+                # should be treated like 0
+                if start > 0:
+                    sp_index -= start
+
+                # Length of our result should match applying this slice to a range
+                # of the length of our original array
+                new_len = len(range(len(self))[key])
+                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
+                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
+            else:
+                indices = np.arange(len(self), dtype=np.int32)[key]
+                return self.take(indices)
+
         else:
             # TODO: I think we can avoid densifying when masking a
             # boolean SparseArray with another. Need to look at the

@@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex

 def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
     index: SparseIndex
-    if kind == "block" or isinstance(kind, BlockIndex):
+    if kind == "block":
         locs, lens = splib.get_blocks(indices)
         index = BlockIndex(length, locs, lens)
-    elif kind == "integer" or isinstance(kind, IntIndex):
+    elif kind == "integer":
        index = IntIndex(length, indices)
     else:  # pragma: no cover
         raise ValueError("must be block or integer type")
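To make the new contiguous-slice branch easier to follow, here is a standalone NumPy-only sketch of the same index arithmetic: normalize the slice bounds, keep the non-fill positions inside the window, shift them relative to the new origin, and compute the result length from slicing a range. The function name demo_slice_sparse_indices is purely illustrative and not part of pandas.

# Illustrative sketch of the contiguous-slice arithmetic above, using plain NumPy.
# `sp_indices` holds the positions of non-fill values; `key` is a slice with step 1.
import numpy as np


def demo_slice_sparse_indices(sp_indices: np.ndarray, length: int, key: slice):
    # Normalize the slice bounds the same way the new branch does.
    start = 0 if key.start is None else key.start
    if start < 0:
        start += length
    end = length if key.stop is None else key.stop
    if end < 0:
        end += length

    # Keep only the non-fill positions that fall inside [start, end).
    keep = np.flatnonzero((sp_indices >= start) & (sp_indices < end))
    new_indices = sp_indices[keep].copy()

    # Shift positions so they are relative to the sliced array's origin,
    # but never shift by a negative start (e.g. arr[-100:] on a short array).
    if start > 0:
        new_indices -= start

    # The result length matches applying the slice to a range of the original length.
    new_len = len(range(length)[key])
    return new_indices, new_len


# Non-fill values at positions 2, 5, 9 in an array of length 10, sliced with [3:8]:
# only position 5 survives, shifted to 2, and the sliced array has length 5.
print(demo_slice_sparse_indices(np.array([2, 5, 9]), 10, slice(3, 8)))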

pandas/tests/arrays/sparse/test_array.py (+31 -17)

@@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self):
         expected = SparseArray([0, 2])
         tm.assert_sp_array_equal(result, expected)

-    def test_getslice(self):
-        result = self.arr[:-3]
-        exp = SparseArray(self.arr.to_dense()[:-3])
-        tm.assert_sp_array_equal(result, exp)
-
-        result = self.arr[-4:]
-        exp = SparseArray(self.arr.to_dense()[-4:])
-        tm.assert_sp_array_equal(result, exp)
-
-        # two corner cases from Series
-        result = self.arr[-12:]
-        exp = SparseArray(self.arr)
-        tm.assert_sp_array_equal(result, exp)
-
-        result = self.arr[:-12]
-        exp = SparseArray(self.arr.to_dense()[:0])
-        tm.assert_sp_array_equal(result, exp)
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            np.s_[:],
+            np.s_[1:10],
+            np.s_[1:100],
+            np.s_[10:1],
+            np.s_[:-3],
+            np.s_[-5:-4],
+            np.s_[:-12],
+            np.s_[-12:],
+            np.s_[2:],
+            np.s_[2::3],
+            np.s_[::2],
+            np.s_[::-1],
+            np.s_[::-2],
+            np.s_[1:6:2],
+            np.s_[:-6:-2],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
+    )
+    def test_getslice(self, slc, as_dense):
+        as_dense = np.array(as_dense)
+        arr = SparseArray(as_dense)
+
+        result = arr[slc]
+        expected = SparseArray(as_dense[slc])
+
+        tm.assert_sp_array_equal(result, expected)

     def test_getslice_tuple(self):
         dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
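A brief note on the parametrization above: np.s_ is NumPy's slice-building helper, so every "slc" case is just an ordinary Python slice object that can be applied to both the dense and the sparse array. A tiny illustration:

import numpy as np

print(np.s_[1:10])    # slice(1, 10, None)
print(np.s_[::-2])    # slice(None, None, -2)
print(np.s_[:-6:-2])  # slice(None, -6, -2)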
