
Commit d6d8e3b

PERF: materialize less on slice in sparse __getitem__ (pandas-dev#43777)

mzeitlin11 authored and gasparitiago committed
1 parent 2c897db · commit d6d8e3b

4 files changed: +81 -26 lines

asv_bench/benchmarks/sparse.py (+13)

@@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill):
         self.sp_arr.take(indices, allow_fill=allow_fill)


+class GetItem:
+    def setup(self):
+        N = 1_000_000
+        arr = make_array(N, 1e-5, np.nan, np.float64)
+        self.sp_arr = SparseArray(arr)
+
+    def time_integer_indexing(self):
+        self.sp_arr[78]
+
+    def time_slice(self):
+        self.sp_arr[1:]
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
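A quick way to exercise the same operations outside of asv is to time them directly. The following is a minimal sketch, assuming pandas and NumPy are installed; it only mirrors the GetItem benchmark above and is not part of the benchmark suite, and the absolute numbers are machine-dependent.

# Minimal timing sketch mirroring the GetItem benchmark above.
import timeit

import numpy as np
import pandas as pd

N = 1_000_000
dense = np.full(N, np.nan)
dense[::100_000] = 1.0  # a handful of non-fill values in an otherwise all-NaN array
sp_arr = pd.arrays.SparseArray(dense)

print("integer indexing:", timeit.timeit(lambda: sp_arr[78], number=1_000))
print("slice (step=1):  ", timeit.timeit(lambda: sp_arr[1:], number=1_000))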

doc/source/whatsnew/v1.4.0.rst (+2)

@@ -357,8 +357,10 @@ Performance improvements
 - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
 - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
 - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
+- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
 - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
+-

 .. ---------------------------------------------------------------------------

pandas/core/arrays/sparse/array.py (+35 -9)

@@ -892,13 +892,39 @@ def __getitem__(
         elif isinstance(key, tuple):
             data_slice = self.to_dense()[key]
         elif isinstance(key, slice):
-            # special case to preserve dtypes
-            if key == slice(None):
-                return self.copy()
-            # TODO: this logic is surely elsewhere
-            # TODO: this could be more efficient
-            indices = np.arange(len(self), dtype=np.int32)[key]
-            return self.take(indices)
+
+            # Avoid densifying when handling contiguous slices
+            if key.step is None or key.step == 1:
+                start = 0 if key.start is None else key.start
+                if start < 0:
+                    start += len(self)
+
+                end = len(self) if key.stop is None else key.stop
+                if end < 0:
+                    end += len(self)
+
+                indices = self.sp_index.to_int_index().indices
+                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
+                sp_vals = self.sp_values[keep_inds]
+
+                sp_index = indices[keep_inds].copy()
+
+                # If we've sliced to not include the start of the array, all our indices
+                # should be shifted. NB: here we are careful to also not shift by a
+                # negative value for a case like [0, 1][-100:] where the start index
+                # should be treated like 0
+                if start > 0:
+                    sp_index -= start
+
+                # Length of our result should match applying this slice to a range
+                # of the length of our original array
+                new_len = len(range(len(self))[key])
+                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
+                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
+            else:
+                indices = np.arange(len(self), dtype=np.int32)[key]
+                return self.take(indices)
+
         else:
             # TODO: I think we can avoid densifying when masking a
             # boolean SparseArray with another. Need to look at the

@@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex

 def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
     index: SparseIndex
-    if kind == "block" or isinstance(kind, BlockIndex):
+    if kind == "block":
         locs, lens = splib.get_blocks(indices)
         index = BlockIndex(length, locs, lens)
-    elif kind == "integer" or isinstance(kind, IntIndex):
+    elif kind == "integer":
        index = IntIndex(length, indices)
     else:  # pragma: no cover
         raise ValueError("must be block or integer type")
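To make the new contiguous-slice branch easier to follow, here is a standalone NumPy-only sketch of the same index arithmetic: normalize the slice bounds, keep the non-fill positions inside the window, shift them relative to the new origin, and compute the result length from slicing a range. The function name demo_slice_sparse_indices is purely illustrative and not part of pandas.

# Illustrative sketch of the contiguous-slice arithmetic above, using plain NumPy.
# `sp_indices` holds the positions of non-fill values; `key` is a slice with step 1.
import numpy as np


def demo_slice_sparse_indices(sp_indices: np.ndarray, length: int, key: slice):
    # Normalize the slice bounds the same way the new branch does.
    start = 0 if key.start is None else key.start
    if start < 0:
        start += length
    end = length if key.stop is None else key.stop
    if end < 0:
        end += length

    # Keep only the non-fill positions that fall inside [start, end).
    keep = np.flatnonzero((sp_indices >= start) & (sp_indices < end))
    new_indices = sp_indices[keep].copy()

    # Shift positions so they are relative to the sliced array's origin,
    # but never shift by a negative start (e.g. arr[-100:] on a short array).
    if start > 0:
        new_indices -= start

    # The result length matches applying the slice to a range of the original length.
    new_len = len(range(length)[key])
    return new_indices, new_len


# Non-fill values at positions 2, 5, 9 in an array of length 10, sliced with [3:8]:
# only position 5 survives, shifted to 2, and the sliced array has length 5.
print(demo_slice_sparse_indices(np.array([2, 5, 9]), 10, slice(3, 8)))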

pandas/tests/arrays/sparse/test_array.py (+31 -17)

@@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self):
         expected = SparseArray([0, 2])
         tm.assert_sp_array_equal(result, expected)

-    def test_getslice(self):
-        result = self.arr[:-3]
-        exp = SparseArray(self.arr.to_dense()[:-3])
-        tm.assert_sp_array_equal(result, exp)
-
-        result = self.arr[-4:]
-        exp = SparseArray(self.arr.to_dense()[-4:])
-        tm.assert_sp_array_equal(result, exp)
-
-        # two corner cases from Series
-        result = self.arr[-12:]
-        exp = SparseArray(self.arr)
-        tm.assert_sp_array_equal(result, exp)
-
-        result = self.arr[:-12]
-        exp = SparseArray(self.arr.to_dense()[:0])
-        tm.assert_sp_array_equal(result, exp)
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            np.s_[:],
+            np.s_[1:10],
+            np.s_[1:100],
+            np.s_[10:1],
+            np.s_[:-3],
+            np.s_[-5:-4],
+            np.s_[:-12],
+            np.s_[-12:],
+            np.s_[2:],
+            np.s_[2::3],
+            np.s_[::2],
+            np.s_[::-1],
+            np.s_[::-2],
+            np.s_[1:6:2],
+            np.s_[:-6:-2],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
+    )
+    def test_getslice(self, slc, as_dense):
+        as_dense = np.array(as_dense)
+        arr = SparseArray(as_dense)
+
+        result = arr[slc]
+        expected = SparseArray(as_dense[slc])
+
+        tm.assert_sp_array_equal(result, expected)

     def test_getslice_tuple(self):
         dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
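A brief note on the parametrization above: np.s_ is NumPy's slice-building helper, so every "slc" case is just an ordinary Python slice object that can be applied to both the dense and the sparse array. A tiny illustration:

import numpy as np

print(np.s_[1:10])    # slice(1, 10, None)
print(np.s_[::-2])    # slice(None, None, -2)
print(np.s_[:-6:-2])  # slice(None, -6, -2)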
