Skip to content

Commit a5264b8

Browse files
authored
PERF: sparse take (#43654)
1 parent 4e3bcb6 commit a5264b8

File tree

4 files changed

+29
-30
lines changed

4 files changed

+29
-30
lines changed

asv_bench/benchmarks/sparse.py

+15
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,19 @@ def time_min_max(self, func, fill_value):
180180
getattr(self.sp_arr, func)()
181181

182182

183+
class Take:
184+
185+
params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False])
186+
param_names = ["indices", "allow_fill"]
187+
188+
def setup(self, indices, allow_fill):
189+
N = 1_000_000
190+
fill_value = 0.0
191+
arr = make_array(N, 1e-5, fill_value, np.float64)
192+
self.sp_arr = SparseArray(arr, fill_value=fill_value)
193+
194+
def time_take(self, indices, allow_fill):
195+
self.sp_arr.take(indices, allow_fill=allow_fill)
196+
197+
183198
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ Performance improvements
354354
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
355355
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
356356
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
357+
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
357358
-
358359

359360
.. ---------------------------------------------------------------------------

pandas/core/arrays/sparse/array.py

+8-25
Original file line numberDiff line numberDiff line change
@@ -953,10 +953,7 @@ def take(
953953
elif allow_fill:
954954
result = self._take_with_fill(indices, fill_value=fill_value)
955955
else:
956-
# error: Incompatible types in assignment (expression has type
957-
# "Union[ndarray, SparseArray]", variable has type "ndarray")
958-
result = self._take_without_fill(indices) # type: ignore[assignment]
959-
dtype = self.dtype
956+
return self._take_without_fill(indices)
960957

961958
return type(self)(
962959
result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
@@ -1027,9 +1024,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
10271024

10281025
return taken
10291026

1030-
def _take_without_fill(self, indices) -> np.ndarray | SparseArray:
1027+
def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
10311028
to_shift = indices < 0
1032-
indices = indices.copy()
10331029

10341030
n = len(self)
10351031

@@ -1040,30 +1036,17 @@ def _take_without_fill(self, indices) -> np.ndarray | SparseArray:
10401036
raise IndexError("out of bounds value in 'indices'.")
10411037

10421038
if to_shift.any():
1039+
indices = indices.copy()
10431040
indices[to_shift] += n
10441041

1045-
if self.sp_index.npoints == 0:
1046-
# edge case in take...
1047-
# I think just return
1048-
out = np.full(
1049-
indices.shape,
1050-
self.fill_value,
1051-
dtype=np.result_type(type(self.fill_value)),
1052-
)
1053-
arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
1054-
return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)
1055-
10561042
sp_indexer = self.sp_index.lookup_array(indices)
1057-
taken = self.sp_values.take(sp_indexer)
1058-
fillable = sp_indexer < 0
1043+
value_mask = sp_indexer != -1
1044+
new_sp_values = self.sp_values[sp_indexer[value_mask]]
10591045

1060-
if fillable.any():
1061-
# TODO: may need to coerce array to fill value
1062-
result_type = np.result_type(taken, type(self.fill_value))
1063-
taken = taken.astype(result_type)
1064-
taken[fillable] = self.fill_value
1046+
value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)
10651047

1066-
return taken
1048+
new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
1049+
return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
10671050

10681051
def searchsorted(
10691052
self,

pandas/tests/arrays/sparse/test_array.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -382,15 +382,15 @@ def test_take_filling_fill_value(self):
382382
with pytest.raises(IndexError, match=msg):
383383
sparse.take(np.array([1, 5]), fill_value=True)
384384

385-
def test_take_filling_all_nan(self):
386-
sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan])
387-
# XXX: did the default kind from take change?
385+
@pytest.mark.parametrize("kind", ["block", "integer"])
386+
def test_take_filling_all_nan(self, kind):
387+
sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind)
388388
result = sparse.take(np.array([1, 0, -1]))
389-
expected = SparseArray([np.nan, np.nan, np.nan], kind="block")
389+
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
390390
tm.assert_sp_array_equal(result, expected)
391391

392392
result = sparse.take(np.array([1, 0, -1]), fill_value=True)
393-
expected = SparseArray([np.nan, np.nan, np.nan], kind="block")
393+
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
394394
tm.assert_sp_array_equal(result, expected)
395395

396396
msg = "out of bounds value in 'indices'"

0 commit comments

Comments
 (0)