Skip to content

PERF: sparse take #43654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 21, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,19 @@ def time_min_max(self, func, fill_value):
getattr(self.sp_arr, func)()


class Take:

params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False])
param_names = ["indices", "allow_fill"]

def setup(self, indices, allow_fill):
N = 1_000_000
fill_value = 0.0
arr = make_array(N, 1e-5, fill_value, np.float64)
self.sp_arr = SparseArray(arr, fill_value=fill_value)

def time_take(self, indices, allow_fill):
self.sp_arr.take(indices, allow_fill=allow_fill)


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ Performance improvements
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`?`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fill in ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

-

.. ---------------------------------------------------------------------------
Expand Down
33 changes: 8 additions & 25 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,10 +953,7 @@ def take(
elif allow_fill:
result = self._take_with_fill(indices, fill_value=fill_value)
else:
# error: Incompatible types in assignment (expression has type
# "Union[ndarray, SparseArray]", variable has type "ndarray")
result = self._take_without_fill(indices) # type: ignore[assignment]
dtype = self.dtype
return self._take_without_fill(indices)

return type(self)(
result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
Expand Down Expand Up @@ -1027,9 +1024,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:

return taken

def _take_without_fill(self, indices) -> np.ndarray | SparseArray:
def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
to_shift = indices < 0
indices = indices.copy()

n = len(self)

Expand All @@ -1040,30 +1036,17 @@ def _take_without_fill(self, indices) -> np.ndarray | SparseArray:
raise IndexError("out of bounds value in 'indices'.")

if to_shift.any():
indices = indices.copy()
indices[to_shift] += n

if self.sp_index.npoints == 0:
# edge case in take...
# I think just return
out = np.full(
indices.shape,
self.fill_value,
dtype=np.result_type(type(self.fill_value)),
)
arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)

sp_indexer = self.sp_index.lookup_array(indices)
taken = self.sp_values.take(sp_indexer)
fillable = sp_indexer < 0
value_mask = sp_indexer != -1
new_sp_values = self.sp_values[sp_indexer[value_mask]]

if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value
value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

return taken
new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Main reasoning for the simplifications here keeping same behavior is that take with allow_fill=False should give a result with the same dtype as our SparseArray


def searchsorted(
self,
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,15 +382,15 @@ def test_take_filling_fill_value(self):
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]), fill_value=True)

def test_take_filling_all_nan(self):
sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan])
# XXX: did the default kind from take change?
@pytest.mark.parametrize("kind", ["block", "integer"])
def test_take_filling_all_nan(self, kind):
sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind)
result = sparse.take(np.array([1, 0, -1]))
expected = SparseArray([np.nan, np.nan, np.nan], kind="block")
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
tm.assert_sp_array_equal(result, expected)

result = sparse.take(np.array([1, 0, -1]), fill_value=True)
expected = SparseArray([np.nan, np.nan, np.nan], kind="block")
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In all other take cases, we preserve the kind of SparseIndex, so I think it makes sense to do so here (otherwise is surprising value-dependent behavior IMO, somewhat expressed in the comment originally here).

tm.assert_sp_array_equal(result, expected)

msg = "out of bounds value in 'indices'"
Expand Down