From c54b60af899cc355901801b9f9c36f37f195fe59 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 18 Sep 2021 21:56:10 -0400 Subject: [PATCH 1/5] Clean up casting --- pandas/core/arrays/sparse/array.py | 33 +++++------------------- pandas/tests/arrays/sparse/test_array.py | 10 +++---- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 8e92114d7b3de..cfb9a8d9e733c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -953,10 +953,7 @@ def take( elif allow_fill: result = self._take_with_fill(indices, fill_value=fill_value) else: - # error: Incompatible types in assignment (expression has type - # "Union[ndarray, SparseArray]", variable has type "ndarray") - result = self._take_without_fill(indices) # type: ignore[assignment] - dtype = self.dtype + return self._take_without_fill(indices) return type(self)( result, fill_value=self.fill_value, kind=self.kind, dtype=dtype @@ -1027,7 +1024,7 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: return taken - def _take_without_fill(self, indices) -> np.ndarray | SparseArray: + def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: to_shift = indices < 0 indices = indices.copy() @@ -1042,28 +1039,12 @@ def _take_without_fill(self, indices) -> np.ndarray | SparseArray: if to_shift.any(): indices[to_shift] += n - if self.sp_index.npoints == 0: - # edge case in take... - # I think just return - out = np.full( - indices.shape, - self.fill_value, - dtype=np.result_type(type(self.fill_value)), - ) - arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) - return type(self)(arr, sparse_index=sp_index, fill_value=fill_value) - sp_indexer = self.sp_index.lookup_array(indices) - taken = self.sp_values.take(sp_indexer) - fillable = sp_indexer < 0 - - if fillable.any(): - # TODO: may need to coerce array to fill value - result_type = np.result_type(taken, type(self.fill_value)) - taken = taken.astype(result_type) - taken[fillable] = self.fill_value - - return taken + value_mask = sp_indexer != -1 + new_sp_values = self.sp_values[sp_indexer[value_mask]] + value_indices = np.flatnonzero(value_mask).astype(np.int32) + new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind) + return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype) def searchsorted( self, diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8c64c5bb3a055..34ee68dbbbf18 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -382,15 +382,15 @@ def test_take_filling_fill_value(self): with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5]), fill_value=True) - def test_take_filling_all_nan(self): - sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) - # XXX: did the default kind from take change? + @pytest.mark.parametrize("kind", ["block", "integer"]) + def test_take_filling_all_nan(self, kind): + sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind) result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([np.nan, np.nan, np.nan], kind="block") + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) tm.assert_sp_array_equal(result, expected) result = sparse.take(np.array([1, 0, -1]), fill_value=True) - expected = SparseArray([np.nan, np.nan, np.nan], kind="block") + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) tm.assert_sp_array_equal(result, expected) msg = "out of bounds value in 'indices'" From 3a6c3e48e4edf74bd05510d2cfc32d9c2ce4cfbe Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 18 Sep 2021 22:44:19 -0400 Subject: [PATCH 2/5] Add benchmark and whatsnew --- asv_bench/benchmarks/sparse.py | 15 +++++++++++++++ doc/source/whatsnew/v1.4.0.rst | 1 + 2 files changed, 16 insertions(+) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 32baec25bef2d..ff1c4c92fe551 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -180,4 +180,19 @@ def time_min_max(self, func, fill_value): getattr(self.sp_arr, func)() +class Take: + + params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False]) + param_names = ["indices", "allow_fill"] + + def setup(self, indices, allow_fill): + N = 1_000_000 + fill_value = 0.0 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_take(self, indices, allow_fill): + self.sp_arr.take(indices, allow_fill=allow_fill) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f18b3b75ca3d2..685bd334c5ec0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -354,6 +354,7 @@ Performance improvements - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) +- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`?`) - .. --------------------------------------------------------------------------- From daadddccc58ca26e5cc0883343345ece69599b86 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 18 Sep 2021 22:50:06 -0400 Subject: [PATCH 3/5] Avoid copy --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index cfb9a8d9e733c..fa43af90699b5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1026,7 +1026,6 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: to_shift = indices < 0 - indices = indices.copy() n = len(self) @@ -1037,6 +1036,7 @@ def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: raise IndexError("out of bounds value in 'indices'.") if to_shift.any(): + indices = indices.copy() indices[to_shift] += n sp_indexer = self.sp_index.lookup_array(indices) From 4de1fe27af5e68d1a581af1a13b03f3aa64c94fb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 18 Sep 2021 22:58:05 -0400 Subject: [PATCH 4/5] Clean up spacing --- pandas/core/arrays/sparse/array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index fa43af90699b5..57f9c7262bce3 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1042,7 +1042,9 @@ def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: sp_indexer = self.sp_index.lookup_array(indices) value_mask = sp_indexer != -1 new_sp_values = self.sp_values[sp_indexer[value_mask]] - value_indices = np.flatnonzero(value_mask).astype(np.int32) + + value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False) + new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind) return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype) From 58efeb5875ee235e1090c8be19fddb7303c7963a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Mon, 20 Sep 2021 13:02:52 -0400 Subject: [PATCH 5/5] Update doc/source/whatsnew/v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ff7dc1b0c3f8a..911b7db8e55f3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -354,7 +354,7 @@ Performance improvements - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) -- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`?`) +- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654 `) - .. ---------------------------------------------------------------------------