Skip to content

Commit c90cdde

Browse files
sinhrks authored and jreback committed
ENH/PERF SparseArray.take indexing
Related to #4400. Added more tests for sparse indexing. ``SparseArray.take`` has optimized logic to omit dense ``np.ndarray`` creation. ``SparseSeries.iloc`` can work with negative indices. Made ``SparseArray.take`` handle negative indices by the same rule as ``Index`` (#12676). Author: sinhrks <[email protected]> Closes #12796 from sinhrks/sparse_test_at and squashes the following commits: df1f056 [sinhrks] ENH/PERF SparseArray.take indexing
1 parent 8250d7c commit c90cdde

File tree

9 files changed

+567
-69
lines changed

9 files changed

+567
-69
lines changed

doc/source/whatsnew/v0.18.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ These changes conform sparse handling to return the correct types and work to ma
8181
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
8282
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
8383
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
84-
- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`)
84+
- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others. It now also handles negative indexers by the same rule as ``Index`` (:issue:`10560`, :issue:`12796`)
8585

8686
.. ipython:: python
8787

pandas/core/series.py

-3
Original file line numberDiff line numberDiff line change
@@ -809,9 +809,6 @@ def _set_values(self, key, value):
809809
self._data = self._data.setitem(indexer=key, value=value)
810810
self._maybe_update_cacher()
811811

812-
# help out SparseSeries
813-
_get_val_at = ndarray.__getitem__
814-
815812
def repeat(self, reps):
816813
"""
817814
return a new Series with the values repeated reps times

pandas/indexes/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1332,7 +1332,7 @@ def _ensure_compat_concat(indexes):
13321332
return indexes
13331333

13341334
_index_shared_docs['take'] = """
1335-
return a new Index of the values selected by the indices
1335+
return a new %(klass)s of the values selected by the indices
13361336
13371337
For internal compatibility with numpy arrays.
13381338
@@ -1352,7 +1352,7 @@ def _ensure_compat_concat(indexes):
13521352
numpy.ndarray.take
13531353
"""
13541354

1355-
@Appender(_index_shared_docs['take'])
1355+
@Appender(_index_shared_docs['take'] % _index_doc_kwargs)
13561356
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
13571357
indices = com._ensure_platform_int(indices)
13581358
if self._can_hold_na:

pandas/sparse/array.py

+65-39
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,16 @@
1313
from pandas import compat, lib
1414
from pandas.compat import range
1515

16-
from pandas._sparse import BlockIndex, IntIndex
16+
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
1717
import pandas._sparse as splib
1818
import pandas.index as _index
1919
import pandas.core.ops as ops
2020
import pandas.formats.printing as printing
21+
from pandas.util.decorators import Appender
22+
from pandas.indexes.base import _index_shared_docs
23+
24+
25+
_sparray_doc_kwargs = dict(klass='SparseArray')
2126

2227

2328
def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
@@ -167,10 +172,19 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
167172
fill_value = bool(fill_value)
168173

169174
# Change the class of the array to be the subclass type.
170-
output = subarr.view(cls)
171-
output.sp_index = sparse_index
172-
output.fill_value = fill_value
173-
return output
175+
return cls._simple_new(subarr, sparse_index, fill_value)
176+
177+
@classmethod
178+
def _simple_new(cls, data, sp_index, fill_value):
179+
result = data.view(cls)
180+
181+
if not isinstance(sp_index, SparseIndex):
182+
# caller must pass SparseIndex
183+
raise ValueError('sp_index must be a SparseIndex')
184+
185+
result.sp_index = sp_index
186+
result.fill_value = fill_value
187+
return result
174188

175189
@property
176190
def _constructor(self):
@@ -308,46 +322,53 @@ def _get_val_at(self, loc):
308322
else:
309323
return _index.get_value_at(self, sp_loc)
310324

311-
def take(self, indices, axis=0):
312-
"""
313-
Sparse-compatible version of ndarray.take
325+
@Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
326+
def take(self, indices, axis=0, allow_fill=True,
327+
fill_value=None):
328+
329+
# Sparse-compatible version of ndarray.take, returns SparseArray
314330

315-
Returns
316-
-------
317-
taken : ndarray
318-
"""
319331
if axis:
320332
raise ValueError("axis must be 0, input was {0}".format(axis))
321333

322334
if com.is_integer(indices):
323335
# return scalar
324336
return self[indices]
325337

326-
indices = np.atleast_1d(np.asarray(indices, dtype=int))
327-
328-
# allow -1 to indicate missing values
338+
indices = com._ensure_platform_int(indices)
329339
n = len(self)
330-
if ((indices >= n) | (indices < -1)).any():
331-
raise IndexError('out of bounds access')
332-
333-
if self.sp_index.npoints > 0:
334-
locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1
335-
for loc in indices])
336-
result = self.sp_values.take(locs)
337-
mask = locs == -1
338-
if mask.any():
339-
try:
340-
result[mask] = self.fill_value
341-
except ValueError:
342-
# wrong dtype
343-
result = result.astype('float64')
344-
result[mask] = self.fill_value
345-
340+
if allow_fill and fill_value is not None:
341+
# allow -1 to indicate self.fill_value,
342+
# self.fill_value may not be NaN
343+
if (indices < -1).any():
344+
msg = ('When allow_fill=True and fill_value is not None, '
345+
'all indices must be >= -1')
346+
raise ValueError(msg)
347+
elif (n <= indices).any():
348+
msg = 'index is out of bounds for size {0}'
349+
raise IndexError(msg.format(n))
350+
else:
351+
if ((indices < -n) | (n <= indices)).any():
352+
msg = 'index is out of bounds for size {0}'
353+
raise IndexError(msg.format(n))
354+
355+
indices = indices.astype(np.int32)
356+
if not (allow_fill and fill_value is not None):
357+
indices = indices.copy()
358+
indices[indices < 0] += n
359+
360+
locs = self.sp_index.lookup_array(indices)
361+
indexer = np.arange(len(locs), dtype=np.int32)
362+
mask = locs != -1
363+
if mask.any():
364+
indexer = indexer[mask]
365+
new_values = self.sp_values.take(locs[mask])
346366
else:
347-
result = np.empty(len(indices))
348-
result.fill(self.fill_value)
367+
indexer = np.empty(shape=(0, ), dtype=np.int32)
368+
new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)
349369

350-
return self._constructor(result)
370+
sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
371+
return self._simple_new(new_values, sp_index, self.fill_value)
351372

352373
def __setitem__(self, key, value):
353374
# if com.is_integer(key):
@@ -525,16 +546,21 @@ def make_sparse(arr, kind='block', fill_value=nan):
525546
else:
526547
indices = np.arange(length, dtype=np.int32)[mask]
527548

528-
if kind == 'block':
549+
index = _make_index(length, indices, kind)
550+
sparsified_values = arr[mask]
551+
return sparsified_values, index
552+
553+
554+
def _make_index(length, indices, kind):
555+
556+
if kind == 'block' or isinstance(kind, BlockIndex):
529557
locs, lens = splib.get_blocks(indices)
530558
index = BlockIndex(length, locs, lens)
531-
elif kind == 'integer':
559+
elif kind == 'integer' or isinstance(kind, IntIndex):
532560
index = IntIndex(length, indices)
533561
else: # pragma: no cover
534562
raise ValueError('must be block or integer type')
535-
536-
sparsified_values = arr[mask]
537-
return sparsified_values, index
563+
return index
538564

539565

540566
ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,

pandas/sparse/series.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
165165
if index is None:
166166
index = data.index.view()
167167
else:
168+
168169
data = data.reindex(index, copy=False)
169170

170171
else:
171-
172172
length = len(index)
173173

174174
if data == fill_value or (isnull(data) and isnull(fill_value)):
@@ -376,11 +376,6 @@ def _get_val_at(self, loc):
376376
""" forward to the array """
377377
return self.block.values._get_val_at(loc)
378378

379-
def _slice(self, slobj, axis=0, kind=None):
380-
slobj = self.index._convert_slice_indexer(slobj,
381-
kind=kind or 'getitem')
382-
return self._get_values(slobj)
383-
384379
def __getitem__(self, key):
385380
"""
386381

0 commit comments

Comments
 (0)