Skip to content

Commit 1f88312

Browse files
sinhrks authored and jreback committed
BUG: Sparse indexing with bool sparse may be incorrect
Author: sinhrks <[email protected]>

Closes #13985 from sinhrks/sparse_bool_indexing and squashes the following commits:

0909fa8 [sinhrks] BUG: Sparse indexing with bool sparse may be incorrect
1 parent e0c3291 commit 1f88312

File tree

5 files changed

+72
-3
lines changed

5 files changed

+72
-3
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
766766
- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`)
767767
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
768768
- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`)
769+
- Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`)
769770

770771
.. _whatsnew_0190.deprecations:
771772

pandas/core/indexing.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
is_list_like,
1111
is_sequence,
1212
is_scalar,
13+
is_sparse,
1314
_ensure_platform_int)
1415
from pandas.types.missing import isnull, _infer_fill_value
1516

@@ -1811,9 +1812,10 @@ def check_bool_indexer(ax, key):
18111812
mask = isnull(result._values)
18121813
if mask.any():
18131814
raise IndexingError('Unalignable boolean Series key provided')
1814-
18151815
result = result.astype(bool)._values
1816-
1816+
elif is_sparse(result):
1817+
result = result.to_dense()
1818+
result = np.asarray(result, dtype=bool)
18171819
else:
18181820
# is_bool_indexer has already checked for nulls in the case of an
18191821
# object array key, so no check needed here

pandas/sparse/array.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.generic import ABCSparseArray, ABCSparseSeries
1818
from pandas.types.common import (is_float, is_integer,
1919
is_integer_dtype, _ensure_platform_int,
20+
is_bool_dtype,
2021
is_list_like,
2122
is_scalar, is_dtype_equal)
2223
from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
@@ -385,7 +386,10 @@ def __getitem__(self, key):
385386
data_slice = self.values[key]
386387
else:
387388
if isinstance(key, SparseArray):
388-
key = np.asarray(key)
389+
if is_bool_dtype(key):
390+
key = key.to_dense()
391+
else:
392+
key = np.asarray(key)
389393

390394
if hasattr(key, '__len__') and len(self) != len(key):
391395
return self.take(key)

pandas/sparse/series.py

+1
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,7 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs):
609609
-------
610610
taken : ndarray
611611
"""
612+
612613
convert = nv.validate_take_with_convert(convert, args, kwargs)
613614
new_values = SparseArray.take(self.values, indices)
614615
new_index = self.index.take(indices)

pandas/sparse/tests/test_indexing.py

+61
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ def test_getitem(self):
3636
exp = orig[orig % 2 == 1].to_sparse()
3737
tm.assert_sp_series_equal(result, exp)
3838

39+
# sparse array
40+
result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
41+
tm.assert_sp_series_equal(result, exp)
42+
3943
def test_getitem_slice(self):
4044
orig = self.orig
4145
sparse = self.sparse
@@ -68,6 +72,10 @@ def test_getitem_fill_value(self):
6872
exp = orig[orig % 2 == 1].to_sparse(fill_value=0)
6973
tm.assert_sp_series_equal(result, exp)
7074

75+
# sparse array
76+
result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
77+
tm.assert_sp_series_equal(result, exp)
78+
7179
def test_getitem_ellipsis(self):
7280
# GH 9467
7381
s = pd.SparseSeries([1, np.nan, 2, 0, np.nan])
@@ -116,6 +124,10 @@ def test_loc(self):
116124
exp = orig.loc[orig % 2 == 1].to_sparse()
117125
tm.assert_sp_series_equal(result, exp)
118126

127+
# sparse array
128+
result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
129+
tm.assert_sp_series_equal(result, exp)
130+
119131
def test_loc_index(self):
120132
orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE'))
121133
sparse = orig.to_sparse()
@@ -137,6 +149,10 @@ def test_loc_index(self):
137149
exp = orig.loc[orig % 2 == 1].to_sparse()
138150
tm.assert_sp_series_equal(result, exp)
139151

152+
# sparse array
153+
result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
154+
tm.assert_sp_series_equal(result, exp)
155+
140156
def test_loc_index_fill_value(self):
141157
orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
142158
sparse = orig.to_sparse(fill_value=0)
@@ -368,6 +384,35 @@ def test_reindex_fill_value(self):
368384
exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0)
369385
tm.assert_sp_series_equal(res, exp)
370386

387+
def tests_indexing_with_sparse(self):
388+
# GH 13985
389+
390+
for kind in ['integer', 'block']:
391+
for fill in [True, False, np.nan]:
392+
arr = pd.SparseArray([1, 2, 3], kind=kind)
393+
indexer = pd.SparseArray([True, False, True], fill_value=fill,
394+
dtype=bool)
395+
396+
tm.assert_sp_array_equal(pd.SparseArray([1, 3], kind=kind),
397+
arr[indexer])
398+
399+
s = pd.SparseSeries(arr, index=['a', 'b', 'c'],
400+
dtype=np.float64)
401+
exp = pd.SparseSeries([1, 3], index=['a', 'c'],
402+
dtype=np.float64, kind=kind)
403+
tm.assert_sp_series_equal(s[indexer], exp)
404+
tm.assert_sp_series_equal(s.loc[indexer], exp)
405+
tm.assert_sp_series_equal(s.iloc[indexer], exp)
406+
407+
indexer = pd.SparseSeries(indexer, index=['a', 'b', 'c'])
408+
tm.assert_sp_series_equal(s[indexer], exp)
409+
tm.assert_sp_series_equal(s.loc[indexer], exp)
410+
411+
msg = ("iLocation based boolean indexing cannot use an "
412+
"indexable as a mask")
413+
with tm.assertRaisesRegexp(ValueError, msg):
414+
s.iloc[indexer]
415+
371416

372417
class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):
373418

@@ -405,6 +450,10 @@ def test_getitem_multi(self):
405450
exp = orig[orig % 2 == 1].to_sparse()
406451
tm.assert_sp_series_equal(result, exp)
407452

453+
# sparse array
454+
result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
455+
tm.assert_sp_series_equal(result, exp)
456+
408457
def test_getitem_multi_tuple(self):
409458
orig = self.orig
410459
sparse = self.sparse
@@ -454,6 +503,10 @@ def test_loc(self):
454503
exp = orig.loc[orig % 2 == 1].to_sparse()
455504
tm.assert_sp_series_equal(result, exp)
456505

506+
# sparse array
507+
result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
508+
tm.assert_sp_series_equal(result, exp)
509+
457510
def test_loc_multi_tuple(self):
458511
orig = self.orig
459512
sparse = self.sparse
@@ -578,6 +631,10 @@ def test_loc(self):
578631
exp = orig.loc[orig.x % 2 == 1].to_sparse()
579632
tm.assert_sp_frame_equal(result, exp)
580633

634+
# sparse array
635+
result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
636+
tm.assert_sp_frame_equal(result, exp)
637+
581638
def test_loc_index(self):
582639
orig = pd.DataFrame([[1, np.nan, np.nan],
583640
[2, 3, np.nan],
@@ -627,6 +684,10 @@ def test_loc_index(self):
627684
exp = orig.loc[orig.x % 2 == 1].to_sparse()
628685
tm.assert_sp_frame_equal(result, exp)
629686

687+
# sparse array
688+
result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
689+
tm.assert_sp_frame_equal(result, exp)
690+
630691
def test_loc_slice(self):
631692
orig = pd.DataFrame([[1, np.nan, np.nan],
632693
[2, 3, np.nan],

0 commit comments

Comments
 (0)