diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 411b2b0abaf5a..0fafa6003c945 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -766,6 +766,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) +- Bug in sparse indexing where using a ``SparseArray`` with ``bool`` dtype as the indexer may return an incorrect result (:issue:`13985`) .. _whatsnew_0190.deprecations: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 933ecd1b8de86..a7cc3b9dddd36 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -10,6 +10,7 @@ is_list_like, is_sequence, is_scalar, + is_sparse, _ensure_platform_int) from pandas.types.missing import isnull, _infer_fill_value @@ -1811,9 +1812,10 @@ def check_bool_indexer(ax, key): mask = isnull(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series key provided') - result = result.astype(bool)._values - + elif is_sparse(result): + result = result.to_dense() + result = np.asarray(result, dtype=bool) else: # is_bool_indexer has already checked for nulls in the case of an # object array key, so no check needed here diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index e22a62ee7f917..d14a8eadddc13 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -17,6 +17,7 @@ from pandas.types.generic import ABCSparseArray, ABCSparseSeries from pandas.types.common import (is_float, is_integer, is_integer_dtype, _ensure_platform_int, + is_bool_dtype, is_list_like, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, @@ -385,7 +386,10 @@ def __getitem__(self, 
key): data_slice = self.values[key] else: if isinstance(key, SparseArray): - key = np.asarray(key) + if is_bool_dtype(key): + key = key.to_dense() + else: + key = np.asarray(key) if hasattr(key, '__len__') and len(self) != len(key): return self.take(key) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 9045784287d9c..4ad77b4deab4f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -609,6 +609,7 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs): ------- taken : ndarray """ + convert = nv.validate_take_with_convert(convert, args, kwargs) new_values = SparseArray.take(self.values, indices) new_index = self.index.take(indices) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 74c3785b06d77..d176d95bb7dbf 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -36,6 +36,10 @@ def test_getitem(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_slice(self): orig = self.orig sparse = self.sparse @@ -68,6 +72,10 @@ def test_getitem_fill_value(self): exp = orig[orig % 2 == 1].to_sparse(fill_value=0) tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_ellipsis(self): # GH 9467 s = pd.SparseSeries([1, np.nan, 2, 0, np.nan]) @@ -116,6 +124,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE')) sparse = orig.to_sparse() @@ -137,6 +149,10 @@ def test_loc_index(self): exp = 
orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) sparse = orig.to_sparse(fill_value=0) @@ -368,6 +384,35 @@ def test_reindex_fill_value(self): exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) + def tests_indexing_with_sparse(self): + # GH 13985 + + for kind in ['integer', 'block']: + for fill in [True, False, np.nan]: + arr = pd.SparseArray([1, 2, 3], kind=kind) + indexer = pd.SparseArray([True, False, True], fill_value=fill, + dtype=bool) + + tm.assert_sp_array_equal(pd.SparseArray([1, 3], kind=kind), + arr[indexer]) + + s = pd.SparseSeries(arr, index=['a', 'b', 'c'], + dtype=np.float64) + exp = pd.SparseSeries([1, 3], index=['a', 'c'], + dtype=np.float64, kind=kind) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + tm.assert_sp_series_equal(s.iloc[indexer], exp) + + indexer = pd.SparseSeries(indexer, index=['a', 'b', 'c']) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + + msg = ("iLocation based boolean indexing cannot use an " + "indexable as a mask") + with tm.assertRaisesRegexp(ValueError, msg): + s.iloc[indexer] + class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): @@ -405,6 +450,10 @@ def test_getitem_multi(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -454,6 +503,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse 
% 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -578,6 +631,10 @@ def test_loc(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_index(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], @@ -627,6 +684,10 @@ def test_loc_index(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_slice(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan],