diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 814dbe999d5c1..1864367db0c0b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1123,7 +1123,7 @@ Sparse - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) -- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) +- Bug where :class:`DataFrame` containing an all-sparse :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) - Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b18a58da3950f..1d675b54a9c62 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -862,21 +862,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: else: raise IndexError("cannot do a non-empty take from an empty axes.") + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) sp_indexer = self.sp_index.lookup_array(indices) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - if self.sp_index.npoints == 0: + if self.sp_index.npoints == 0 and old_fill_indices.all(): + # We've looked up all valid points on an all-sparse array. + taken = np.full( + sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype + ) + + elif self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) - # sp_indexer may be -1 for two reasons - # 1.) we took for an index of -1 (new) - # 2.) we took a value that was self.fill_value (old) - new_fill_indices = indices == -1 - old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - # Fill in two steps. # Old fill values # New fill values diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a4b3318d3aa7..cc0f09ced7399 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1636,10 +1636,7 @@ def _holder(self): @property def fill_value(self): # Used in reindex_indexer - if is_sparse(self.values): - return self.values.dtype.fill_value - else: - return self.values.dtype.na_value + return self.values.dtype.na_value @property def _can_hold_na(self): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index d0cdec712f39d..04215bfe1bedb 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -281,6 +281,11 @@ def test_take(self): exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) + def test_take_all_empty(self): + a = pd.array([0, 0], dtype=pd.SparseDtype("int64")) + result = a.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(a, result) + def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) sparse = SparseArray(data, fill_value=0) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5d0ea69007e27..251376798efc3 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -399,31 +399,3 @@ def test_item(self, data): with pytest.raises(ValueError, match=msg): s.item() - - def test_boolean_mask_frame_fill_value(self, data): - # https://github.com/pandas-dev/pandas/issues/27781 - df = pd.DataFrame({"A": data}) - - mask = np.random.choice([True, False], df.shape[0]) - result = pd.isna(df.iloc[mask]["A"]) - expected = pd.isna(df["A"].iloc[mask]) - self.assert_series_equal(result, expected) - - mask = pd.Series(mask, index=df.index) - result = pd.isna(df.loc[mask]["A"]) - expected = pd.isna(df["A"].loc[mask]) - self.assert_series_equal(result, expected) - - def test_fancy_index_frame_fill_value(self, data): - # https://github.com/pandas-dev/pandas/issues/29563 - df = pd.DataFrame({"A": data}) - - mask = np.random.choice(df.shape[0], df.shape[0]) - result = pd.isna(df.iloc[mask]["A"]) - expected = pd.isna(df["A"].iloc[mask]) - self.assert_series_equal(result, expected) - - mask = pd.Series(mask, index=df.index) - result = pd.isna(df.loc[mask]["A"]) - expected = pd.isna(df["A"].loc[mask]) - self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 68e521b005c02..b411ca1c482a4 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -41,11 +41,6 @@ def data_for_twos(request): return SparseArray(np.ones(100) * 2) -@pytest.fixture(params=[0, np.nan]) -def data_zeros(request): - return SparseArray(np.zeros(100, dtype=int), fill_value=request.param) - - @pytest.fixture(params=[0, np.nan]) def data_missing(request): """Length 2 array with [NA, Valid]""" diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 876fbe212c466..04e1c8b94c4d9 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -49,3 +49,23 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): result = df.loc[itr_idx].dtypes.values expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) + + def test_reindex(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = pd.DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = pd.DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + + def test_all_sparse(self): + df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))}) + result = df.loc[[0, 1]] + tm.assert_frame_equal(result, df)