Skip to content

Commit 92c9e69

Browse files
Fix indexing, reindex on all-sparse SparseArray. (pandas-dev#35287)
1 parent 1b16a79 commit 92c9e69

File tree

7 files changed

+39
-45
lines changed

7 files changed

+39
-45
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1163,7 +1163,7 @@ Sparse
11631163
- Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`)
11641164
- Bug where :meth:`arrays.SparseArray.from_spmatrix` wrongly read a scipy sparse matrix (:issue:`31991`)
11651165
- Bug where :meth:`Series.sum` with ``SparseArray`` raised ``TypeError`` (:issue:`25777`)
1166-
- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
1166+
- Bug where a :class:`DataFrame` containing an all-sparse :class:`SparseArray` was filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
11671167
- The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`)
11681168
- Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
11691169
- Bug where :class:`arrays.SparseArray` returned the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`)

pandas/core/arrays/sparse/array.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -862,21 +862,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
862862
else:
863863
raise IndexError("cannot do a non-empty take from an empty axes.")
864864

865+
# sp_indexer may be -1 for two reasons
866+
# 1.) we took for an index of -1 (new)
867+
# 2.) we took a value that was self.fill_value (old)
865868
sp_indexer = self.sp_index.lookup_array(indices)
869+
new_fill_indices = indices == -1
870+
old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
866871

867-
if self.sp_index.npoints == 0:
872+
if self.sp_index.npoints == 0 and old_fill_indices.all():
873+
# We've looked up all valid points on an all-sparse array.
874+
taken = np.full(
875+
sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
876+
)
877+
878+
elif self.sp_index.npoints == 0:
868879
# Avoid taking from the empty self.sp_values
869880
_dtype = np.result_type(self.dtype.subtype, type(fill_value))
870881
taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
871882
else:
872883
taken = self.sp_values.take(sp_indexer)
873884

874-
# sp_indexer may be -1 for two reasons
875-
# 1.) we took for an index of -1 (new)
876-
# 2.) we took a value that was self.fill_value (old)
877-
new_fill_indices = indices == -1
878-
old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
879-
880885
# Fill in two steps.
881886
# Old fill values
882887
# New fill values

pandas/core/internals/blocks.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1636,10 +1636,7 @@ def _holder(self):
16361636
@property
16371637
def fill_value(self):
16381638
# Used in reindex_indexer
1639-
if is_sparse(self.values):
1640-
return self.values.dtype.fill_value
1641-
else:
1642-
return self.values.dtype.na_value
1639+
return self.values.dtype.na_value
16431640

16441641
@property
16451642
def _can_hold_na(self):

pandas/tests/arrays/sparse/test_array.py

+5
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,11 @@ def test_take(self):
281281
exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
282282
tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
283283

284+
def test_take_all_empty(self):
285+
a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
286+
result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
287+
tm.assert_sp_array_equal(a, result)
288+
284289
def test_take_fill_value(self):
285290
data = np.array([1, np.nan, 0, 3, 0])
286291
sparse = SparseArray(data, fill_value=0)

pandas/tests/extension/base/getitem.py

-28
Original file line numberDiff line numberDiff line change
@@ -399,31 +399,3 @@ def test_item(self, data):
399399

400400
with pytest.raises(ValueError, match=msg):
401401
s.item()
402-
403-
def test_boolean_mask_frame_fill_value(self, data):
404-
# https://github.com/pandas-dev/pandas/issues/27781
405-
df = pd.DataFrame({"A": data})
406-
407-
mask = np.random.choice([True, False], df.shape[0])
408-
result = pd.isna(df.iloc[mask]["A"])
409-
expected = pd.isna(df["A"].iloc[mask])
410-
self.assert_series_equal(result, expected)
411-
412-
mask = pd.Series(mask, index=df.index)
413-
result = pd.isna(df.loc[mask]["A"])
414-
expected = pd.isna(df["A"].loc[mask])
415-
self.assert_series_equal(result, expected)
416-
417-
def test_fancy_index_frame_fill_value(self, data):
418-
# https://github.com/pandas-dev/pandas/issues/29563
419-
df = pd.DataFrame({"A": data})
420-
421-
mask = np.random.choice(df.shape[0], df.shape[0])
422-
result = pd.isna(df.iloc[mask]["A"])
423-
expected = pd.isna(df["A"].iloc[mask])
424-
self.assert_series_equal(result, expected)
425-
426-
mask = pd.Series(mask, index=df.index)
427-
result = pd.isna(df.loc[mask]["A"])
428-
expected = pd.isna(df["A"].loc[mask])
429-
self.assert_series_equal(result, expected)

pandas/tests/extension/test_sparse.py

-5
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ def data_for_twos(request):
4141
return SparseArray(np.ones(100) * 2)
4242

4343

44-
@pytest.fixture(params=[0, np.nan])
45-
def data_zeros(request):
46-
return SparseArray(np.zeros(100, dtype=int), fill_value=request.param)
47-
48-
4944
@pytest.fixture(params=[0, np.nan])
5045
def data_missing(request):
5146
"""Length 2 array with [NA, Valid]"""

pandas/tests/frame/indexing/test_sparse.py

+20
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,23 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
4949
result = df.loc[itr_idx].dtypes.values
5050
expected = np.full(cols, SparseDtype(dtype, fill_value=0))
5151
tm.assert_numpy_array_equal(result, expected)
52+
53+
def test_reindex(self):
54+
# https://github.com/pandas-dev/pandas/issues/35286
55+
df = pd.DataFrame(
56+
{"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
57+
)
58+
result = df.reindex([0, 2])
59+
expected = pd.DataFrame(
60+
{
61+
"A": [0.0, np.nan],
62+
"B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
63+
},
64+
index=[0, 2],
65+
)
66+
tm.assert_frame_equal(result, expected)
67+
68+
def test_all_sparse(self):
69+
df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
70+
result = df.loc[[0, 1]]
71+
tm.assert_frame_equal(result, df)

0 commit comments

Comments
 (0)