Skip to content

Commit 195401f

Browse files
BUG: DataFrame.sparse.from_spmatrix hard codes an invalid fill_value for certain subtypes (#59064)
* BUG: 🐛 ✨ Add fill_value param to from_spmatrix method.
* ENH: ✨ Set explicit fill_value of NaN for complex floats.
* TST: ✅ Fix failing tests.
* TST: ✅ Add tests for from_spmatrix method.
* DOC: 📝 Add what's new entry.
* TST: ✅ Fix failing tests for sparse getitem.
* TST: ✅ Remove test for 256-bit complex float.
* DOC: 📝 Update example in docstring for from_spmatrix method.
* DOC: 📝 Update some docstrings and sparse user guide.
* DOC: ✏️ Update dtype docstring.
  Co-authored-by: Matthew Roeschke <[email protected]>
* BUG: ⏪ 🐛 Revert fill_value change and fix to_coo method.
* TST: ⏪ ✅ Fix and add sparse accessor tests.
* TST: ⏪ ✅ Fix and add sparse getitem tests.
* DOC: ⏪ 📝 Revert fill_value change to sparse user guide.
* CLN: ✏️ Fix instantiation of np.ma.array in test.

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 32ceb4a commit 195401f

File tree

7 files changed

+62
-59
lines changed

7 files changed

+62
-59
lines changed

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ Reshaping
598598
Sparse
599599
^^^^^^
600600
- Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
601-
-
601+
- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
602602

603603
ExtensionArray
604604
^^^^^^^^^^^^^^

pandas/core/arrays/sparse/accessor.py

+5-7
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
291291
Examples
292292
--------
293293
>>> import scipy.sparse
294-
>>> mat = scipy.sparse.eye(3, dtype=float)
294+
>>> mat = scipy.sparse.eye(3, dtype=int)
295295
>>> pd.DataFrame.sparse.from_spmatrix(mat)
296296
0 1 2
297-
0 1.0 0 0
298-
1 0 1.0 0
299-
2 0 0 1.0
297+
0 1 0 0
298+
1 0 1 0
299+
2 0 0 1
300300
"""
301301
from pandas._libs.sparse import IntIndex
302302

@@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
313313
indices = data.indices
314314
indptr = data.indptr
315315
array_data = data.data
316-
dtype = SparseDtype(array_data.dtype, 0)
316+
dtype = SparseDtype(array_data.dtype)
317317
arrays = []
318318
for i in range(n_columns):
319319
sl = slice(indptr[i], indptr[i + 1])
@@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix:
393393
cols, rows, data = [], [], []
394394
for col, (_, ser) in enumerate(self._parent.items()):
395395
sp_arr = ser.array
396-
if sp_arr.fill_value != 0:
397-
raise ValueError("fill value must be 0 when converting to COO matrix")
398396

399397
row = sp_arr.sp_index.indices
400398
cols.append(np.repeat(col, len(row)))

pandas/core/dtypes/dtypes.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype):
16661666
"""
16671667
Dtype for data stored in :class:`SparseArray`.
16681668
1669-
`SparseDtype` is used as the data type for :class:`SparseArray`, enabling
1669+
``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling
16701670
more efficient storage of data that contains a significant number of
16711671
repetitive values typically represented by a fill value. It supports any
16721672
scalar dtype as the underlying data type of the non-fill values.
@@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype):
16771677
The dtype of the underlying array storing the non-fill values.
16781678
fill_value : scalar, optional
16791679
The scalar value not stored in the SparseArray. By default, this
1680-
depends on `dtype`.
1680+
depends on ``dtype``.
16811681
16821682
=========== ==========
16831683
dtype na_value
16841684
=========== ==========
16851685
float ``np.nan``
1686+
complex ``np.nan``
16861687
int ``0``
16871688
bool ``False``
16881689
datetime64 ``pd.NaT``
16891690
timedelta64 ``pd.NaT``
16901691
=========== ==========
16911692
1692-
The default value may be overridden by specifying a `fill_value`.
1693+
The default value may be overridden by specifying a ``fill_value``.
16931694
16941695
Attributes
16951696
----------

pandas/core/dtypes/missing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
618618
nan
619619
>>> na_value_for_dtype(np.dtype("float64"))
620620
nan
621+
>>> na_value_for_dtype(np.dtype("complex128"))
622+
nan
621623
>>> na_value_for_dtype(np.dtype("bool"))
622624
False
623625
>>> na_value_for_dtype(np.dtype("datetime64[ns]"))
@@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
629631
elif dtype.kind in "mM":
630632
unit = np.datetime_data(dtype)[0]
631633
return dtype.type("NaT", unit)
632-
elif dtype.kind == "f":
634+
elif dtype.kind in "fc":
633635
return np.nan
634636
elif dtype.kind in "iu":
635637
if compat:

pandas/tests/arrays/sparse/test_accessor.py

+39-38
Original file line numberDiff line numberDiff line change
@@ -105,28 +105,36 @@ def test_accessor_raises(self):
105105

106106
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
107107
@pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
108-
@pytest.mark.parametrize("dtype", ["float64", "int64"])
108+
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
109109
def test_from_spmatrix(self, format, labels, dtype):
110110
sp_sparse = pytest.importorskip("scipy.sparse")
111111

112-
sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
112+
sp_dtype = SparseDtype(dtype)
113113

114-
mat = sp_sparse.eye(10, format=format, dtype=dtype)
115-
result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
114+
sp_mat = sp_sparse.eye(10, format=format, dtype=dtype)
115+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels)
116+
mat = np.eye(10, dtype=dtype)
116117
expected = pd.DataFrame(
117-
np.eye(10, dtype=dtype), index=labels, columns=labels
118+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
119+
index=labels,
120+
columns=labels,
118121
).astype(sp_dtype)
119122
tm.assert_frame_equal(result, expected)
120123

121124
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
122-
def test_from_spmatrix_including_explicit_zero(self, format):
125+
@pytest.mark.parametrize("dtype", [np.int64, bool])
126+
def test_from_spmatrix_including_explicit_zero(self, format, dtype):
123127
sp_sparse = pytest.importorskip("scipy.sparse")
124128

125-
mat = sp_sparse.random(10, 2, density=0.5, format=format)
126-
mat.data[0] = 0
127-
result = pd.DataFrame.sparse.from_spmatrix(mat)
128-
dtype = SparseDtype("float64", 0.0)
129-
expected = pd.DataFrame(mat.todense()).astype(dtype)
129+
sp_dtype = SparseDtype(dtype)
130+
131+
sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype)
132+
sp_mat.data[0] = 0
133+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat)
134+
mat = sp_mat.toarray()
135+
expected = pd.DataFrame(
136+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
137+
).astype(sp_dtype)
130138
tm.assert_frame_equal(result, expected)
131139

132140
@pytest.mark.parametrize(
@@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format):
136144
def test_from_spmatrix_columns(self, columns):
137145
sp_sparse = pytest.importorskip("scipy.sparse")
138146

139-
dtype = SparseDtype("float64", 0.0)
147+
sp_dtype = SparseDtype(np.float64)
140148

141-
mat = sp_sparse.random(10, 2, density=0.5)
142-
result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
143-
expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
149+
sp_mat = sp_sparse.random(10, 2, density=0.5)
150+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns)
151+
mat = sp_mat.toarray()
152+
expected = pd.DataFrame(
153+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
154+
columns=columns,
155+
).astype(sp_dtype)
144156
tm.assert_frame_equal(result, expected)
145157

146158
@pytest.mark.parametrize(
147-
"colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
159+
"columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
148160
)
149-
def test_to_coo(self, colnames):
161+
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
162+
def test_to_coo(self, columns, dtype):
150163
sp_sparse = pytest.importorskip("scipy.sparse")
151164

152-
df = pd.DataFrame(
153-
{colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
154-
)
155-
result = df.sparse.to_coo()
156-
expected = sp_sparse.coo_matrix(np.asarray(df))
157-
assert (result != expected).nnz == 0
165+
sp_dtype = SparseDtype(dtype)
158166

159-
@pytest.mark.parametrize("fill_value", [1, np.nan])
160-
def test_to_coo_nonzero_fill_val_raises(self, fill_value):
161-
pytest.importorskip("scipy")
162-
df = pd.DataFrame(
163-
{
164-
"A": SparseArray(
165-
[fill_value, fill_value, fill_value, 2], fill_value=fill_value
166-
),
167-
"B": SparseArray(
168-
[fill_value, 2, fill_value, fill_value], fill_value=fill_value
169-
),
170-
}
171-
)
172-
with pytest.raises(ValueError, match="fill value must be 0"):
173-
df.sparse.to_coo()
167+
expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype)
168+
mat = expected.toarray()
169+
result = pd.DataFrame(
170+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
171+
columns=columns,
172+
dtype=sp_dtype,
173+
).sparse.to_coo()
174+
assert (result != expected).nnz == 0
174175

175176
def test_to_coo_midx_categorical(self):
176177
# GH#50996

pandas/tests/dtypes/test_missing.py

+3
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples():
697697
("f2", np.nan),
698698
("f4", np.nan),
699699
("f8", np.nan),
700+
# Complex
701+
("c8", np.nan),
702+
("c16", np.nan),
700703
# Object
701704
("O", np.nan),
702705
# Interval

pandas/tests/indexing/test_loc.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series):
12811281
tm.assert_equal(result, expected)
12821282

12831283
@pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
1284-
@pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
1284+
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
12851285
def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):
12861286
sp_sparse = pytest.importorskip("scipy.sparse")
12871287

@@ -1296,13 +1296,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):
12961296

12971297
# regression test for GH#34526
12981298
itr_idx = range(2, rows)
1299-
result = df.loc[itr_idx].values
1299+
result = np.nan_to_num(df.loc[itr_idx].values)
13001300
expected = spmatrix.toarray()[itr_idx]
13011301
tm.assert_numpy_array_equal(result, expected)
13021302

13031303
# regression test for GH#34540
13041304
result = df.loc[itr_idx].dtypes.values
1305-
expected = np.full(cols, SparseDtype(dtype, fill_value=0))
1305+
expected = np.full(cols, SparseDtype(dtype))
13061306
tm.assert_numpy_array_equal(result, expected)
13071307

13081308
def test_loc_getitem_listlike_all_retains_sparse(self):
@@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self):
13141314
# GH34687
13151315
sp_sparse = pytest.importorskip("scipy.sparse")
13161316

1317-
df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5))
1317+
df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64))
13181318
result = df.loc[range(2)]
13191319
expected = DataFrame(
1320-
[[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]],
1321-
dtype=SparseDtype("float64", 0.0),
1320+
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
1321+
dtype=SparseDtype(np.int64),
13221322
)
13231323
tm.assert_frame_equal(result, expected)
13241324

13251325
result = df.loc[range(2)].loc[range(1)]
1326-
expected = DataFrame(
1327-
[[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0)
1328-
)
1326+
expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64))
13291327
tm.assert_frame_equal(result, expected)
13301328

13311329
def test_loc_getitem_sparse_series(self):

0 commit comments

Comments
 (0)