diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b952ffd7661a7..18ad7e3ccedaf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -584,7 +584,7 @@ Reshaping Sparse ^^^^^^ - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) -- +- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6a1c25711acb0..d104e0a126bc2 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3, dtype=float) + >>> mat = scipy.sparse.eye(3, dtype=int) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0 0 - 1 0 1.0 0 - 2 0 0 1.0 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 """ from pandas._libs.sparse import IntIndex @@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, 0) + dtype = SparseDtype(array_data.dtype) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) @@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix: cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.items()): sp_arr = ser.array - if sp_arr.fill_value != 0: - raise ValueError("fill value must be 0 when converting to COO matrix") row = sp_arr.sp_index.indices cols.append(np.repeat(col, len(row))) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5213be8b69016..3aeab96e03163 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - `SparseDtype` is used as the data type for :class:`SparseArray`, enabling + ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling more efficient storage of data that contains a significant number of repetitive values typically represented by a fill value. It supports any scalar dtype as the underlying data type of the non-fill values. @@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype): The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. + depends on ``dtype``. =========== ========== dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. Attributes ---------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0e21136f8a97..b9cd6ae2f13e8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): nan >>> na_value_for_dtype(np.dtype("float64")) nan + >>> na_value_for_dtype(np.dtype("complex128")) + nan >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 87eb7bcfa9cee..bd3298940ae3a 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,28 +105,36 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype) - mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) + mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.eye(10, dtype=dtype), index=labels, columns=labels + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + index=labels, + columns=labels, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_including_explicit_zero(self, format): + @pytest.mark.parametrize("dtype", [np.int64, bool]) + def test_from_spmatrix_including_explicit_zero(self, format, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - mat = sp_sparse.random(10, 2, density=0.5, format=format) - mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat) - dtype = SparseDtype("float64", 0.0) - expected = pd.DataFrame(mat.todense()).astype(dtype) + sp_dtype = SparseDtype(dtype) + + sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype) + sp_mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(sp_mat) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value) + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format): def test_from_spmatrix_columns(self, columns): sp_sparse = pytest.importorskip("scipy.sparse") - dtype = SparseDtype("float64", 0.0) + sp_dtype = SparseDtype(np.float64) - mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + sp_mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) - def test_to_coo(self, colnames): + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) + def test_to_coo(self, columns, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - df = pd.DataFrame( - {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" - ) - result = df.sparse.to_coo() - expected = sp_sparse.coo_matrix(np.asarray(df)) - assert (result != expected).nnz == 0 + sp_dtype = SparseDtype(dtype) - @pytest.mark.parametrize("fill_value", [1, np.nan]) - def test_to_coo_nonzero_fill_val_raises(self, fill_value): - pytest.importorskip("scipy") - df = pd.DataFrame( - { - "A": SparseArray( - [fill_value, fill_value, fill_value, 2], fill_value=fill_value - ), - "B": SparseArray( - [fill_value, 2, fill_value, fill_value], fill_value=fill_value - ), - } - ) - with pytest.raises(ValueError, match="fill value must be 0"): - df.sparse.to_coo() + expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype) + mat = expected.toarray() + result = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + dtype=sp_dtype, + ).sparse.to_coo() + assert (result != expected).nnz == 0 def test_to_coo_midx_categorical(self): # GH#50996 diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 2109c794ad44f..f86ed6f49759f 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples(): ("f2", np.nan), ("f4", np.nan), ("f8", np.nan), + # Complex + ("c8", np.nan), + ("c16", np.nan), # Object ("O", np.nan), # Interval diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 16f3e0fd0c229..903ad24ce53b3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1296,13 +1296,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def test_loc_getitem_sparse_series(self):