From 0fff829515ffd3c7ce81b9652ef3209b65f0c3c8 Mon Sep 17 00:00:00 2001 From: Yosuke Kobayashi Date: Sat, 15 Feb 2020 16:15:32 +0900 Subject: [PATCH 1/4] BUG: Fixed wrong reading sparse matrix --- pandas/core/arrays/sparse/array.py | 4 ++++ pandas/tests/arrays/sparse/test_array.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b17a4647ffc9f..ae99186e2d31c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -414,6 +414,10 @@ def from_spmatrix(cls, data): if ncol != 1: raise ValueError(f"'data' must have a single column, not '{ncol}'") + # when sparse data has explicit zeros, eliminate them. + if data.nnz != data.count_nonzero(): + data.eliminate_zeros() + # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. arr = data.data diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index baca18239b929..2082f1fd7bd98 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -192,6 +192,19 @@ def test_from_spmatrix(self, size, format): expected = mat.toarray().ravel() tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @td.skip_if_no_scipy + def test_from_spmatrix_including_explicit_zero(self, format): + import scipy.sparse + + mat = scipy.sparse.random(10, 1, density=0.5, format=format) + mat.data[0] = 0 + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + @td.skip_if_no_scipy def test_from_spmatrix_raises(self): import scipy.sparse From f097e583855638ce82bc98e4de938c5b95894e69 Mon Sep 17 00:00:00 2001 From: Yosuke Kobayashi Date: Sat, 11 Apr 2020 11:24:58 +0900 Subject: [PATCH 2/4] Use sort_indices instead of 
eliminate_zeros --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/sparse/array.py | 11 +++-------- pandas/tests/arrays/sparse/test_accessor.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 718de09a0c3e4..b8fde039e21cd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -572,7 +572,7 @@ Reshaping Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) -- +- Bug in :meth:`SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) - ExtensionArray diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 7e6c777e1ca85..620e157ee54ec 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -437,17 +437,12 @@ def from_spmatrix(cls, data): if ncol != 1: raise ValueError(f"'data' must have a single column, not '{ncol}'") - # when sparse data has explicit zeros, eliminate them. - if data.nnz != data.count_nonzero(): - data.eliminate_zeros() - # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. 
+ data = data.tocsc() + data.sort_indices() arr = data.data - idx, _ = data.nonzero() - loc = np.argsort(idx) - arr = arr.take(loc) - idx.sort() + idx = data.indices zero = np.array(0, dtype=arr.dtype).item() dtype = SparseDtype(arr.dtype, zero) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index d8a1831cd61ec..02716044d9771 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -41,6 +41,18 @@ def test_from_spmatrix(self, format, labels, dtype): ).astype(sp_dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + @td.skip_if_no_scipy + def test_from_spmatrix_including_explicit_zero(self, format): + import scipy.sparse + + mat = scipy.sparse.random(10, 2, density=0.5, format=format, dtype="int64") + mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(mat) + sp_dtype = SparseDtype("int64", np.array(0, dtype="int64").item()) + expected = pd.DataFrame(mat.todense()).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "columns", [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], From 556c5f5928aa2041eeb2f19f65294ea67a2ac2df Mon Sep 17 00:00:00 2001 From: Yosuke Kobayashi Date: Sat, 11 Apr 2020 12:00:52 +0900 Subject: [PATCH 3/4] Fix ci failed --- pandas/tests/arrays/sparse/test_accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 02716044d9771..2a81b94ce779c 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -46,11 +46,11 @@ def test_from_spmatrix(self, format, labels, dtype): def test_from_spmatrix_including_explicit_zero(self, format): import scipy.sparse - mat = scipy.sparse.random(10, 2, density=0.5, format=format, dtype="int64") + mat = 
scipy.sparse.random(10, 2, density=0.5, format=format) mat.data[0] = 0 result = pd.DataFrame.sparse.from_spmatrix(mat) - sp_dtype = SparseDtype("int64", np.array(0, dtype="int64").item()) - expected = pd.DataFrame(mat.todense()).astype(sp_dtype) + dtype = SparseDtype("float64", 0.0) + expected = pd.DataFrame(mat.todense()).astype(dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From eaa0b32a588d82fa5ed3a13291ccdd38ceed330b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Apr 2020 15:27:37 -0500 Subject: [PATCH 4/4] Update doc/source/whatsnew/v1.1.0.rst --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b8fde039e21cd..0dd24bcc54933 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -572,7 +572,7 @@ Reshaping Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) -- Bug in :meth:`SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) +- Bug in :meth:`arrays.SparseArray.from_spmatrix` where a scipy sparse matrix containing explicit zeros was read incorrectly (:issue:`31991`) - ExtensionArray