From ae40d99cf5819b6930d7ce862637e58d210a93be Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 10 Sep 2021 00:17:00 -0400 Subject: [PATCH 1/2] BUG: df.sparse.to_coo() raising on duplicate colnames --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/sparse/accessor.py | 7 +++---- pandas/tests/arrays/sparse/test_accessor.py | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 487ef5c226f94..4029663e08d98 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -426,6 +426,7 @@ Reshaping Sparse ^^^^^^ +- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`) - - diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index f3eccd6aad444..60a316b79df2e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -339,12 +339,11 @@ def to_coo(self): dtype = dtype.subtype cols, rows, data = [], [], [] - for col, name in enumerate(self._parent): - s = self._parent[name] - row = s.array.sp_index.to_int_index().indices + for col, (_, ser) in enumerate(self._parent.iteritems()): + row = ser.array.sp_index.to_int_index().indices cols.append(np.repeat(col, len(row))) rows.append(row) - data.append(s.array.sp_values.astype(dtype, copy=False)) + data.append(ser.array.sp_values.astype(dtype, copy=False)) cols = np.concatenate(cols) rows = np.concatenate(rows) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 10f5a7e9a1dc4..6b8dc8821d4fa 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -71,7 +71,9 @@ def test_from_spmatrix_columns(self, columns): expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2)]) + @pytest.mark.parametrize( + "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + ) @td.skip_if_no_scipy def test_to_coo(self, colnames): import scipy.sparse From 7d06702e17595b07c7ddb8c1218ebfdfceb1774e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 10 Sep 2021 13:50:50 -0400 Subject: [PATCH 2/2] Add to_coo frame asv --- asv_bench/benchmarks/sparse.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index c8c1a962e6861..bcc3edab4a349 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -91,6 +91,20 @@ def time_sparse_series_to_coo_single_level(self, sort_labels): self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels) +class ToCooFrame: + def setup(self): + N = 10000 + k = 10 + arr = np.full((N, k), np.nan) + arr[0, 0] = 3.0 + arr[12, 7] = -1.0 + arr[0, 9] = 11.2 + self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float")) + + def time_to_coo(self): + self.df.sparse.to_coo() + + class Arithmetic: params = ([0.1, 0.01], [0, np.nan])