From 42d2347f2df73c4d704413c63a9bf78a797dbc3a Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Sat, 20 Jun 2020 17:56:08 +0200 Subject: [PATCH 1/8] TST: regression tests for indexing sparse dataframe with iterable closes #34526 --- pandas/tests/frame/indexing/test_indexing.py | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 3865ea64ee479..9ef1cade6b7e5 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -20,9 +20,11 @@ isna, notna, ) +import pandas.util._test_decorators as td import pandas._testing as tm from pandas.arrays import SparseArray import pandas.core.common as com +from pandas.core.arrays.sparse import SparseDtype from pandas.core.indexing import IndexingError from pandas.tseries.offsets import BDay @@ -1921,6 +1923,26 @@ def test_getitem_sparse_column(self): result = df.loc[:, "A"] tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) + @td.skip_if_no_scipy + def test_locindexer_from_spmatrix(self, spmatrix_t): + import scipy.sparse + spmatrix_t = getattr(scipy.sparse, spmatrix_t) + + spmatrix = spmatrix_t([[1.0, 0.0], [0.0, 0.0]], dtype=np.float64) + df = pd.DataFrame.sparse.from_spmatrix(spmatrix) + + # regression test for #34526 + itr_idx = [1] + result = df.loc[itr_idx].values + expected = spmatrix.toarray()[itr_idx] + tm.assert_numpy_array_equal(result, expected) + + # regression test for #34540 + result_t = df.loc[itr_idx].dtypes.values + expected_t = np.full(2, SparseDtype(np.float64, fill_value=0)) + tm.assert_numpy_array_equal(result_t, expected_t) + def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. 
From 3ce4a673c72a7619d21ea5f94dbc639b19d89576 Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Sat, 20 Jun 2020 19:02:14 +0200 Subject: [PATCH 2/8] Reorganise sparse indexing tests into a separate file --- pandas/tests/frame/indexing/test_indexing.py | 36 --------------- pandas/tests/frame/indexing/test_sparse.py | 47 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 36 deletions(-) create mode 100644 pandas/tests/frame/indexing/test_sparse.py diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9ef1cade6b7e5..82a04e93dbcf1 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -22,9 +22,7 @@ ) import pandas.util._test_decorators as td import pandas._testing as tm -from pandas.arrays import SparseArray import pandas.core.common as com -from pandas.core.arrays.sparse import SparseDtype from pandas.core.indexing import IndexingError from pandas.tseries.offsets import BDay @@ -1909,40 +1907,6 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[[1, -1], 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) - def test_getitem_sparse_column(self): - # https://github.com/pandas-dev/pandas/issues/23559 - data = SparseArray([0, 1]) - df = pd.DataFrame({"A": data}) - expected = pd.Series(data, name="A") - result = df["A"] - tm.assert_series_equal(result, expected) - - result = df.iloc[:, 0] - tm.assert_series_equal(result, expected) - - result = df.loc[:, "A"] - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @td.skip_if_no_scipy - def test_locindexer_from_spmatrix(self, spmatrix_t): - import scipy.sparse - spmatrix_t = getattr(scipy.sparse, spmatrix_t) - - spmatrix = spmatrix_t([[1.0, 0.0], [0.0, 0.0]], dtype=np.float64) - df = pd.DataFrame.sparse.from_spmatrix(spmatrix) - - # regression test for #34526 - itr_idx = [1] - result = df.loc[itr_idx].values - expected = 
spmatrix.toarray()[itr_idx] - tm.assert_numpy_array_equal(result, expected) - - # regression test for #34540 - result_t = df.loc[itr_idx].dtypes.values - expected_t = np.full(2, SparseDtype(np.float64, fill_value=0)) - tm.assert_numpy_array_equal(result_t, expected_t) - def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py new file mode 100644 index 0000000000000..b22eaad329582 --- /dev/null +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -0,0 +1,47 @@ +import numpy as np +import pandas as pd + +import pandas.util._test_decorators as td +import pandas._testing as tm + +from pandas.arrays import SparseArray +from pandas.core.arrays.sparse import SparseDtype + +import pytest + + +class TestSparseDataFrameIndexing: + def test_getitem_sparse_column(self): + # https://github.com/pandas-dev/pandas/issues/23559 + data = SparseArray([0, 1]) + df = pd.DataFrame({"A": data}) + expected = pd.Series(data, name="A") + result = df["A"] + tm.assert_series_equal(result, expected) + + result = df.iloc[:, 0] + tm.assert_series_equal(result, expected) + + result = df.loc[:, "A"] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) + @td.skip_if_no_scipy + def test_locindexer_from_spmatrix(self, spmatrix_t): + import scipy.sparse + + spmatrix_t = getattr(scipy.sparse, spmatrix_t) + + spmatrix = spmatrix_t([[1.0, 0.0], [0.0, 0.0]], dtype=np.float64) + df = pd.DataFrame.sparse.from_spmatrix(spmatrix) + + # regression test for #34526 + itr_idx = [1] + result = df.loc[itr_idx].values + expected = spmatrix.toarray()[itr_idx] + tm.assert_numpy_array_equal(result, expected) + + # regression test for #34540 + result = df.loc[itr_idx].dtypes.values + expected = np.full(2, SparseDtype(np.float64, fill_value=0)) + 
tm.assert_numpy_array_equal(result, expected) From a82683ae5203db7d8fd264ced4eb70f06e85ee53 Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 24 Jun 2020 12:17:44 +0200 Subject: [PATCH 3/8] sparse/array.py: fix the dtype when indexing only sparse elements --- pandas/core/arrays/sparse/array.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4996a10002c63..b18a58da3950f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -866,11 +866,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values - taken = np.full( - sp_indexer.shape, - fill_value=fill_value, - dtype=np.result_type(type(fill_value)), - ) + _dtype = np.result_type(self.dtype.subtype, type(fill_value)) + taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) From ac82243b6a137f064d7463605935c8527827ae78 Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 24 Jun 2020 15:17:39 +0200 Subject: [PATCH 4/8] indexing/test_sparse.py: more thorough indexing by iterable tests --- pandas/tests/frame/indexing/test_sparse.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index b22eaad329582..362295cbfa779 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -26,22 +26,27 @@ def test_getitem_sparse_column(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64, np.complex]) @td.skip_if_no_scipy - def test_locindexer_from_spmatrix(self, spmatrix_t): + def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): import 
scipy.sparse spmatrix_t = getattr(scipy.sparse, spmatrix_t) - spmatrix = spmatrix_t([[1.0, 0.0], [0.0, 0.0]], dtype=np.float64) + # The bug is triggered by a sparse matrix with purely sparse columns. So the + # recipe below generates a rectangular matrix of dimension (5, 7) where all the + # diagonal cells are ones, meaning the last two columns are purely sparse. + rows, cols = 5, 7 + spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype) df = pd.DataFrame.sparse.from_spmatrix(spmatrix) # regression test for #34526 - itr_idx = [1] + itr_idx = range(2, rows) result = df.loc[itr_idx].values expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for #34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(2, SparseDtype(np.float64, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) From c785d9e0353c6a5dc3fc0a68368b2de485874175 Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 24 Jun 2020 15:25:36 +0200 Subject: [PATCH 5/8] indexing/test_indexing.py: remove unused import --- pandas/tests/frame/indexing/test_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 82a04e93dbcf1..3fa3c9303806f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -20,7 +20,6 @@ isna, notna, ) -import pandas.util._test_decorators as td import pandas._testing as tm import pandas.core.common as com from pandas.core.indexing import IndexingError From d015d032424a1b4baf2d37b46f60056f996e1ffb Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 24 Jun 2020 16:03:43 +0200 Subject: [PATCH 6/8] indexing/test_sparse.py: make isort happy --- pandas/tests/frame/indexing/test_sparse.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/indexing/test_sparse.py 
b/pandas/tests/frame/indexing/test_sparse.py index 362295cbfa779..0aead625a7955 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -1,14 +1,13 @@ import numpy as np -import pandas as pd +import pytest import pandas.util._test_decorators as td -import pandas._testing as tm +import pandas as pd +import pandas._testing as tm from pandas.arrays import SparseArray from pandas.core.arrays.sparse import SparseDtype -import pytest - class TestSparseDataFrameIndexing: def test_getitem_sparse_column(self): From d1446df05e8c4bbdb7d43442bcab1ffc4749202d Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 24 Jun 2020 16:06:49 +0200 Subject: [PATCH 7/8] indexing/test_sparse.py: resolve deprecation warning from np_dev --- pandas/tests/frame/indexing/test_sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 0aead625a7955..876fbe212c466 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -25,7 +25,7 @@ def test_getitem_sparse_column(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, np.complex]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) @td.skip_if_no_scipy def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): import scipy.sparse From bbd193c8ac92718f1db91bee8e4cace334d58c21 Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 8 Jul 2020 00:02:18 +0200 Subject: [PATCH 8/8] v1.1.0.rst: add entry under bug fixes --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cee41f248fc60..386fe3ce2160f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1124,6 
+1124,7 @@ Sparse - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) +- Bug where :class:`arrays.SparseArray` returned the incorrect dtype when indexing a sparse :class:`DataFrame` with an iterable (:issue:`34526`, :issue:`34540`) ExtensionArray ^^^^^^^^^^^^^^