BUG: incorrect type when indexing sparse dataframe with iterable (#34908)

suvayu · web-flow · commit 02a6acfd0674 · 2020-07-08T06:47:08.000-05:00
* TST: regression tests for indexing sparse dataframe with iterable closes #34526
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1124,6 +1124,7 @@ Sparse
 - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
 - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s  string representation (:issue:`34352`)
 - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
+- Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -866,11 +866,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
 
         if self.sp_index.npoints == 0:
             # Avoid taking from the empty self.sp_values
-            taken = np.full(
-                sp_indexer.shape,
-                fill_value=fill_value,
-                dtype=np.result_type(type(fill_value)),
-            )
+            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
+            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
         else:
             taken = self.sp_values.take(sp_indexer)
 
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -21,7 +21,6 @@
     notna,
 )
 import pandas._testing as tm
-from pandas.arrays import SparseArray
 import pandas.core.common as com
 from pandas.core.indexing import IndexingError
 
@@ -1907,20 +1906,6 @@ def test_getitem_ix_float_duplicates(self):
         expect = df.iloc[[1, -1], 0]
         tm.assert_series_equal(df.loc[0.2, "a"], expect)
 
-    def test_getitem_sparse_column(self):
-        # https://github.com/pandas-dev/pandas/issues/23559
-        data = SparseArray([0, 1])
-        df = pd.DataFrame({"A": data})
-        expected = pd.Series(data, name="A")
-        result = df["A"]
-        tm.assert_series_equal(result, expected)
-
-        result = df.iloc[:, 0]
-        tm.assert_series_equal(result, expected)
-
-        result = df.loc[:, "A"]
-        tm.assert_series_equal(result, expected)
-
     def test_setitem_with_unaligned_tz_aware_datetime_column(self):
         # GH 12981
         # Assignment of unaligned offset-aware datetime series.
diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.arrays import SparseArray
+from pandas.core.arrays.sparse import SparseDtype
+
+
+class TestSparseDataFrameIndexing:
+    def test_getitem_sparse_column(self):
+        # https://github.com/pandas-dev/pandas/issues/23559
+        data = SparseArray([0, 1])
+        df = pd.DataFrame({"A": data})
+        expected = pd.Series(data, name="A")
+        result = df["A"]
+        tm.assert_series_equal(result, expected)
+
+        result = df.iloc[:, 0]
+        tm.assert_series_equal(result, expected)
+
+        result = df.loc[:, "A"]
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
+    @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
+    @td.skip_if_no_scipy
+    def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
+        import scipy.sparse
+
+        spmatrix_t = getattr(scipy.sparse, spmatrix_t)
+
+        # The bug is triggered by a sparse matrix with purely sparse columns.  So the
+        # recipe below generates a rectangular matrix of dimension (5, 7) where all the
+        # diagonal cells are ones, meaning the last two columns are purely sparse.
+        rows, cols = 5, 7
+        spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype)
+        df = pd.DataFrame.sparse.from_spmatrix(spmatrix)
+
+        # regression test for #34526
+        itr_idx = range(2, rows)
+        result = df.loc[itr_idx].values
+        expected = spmatrix.toarray()[itr_idx]
+        tm.assert_numpy_array_equal(result, expected)
+
+        # regression test for #34540
+        result = df.loc[itr_idx].dtypes.values
+        expected = np.full(cols, SparseDtype(dtype, fill_value=0))
+        tm.assert_numpy_array_equal(result, expected)