diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 0aeeb281566ac..c14a0c0961a2d 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -77,6 +77,7 @@ These changes conform sparse handling to return the correct types and work to ma - Bug in ``SparseSeries.loc[]`` with list-like input raises ``TypeError`` (:issue:`10560`) - Bug in ``SparseSeries.iloc[]`` with scalar input may raise ``IndexError`` (:issue:`10560`) - Bug in ``SparseSeries.loc[]``, ``.iloc[]`` with ``slice`` returns ``SparseArray``, rather than ``SparseSeries`` (:issue:`10560`) +- Bug in ``SparseDataFrame.loc[]``, ``.iloc[]`` may result in dense ``Series``, rather than ``SparseSeries`` (:issue:`12787`) - Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`) - Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`) - Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index af03f1a17ea75..b4b044c7780e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1915,8 +1915,10 @@ def _ixs(self, i, axis=0): # if we are a copy, mark as such copy = (isinstance(new_values, np.ndarray) and new_values.base is None) - result = Series(new_values, index=self.columns, - name=self.index[i], dtype=new_values.dtype) + result = self._constructor_sliced(new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype) result._set_is_copy(self, copy=copy) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 737387e76e2f2..848ed7c3baa94 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1752,7 +1752,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): new_index = self.index[loc] if lib.isscalar(loc): - from pandas import Series new_values = self._data.fast_xs(loc) # may need to box a datelike-scalar @@ -1763,9 +1762,9 @@ 
def xs(self, key, axis=0, level=None, copy=None, drop_level=True): if not is_list_like(new_values) or self.ndim == 1: return _maybe_box_datetimelike(new_values) - result = Series(new_values, index=self.columns, - name=self.index[loc], copy=copy, - dtype=new_values.dtype) + result = self._constructor_sliced(new_values, index=self.columns, + name=self.index[loc], copy=copy, + dtype=new_values.dtype) else: result = self.iloc[loc] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index abc5ffef4a88d..f9741217a024c 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -136,6 +136,8 @@ def wrapper(data=None, index=None, columns=None, return wrapper + _constructor_sliced = SparseSeries + def _init_dict(self, data, index, columns, dtype=None): # pre-filter out columns if we passed it if columns is not None: diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 384125ddc63f4..0e218d2639662 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -82,3 +82,165 @@ def test_iloc_slice(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) + + +class TestSparseDataFrameIndexing(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_loc(self): + orig = pd.DataFrame([[1, np.nan, np.nan], + [2, 3, np.nan], + [np.nan, np.nan, 4]], + columns=list('xyz')) + sparse = orig.to_sparse() + + self.assertEqual(sparse.loc[0, 'x'], 1) + self.assertTrue(np.isnan(sparse.loc[1, 'z'])) + self.assertEqual(sparse.loc[2, 'z'], 4) + + tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[2, :], + orig.loc[2, :].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[2, :], + orig.loc[2, :].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, 'y'], + orig.loc[:, 'y'].to_sparse()) 
+ tm.assert_sp_series_equal(sparse.loc[:, 'y'], + orig.loc[:, 'y'].to_sparse()) + + result = sparse.loc[[1, 2]] + exp = orig.loc[[1, 2]].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[[1, 2], :] + exp = orig.loc[[1, 2], :].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[:, ['x', 'z']] + exp = orig.loc[:, ['x', 'z']].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[[0, 2], ['x', 'z']] + exp = orig.loc[[0, 2], ['x', 'z']].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + # exceeds the bounds + result = sparse.loc[[1, 3, 4, 5]] + exp = orig.loc[[1, 3, 4, 5]].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + # dense array + result = sparse.loc[orig.x % 2 == 1] + exp = orig.loc[orig.x % 2 == 1].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse.loc[sparse.x % 2 == 1] + exp = orig.loc[orig.x % 2 == 1].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + def test_loc_index(self): + orig = pd.DataFrame([[1, np.nan, np.nan], + [2, 3, np.nan], + [np.nan, np.nan, 4]], + index=list('abc'), columns=list('xyz')) + sparse = orig.to_sparse() + + self.assertEqual(sparse.loc['a', 'x'], 1) + self.assertTrue(np.isnan(sparse.loc['b', 'z'])) + self.assertEqual(sparse.loc['c', 'z'], 4) + + tm.assert_sp_series_equal(sparse.loc['a'], orig.loc['a'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['b'], orig.loc['b'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['b', :], + orig.loc['b', :].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['b', :], + orig.loc['b', :].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc[:, 'z'], + orig.loc[:, 'z'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, 'z'], + orig.loc[:, 'z'].to_sparse()) + + result = sparse.loc[['a', 'b']] + exp = orig.loc[['a', 'b']].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[['a', 'b'], :] + exp = 
orig.loc[['a', 'b'], :].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[:, ['x', 'z']] + exp = orig.loc[:, ['x', 'z']].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.loc[['c', 'a'], ['x', 'z']] + exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + # dense array + result = sparse.loc[orig.x % 2 == 1] + exp = orig.loc[orig.x % 2 == 1].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse.loc[sparse.x % 2 == 1] + exp = orig.loc[orig.x % 2 == 1].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + def test_loc_slice(self): + orig = pd.DataFrame([[1, np.nan, np.nan], + [2, 3, np.nan], + [np.nan, np.nan, 4]], + columns=list('xyz')) + sparse = orig.to_sparse() + tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + + def test_iloc(self): + orig = pd.DataFrame([[1, np.nan, np.nan], + [2, 3, np.nan], + [np.nan, np.nan, 4]]) + sparse = orig.to_sparse() + + self.assertEqual(sparse.iloc[1, 1], 3) + self.assertTrue(np.isnan(sparse.iloc[2, 0])) + + tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[2, :], + orig.iloc[2, :].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[2, :], + orig.iloc[2, :].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[:, 1], + orig.iloc[:, 1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[:, 1], + orig.iloc[:, 1].to_sparse()) + + result = sparse.iloc[[1, 2]] + exp = orig.iloc[[1, 2]].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.iloc[[1, 2], :] + exp = orig.iloc[[1, 2], :].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.iloc[:, [1, 0]] + exp = orig.iloc[:, [1, 0]].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + result = sparse.iloc[[2], [1, 0]] + exp = orig.iloc[[2], [1, 
0]].to_sparse() + tm.assert_sp_frame_equal(result, exp) + + with tm.assertRaises(IndexError): + sparse.iloc[[1, 3, 5]] + + def test_iloc_slice(self): + orig = pd.DataFrame([[1, np.nan, np.nan], + [2, 3, np.nan], + [np.nan, np.nan, 4]], + columns=list('xyz')) + sparse = orig.to_sparse() + tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())