Skip to content

BUG: SparseDataFrame indexing may return normal Series #12787

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseSeries.loc[]`` with list-like input raises ``TypeError`` (:issue:`10560`)
- Bug in ``SparseSeries.iloc[]`` with scalar input may raise ``IndexError`` (:issue:`10560`)
- Bug in ``SparseSeries.loc[]``, ``.iloc[]`` with ``slice`` returns ``SparseArray``, rather than ``SparseSeries`` (:issue:`10560`)
- Bug in ``SparseDataFrame.loc[]``, ``.iloc[]`` may results in dense ``Series``, rather than ``SparseSeries`` (:issue:`12787`)
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1915,8 +1915,10 @@ def _ixs(self, i, axis=0):
# if we are a copy, mark as such
copy = (isinstance(new_values, np.ndarray) and
new_values.base is None)
result = Series(new_values, index=self.columns,
name=self.index[i], dtype=new_values.dtype)
result = self._constructor_sliced(new_values,
index=self.columns,
name=self.index[i],
dtype=new_values.dtype)
result._set_is_copy(self, copy=copy)
return result

Expand Down
7 changes: 3 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1752,7 +1752,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
new_index = self.index[loc]

if lib.isscalar(loc):
from pandas import Series
new_values = self._data.fast_xs(loc)

# may need to box a datelike-scalar
Expand All @@ -1763,9 +1762,9 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
if not is_list_like(new_values) or self.ndim == 1:
return _maybe_box_datetimelike(new_values)

result = Series(new_values, index=self.columns,
name=self.index[loc], copy=copy,
dtype=new_values.dtype)
result = self._constructor_sliced(new_values, index=self.columns,
name=self.index[loc], copy=copy,
dtype=new_values.dtype)

else:
result = self.iloc[loc]
Expand Down
2 changes: 2 additions & 0 deletions pandas/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ def wrapper(data=None, index=None, columns=None,

return wrapper

_constructor_sliced = SparseSeries

def _init_dict(self, data, index, columns, dtype=None):
# pre-filter out columns if we passed it
if columns is not None:
Expand Down
162 changes: 162 additions & 0 deletions pandas/sparse/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,165 @@ def test_iloc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def test_loc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc[0, 'x'], 1)
self.assertTrue(np.isnan(sparse.loc[1, 'z']))
self.assertEqual(sparse.loc[2, 'z'], 4)

tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[2, :],
orig.loc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'y'],
orig.loc[:, 'y'].to_sparse())

result = sparse.loc[[1, 2]]
exp = orig.loc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[1, 2], :]
exp = orig.loc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[[0, 2], ['x', 'z']]
exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# exceeds the bounds
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actuary it coerces to normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_index(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
index=list('abc'), columns=list('xyz'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc['a', 'x'], 1)
self.assertTrue(np.isnan(sparse.loc['b', 'z']))
self.assertEqual(sparse.loc['c', 'z'], 4)

tm.assert_sp_series_equal(sparse.loc['a'], orig.loc['a'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b'], orig.loc['b'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())
tm.assert_sp_series_equal(sparse.loc['b', :],
orig.loc['b', :].to_sparse())

tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:, 'z'],
orig.loc[:, 'z'].to_sparse())

result = sparse.loc[['a', 'b']]
exp = orig.loc[['a', 'b']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['a', 'b'], :]
exp = orig.loc[['a', 'b'], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[:, ['x', 'z']]
exp = orig.loc[:, ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.loc[['c', 'a'], ['x', 'z']]
exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# dense array
result = sparse.loc[orig.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

# sparse array (actuary it coerces to normal Series)
result = sparse.loc[sparse.x % 2 == 1]
exp = orig.loc[orig.x % 2 == 1].to_sparse()
tm.assert_sp_frame_equal(result, exp)

def test_loc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_iloc(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]])
sparse = orig.to_sparse()

self.assertEqual(sparse.iloc[1, 1], 3)
self.assertTrue(np.isnan(sparse.iloc[2, 0]))

tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[2, :],
orig.iloc[2, :].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())
tm.assert_sp_series_equal(sparse.iloc[:, 1],
orig.iloc[:, 1].to_sparse())

result = sparse.iloc[[1, 2]]
exp = orig.iloc[[1, 2]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[1, 2], :]
exp = orig.iloc[[1, 2], :].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[:, [1, 0]]
exp = orig.iloc[:, [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

result = sparse.iloc[[2], [1, 0]]
exp = orig.iloc[[2], [1, 0]].to_sparse()
tm.assert_sp_frame_equal(result, exp)

with tm.assertRaises(IndexError):
sparse.iloc[[1, 3, 5]]

def test_iloc_slice(self):
orig = pd.DataFrame([[1, np.nan, np.nan],
[2, 3, np.nan],
[np.nan, np.nan, 4]],
columns=list('xyz'))
sparse = orig.to_sparse()
tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())