Skip to content

BUG: incorrect type when indexing sparse dataframe with iterable #34908

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 8, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,7 @@ Sparse
- Bug where a :class:`DataFrame` containing a :class:`SparseArray` was filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
- The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`)
- Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
- Bug in :class:`arrays.SparseArray` where indexing a sparse :class:`DataFrame` with an iterable returned the incorrect type (:issue:`34526`, :issue:`34540`)

ExtensionArray
^^^^^^^^^^^^^^
Expand Down
7 changes: 2 additions & 5 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,11 +866,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:

if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(
sp_indexer.shape,
fill_value=fill_value,
dtype=np.result_type(type(fill_value)),
)
_dtype = np.result_type(self.dtype.subtype, type(fill_value))
taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
else:
taken = self.sp_values.take(sp_indexer)

Expand Down
15 changes: 0 additions & 15 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
notna,
)
import pandas._testing as tm
from pandas.arrays import SparseArray
import pandas.core.common as com
from pandas.core.indexing import IndexingError

Expand Down Expand Up @@ -1907,20 +1906,6 @@ def test_getitem_ix_float_duplicates(self):
expect = df.iloc[[1, -1], 0]
tm.assert_series_equal(df.loc[0.2, "a"], expect)

# Checks that selecting the only (sparse) column of a DataFrame returns a
# Series equal to one built directly from the same SparseArray, for each of
# the three access paths exercised below: [], .iloc and .loc.
def test_getitem_sparse_column(self):
# https://github.com/pandas-dev/pandas/issues/23559
data = SparseArray([0, 1])
df = pd.DataFrame({"A": data})
# The expected Series wraps the same SparseArray and carries the column name.
expected = pd.Series(data, name="A")
# Access by column label through plain __getitem__.
result = df["A"]
tm.assert_series_equal(result, expected)

# Access by column position.
result = df.iloc[:, 0]
tm.assert_series_equal(result, expected)

# Access by column label through .loc.
result = df.loc[:, "A"]
tm.assert_series_equal(result, expected)

def test_setitem_with_unaligned_tz_aware_datetime_column(self):
# GH 12981
# Assignment of unaligned offset-aware datetime series.
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/frame/indexing/test_sparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm
from pandas.arrays import SparseArray
from pandas.core.arrays.sparse import SparseDtype


# Indexing tests for DataFrames whose columns are backed by SparseArray.
class TestSparseDataFrameIndexing:
# Checks that selecting the only (sparse) column of a DataFrame returns a
# Series equal to one built directly from the same SparseArray, for each of
# the three access paths exercised below: [], .iloc and .loc.
def test_getitem_sparse_column(self):
# https://github.com/pandas-dev/pandas/issues/23559
data = SparseArray([0, 1])
df = pd.DataFrame({"A": data})
# The expected Series wraps the same SparseArray and carries the column name.
expected = pd.Series(data, name="A")
# Access by column label through plain __getitem__.
result = df["A"]
tm.assert_series_equal(result, expected)

# Access by column position.
result = df.iloc[:, 0]
tm.assert_series_equal(result, expected)

# Access by column label through .loc.
result = df.loc[:, "A"]
tm.assert_series_equal(result, expected)

# Regression test: row-indexing a DataFrame built from a scipy sparse matrix
# with an iterable must preserve both the values and the per-column
# SparseDtype. Parametrized over three scipy.sparse matrix classes and three
# element dtypes.
# NOTE(review): td.skip_if_no_scipy presumably skips the test when scipy is
# not installed -- confirm against pandas.util._test_decorators.
@pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
@pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
@td.skip_if_no_scipy
def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
# scipy is imported inside the test body rather than at module level.
import scipy.sparse

# Resolve the parametrized class name to the scipy.sparse constructor.
spmatrix_t = getattr(scipy.sparse, spmatrix_t)

# The bug is triggered by a sparse matrix with purely sparse columns. So the
# recipe below generates a rectangular matrix of dimension (5, 7) where all the
# diagonal cells are ones, meaning the last two columns are purely sparse.
rows, cols = 5, 7
spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype)
df = pd.DataFrame.sparse.from_spmatrix(spmatrix)

# regression test for #34526
# The row indexer is a range object, i.e. an iterable that is not a list.
itr_idx = range(2, rows)
# The selected rows must equal the same rows of the dense matrix.
result = df.loc[itr_idx].values
expected = spmatrix.toarray()[itr_idx]
tm.assert_numpy_array_equal(result, expected)

# regression test for #34540
# Every one of the `cols` columns must report a SparseDtype of the
# parametrized dtype with fill_value 0 after the selection.
result = df.loc[itr_idx].dtypes.values
expected = np.full(cols, SparseDtype(dtype, fill_value=0))
tm.assert_numpy_array_equal(result, expected)