ENH: Allow SparseDataFrame/SparseSeries values assignment

kernc · kernc · commit c86a18965a40 · 2018-02-23T07:05:44.000+01:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -752,9 +752,9 @@ def iterrows(self):
         iteritems : Iterate over (column name, Series) pairs.
 
         """
-        iloc = self.iloc
+        row_at = self.iloc.__getitem__
         for i, k in enumerate(self.index):
-            yield k, iloc[i]
+            yield k, row_at(i)
 
     def itertuples(self, index=True, name="Pandas"):
         """
@@ -2068,9 +2068,7 @@ def set_value(self, index, col, value, takeable=False):
 
         Returns
         -------
-        frame : DataFrame
-            If label pair is contained, will be reference to calling DataFrame,
-            otherwise a new object
+        self : DataFrame
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -913,6 +913,9 @@ def _is_empty_indexer(indexer):
         if _is_empty_indexer(indexer):
             pass
 
+        elif is_sparse(values):
+            values = values.set_values(indexer, value)
+
         # setting a single element for each dim and with a rhs that could
         # be say a list
         # GH 6043
@@ -1795,10 +1798,15 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
         new_values = self.values if inplace else self.copy().values
         new_values, _, new, _ = self._try_coerce_args(new_values, new)
 
+        if is_sparse(new_values):
+            indexer = mask.to_dense().values.ravel().nonzero()[0]
+            block = self.setitem(indexer, new)
+            return [block]
+
         if isinstance(new, np.ndarray) and len(new) == len(mask):
             new = new[mask]
 
-        mask = _safe_reshape(mask, new_values.shape)
+        mask = _safe_reshape(np.asarray(mask), new_values.shape)
 
         new_values[mask] = new
         new_values = self._try_coerce_result(new_values)
@@ -2947,6 +2955,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         return self.make_block_same_class(values=values,
                                           placement=self.mgr_locs)
 
+    def _can_hold_element(self, element):
+        return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
+
+    def _try_coerce_result(self, result):
+        if (isinstance(result, np.ndarray) and
+                np.ndim(result) > 0
+                and not is_sparse(result)):
+            result = SparseArray(result, kind=self.kind,
+                                 fill_value=self.fill_value, dtype=self.dtype)
+        return result
+
     def __len__(self):
         try:
             return self.sp_index.length
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -965,9 +965,7 @@ def set_value(self, label, value, takeable=False):
 
         Returns
         -------
-        series : Series
-            If label is contained, will be reference to calling Series,
-            otherwise a new object
+        self : Series
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
@@ -37,6 +37,7 @@
 import pandas.core.algorithms as algos
 import pandas.core.ops as ops
 import pandas.io.formats.printing as printing
+from pandas.errors import PerformanceWarning
 from pandas.util._decorators import Appender
 from pandas.core.indexes.base import _index_shared_docs
 
@@ -373,6 +374,53 @@ def get_values(self, fill=None):
         """ return a dense representation """
         return self.to_dense(fill=fill)
 
+    def set_values(self, indexer, value):
+        """
+        Return new SparseArray with indexed values set to `value`.
+
+        Returns
+        -------
+        SparseArray
+            A new sparse array with indexer positions filled with value.
+        """
+        # If indexer is not a single int position, easiest to handle via dense
+        if not is_scalar(indexer):
+            warnings.warn(
+                'Setting SparseSeries/Array values is particularly '
+                'inefficient when indexing with multiple keys because the '
+                'whole series is made dense interim.',
+                PerformanceWarning, stacklevel=2)
+
+            values = self.to_dense()
+            values[indexer] = value
+            return SparseArray(values, kind=self.kind,
+                               fill_value=self.fill_value)
+
+        warnings.warn(
+            'Setting SparseSeries/Array values is inefficient '
+            '(a copy of data is made).', PerformanceWarning, stacklevel=2)
+
+        # If position already in sparse index, just switch the value on a copy
+        idx = self.sp_index.lookup(indexer)
+        if idx != -1:
+            obj = self.copy()
+            obj.sp_values[idx] = value
+            return obj
+
+        # Otherwise, construct a new array, and insert the new value in the
+        # correct position
+        indices = self.sp_index.to_int_index().indices
+        pos = np.searchsorted(indices, indexer)
+
+        indices = np.insert(indices, pos, indexer)
+        sp_values = np.insert(self.sp_values, pos, value)
+        # The length grows when the new position lies past the current end
+        length = max(self.sp_index.length, indexer + 1)
+        sp_index = _make_index(length, indices, self.kind)
+
+        return SparseArray(sp_values, sparse_index=sp_index,
+                           fill_value=self.fill_value)
+
     def to_dense(self, fill=None):
         """
         Convert SparseArray to a NumPy array.
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -326,8 +326,8 @@ def _apply_columns(self, func):
             default_fill_value=self.default_fill_value,
             default_kind=self.default_kind).__finalize__(self)
 
-    def astype(self, dtype):
-        return self._apply_columns(lambda x: x.astype(dtype))
+    def astype(self, dtype, **kwargs):
+        return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
 
     def copy(self, deep=True):
         """
@@ -469,44 +469,6 @@ def _get_value(self, index, col, takeable=False):
         return series._get_value(index, takeable=takeable)
     _get_value.__doc__ = get_value.__doc__
 
-    def set_value(self, index, col, value, takeable=False):
-        """
-        Put single value at passed column and index
-
-        .. deprecated:: 0.21.0
-
-        Please use .at[] or .iat[] accessors.
-
-        Parameters
-        ----------
-        index : row label
-        col : column label
-        value : scalar value
-        takeable : interpret the index/col as indexers, default False
-
-        Notes
-        -----
-        This method *always* returns a new object. It is currently not
-        particularly efficient (and potentially very expensive) but is provided
-        for API compatibility with DataFrame
-
-        Returns
-        -------
-        frame : DataFrame
-        """
-        warnings.warn("set_value is deprecated and will be removed "
-                      "in a future release. Please use "
-                      ".at[] or .iat[] accessors instead", FutureWarning,
-                      stacklevel=2)
-        return self._set_value(index, col, value, takeable=takeable)
-
-    def _set_value(self, index, col, value, takeable=False):
-        dense = self.to_dense()._set_value(
-            index, col, value, takeable=takeable)
-        return dense.to_sparse(kind=self._default_kind,
-                               fill_value=self._default_fill_value)
-    _set_value.__doc__ = set_value.__doc__
-
     def _slice(self, slobj, axis=0, kind=None):
         if axis == 0:
             new_index = self.index[slobj]
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -9,6 +9,7 @@
 import warnings
 
 from pandas.core.dtypes.missing import isna, notna
+from pandas.core.dtypes.common import is_sparse
 
 from pandas.compat.numpy import function as nv
 from pandas.core.index import Index, _ensure_index, InvalidIndexError
@@ -17,7 +18,6 @@
 from pandas.core import generic
 import pandas.core.common as com
 import pandas.core.ops as ops
-import pandas._libs.index as libindex
 from pandas.util._decorators import Appender
 
 from pandas.core.sparse.array import (
@@ -279,8 +279,13 @@ def __array_wrap__(self, result, context=None):
         else:
             fill_value = self.fill_value
 
+        # Assumed (unverified): if sizes match, the old sparse index is valid
+        if np.size(result) == self.sp_index.npoints:
+            sp_index = self.sp_index
+        else:
+            sp_index = None
         return self._constructor(result, index=self.index,
-                                 sparse_index=self.sp_index,
+                                 sparse_index=sp_index,
                                  fill_value=fill_value,
                                  copy=False).__finalize__(self)
 
@@ -481,7 +486,7 @@ def set_value(self, label, value, takeable=False):
 
         Returns
         -------
-        series : SparseSeries
+        self : SparseSeries
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
@@ -490,35 +495,16 @@ def set_value(self, label, value, takeable=False):
         return self._set_value(label, value, takeable=takeable)
 
     def _set_value(self, label, value, takeable=False):
-        values = self.to_dense()
-
-        # if the label doesn't exist, we will create a new object here
-        # and possibly change the index
-        new_values = values._set_value(label, value, takeable=takeable)
-        if new_values is not None:
-            values = new_values
-        new_index = values.index
-        values = SparseArray(values, fill_value=self.fill_value,
-                             kind=self.kind)
-        self._data = SingleBlockManager(values, new_index)
-        self._index = new_index
+        self._data = self._data.copy()
+        try:
+            idx = self.index.get_loc(label)
+        except KeyError:
+            idx = len(self)
+            self._data.axes[0] = self._data.index.append(Index([label]))
+        self._data = self._data.setitem(indexer=idx, value=value)
+        return self
     _set_value.__doc__ = set_value.__doc__
 
-    def _set_values(self, key, value):
-
-        # this might be inefficient as we have to recreate the sparse array
-        # rather than setting individual elements, but have to convert
-        # the passed slice/boolean that's in dense space into a sparse indexer
-        # not sure how to do that!
-        if isinstance(key, Series):
-            key = key.values
-
-        values = self.values.to_dense()
-        values[key] = libindex.convert_scalar(values, value)
-        values = SparseArray(values, fill_value=self.fill_value,
-                             kind=self.kind)
-        self._data = SingleBlockManager(values, self.index)
-
     def to_dense(self, sparse_only=False):
         """
         Convert SparseSeries to a Series.
@@ -544,8 +530,10 @@ def to_dense(self, sparse_only=False):
             index = self.index.take(int_index.indices)
             return Series(self.sp_values, index=index, name=self.name)
         else:
-            return Series(self.values.to_dense(), index=self.index,
-                          name=self.name)
+            values = self.values
+            if is_sparse(values):
+                values = values.to_dense()
+            return Series(values, index=self.index, name=self.name)
 
     @property
     def density(self):
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
@@ -10,6 +10,7 @@
 
 from pandas import Series, DataFrame, bdate_range, Panel
 from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.errors import PerformanceWarning
 from pandas.tseries.offsets import BDay
 from pandas.util import testing as tm
 from pandas.compat import lrange
@@ -459,7 +460,6 @@ def test_set_value(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             res = self.frame.set_value('foobar', 'B', 1.5)
-        assert res is not self.frame
         assert res.index[-1] == 'foobar'
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
@@ -468,9 +468,8 @@ def test_set_value(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             res2 = res.set_value('foobar', 'qux', 1.5)
-        assert res2 is not res
         tm.assert_index_equal(res2.columns,
-                              pd.Index(list(self.frame.columns) + ['qux']))
+                              pd.Index(list(self.frame.columns)))
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             assert res2.get_value('foobar', 'qux') == 1.5
@@ -1268,3 +1267,54 @@ def test_assign_with_sparse_frame(self):
 
         for column in res.columns:
             assert type(res[column]) is SparseSeries
+
+
+def _test_assignment(kind, indexer, key=None):
+    arr = np.array([[1, nan],
+                    [nan, 1]])
+    df = DataFrame(arr, copy=True)
+    sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
+
+    def get_indexer(df):
+        return getattr(df, indexer) if indexer else df
+
+    if key is None:
+        key = pd.isnull(sdf).to_sparse()
+
+    get_indexer(sdf)[key] = 2
+
+    get_indexer(df)[key] = 2
+    res = df.to_sparse(kind=kind)
+
+    tm.assert_sp_frame_equal(sdf, res)
+
+
+@pytest.fixture(params=['integer', 'block'])
+def spindex_kind(request):
+    return request.param
+
+
+@pytest.mark.parametrize('indexer', ['iat'])
+@pytest.mark.parametrize('key', [(0, 0)])
+def test_frame_assignment_at(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
+
+
+@pytest.mark.parametrize('indexer', ['at', 'loc', 'iloc'])
+@pytest.mark.parametrize('key', [0,
+                                 [0, 1],
+                                 [True, False]])
+def test_frame_assignment_loc(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
+
+
+@pytest.mark.parametrize('key', [None,
+                                 [True, False]])
+def test_frame_assignment_setitem(spindex_kind, key):
+    _test_assignment(spindex_kind, None, key)
+
+
+@pytest.mark.parametrize('indexer', ['loc', 'at'])
+@pytest.mark.parametrize('key', [3])
+def test_frame_assignment_extend_index(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py