diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 9b70bda82e247..fa20a110133ce 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -175,6 +175,7 @@ Other Enhancements
   (:issue:`21627`)
 - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
 - :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
+- :class:`SparseDataFrame` and :class:`SparseSeries` support value assignment (:issue:`21818`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
 - :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 700562386c838..47f07c01b7785 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -785,11 +785,9 @@ def iterrows(self):
         iteritems : Iterate over (column name, Series) pairs.
 
         """
-        columns = self.columns
-        klass = self._constructor_sliced
-        for k, v in zip(self.index, self.values):
-            s = klass(v, index=columns, name=k)
-            yield k, s
+        iloc = self.iloc
+        for i, k in enumerate(self.index):
+            yield k, iloc[i]
 
     def itertuples(self, index=True, name="Pandas"):
         """
@@ -2550,9 +2548,7 @@ def set_value(self, index, col, value, takeable=False):
 
         Returns
         -------
-        frame : DataFrame
-            If label pair is contained, will be reference to calling DataFrame,
-            otherwise a new object
+        self : DataFrame
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
@@ -2765,7 +2761,7 @@ def _getitem_multilevel(self, key):
             return self._get_item_cache(key)
 
     def _getitem_frame(self, key):
-        if key.values.size and not is_bool_dtype(key.values):
+        if key.size and not key.dtypes.map(is_bool_dtype).all():
             raise ValueError('Must pass DataFrame with boolean values only')
         return self.where(key)
 
@@ -3153,7 +3149,7 @@ def _setitem_frame(self, key, value):
                 )
             key = self._constructor(key, **self._construct_axes_dict())
 
-        if key.values.size and not is_bool_dtype(key.values):
+        if key.size and not key.dtypes.map(is_bool_dtype).all():
             raise TypeError(
                 'Must pass DataFrame or 2-d ndarray with boolean values only'
             )
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index f0635014b166b..1404792546a46 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -921,6 +921,9 @@ def _is_empty_indexer(indexer):
         if _is_empty_indexer(indexer):
             pass
 
+        elif is_sparse(values):
+            values = values.set_values(indexer, value)
+
         # setting a single element for each dim and with a rhs that could
         # be say a list
         # GH 6043
@@ -3154,6 +3157,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         return self.make_block_same_class(values=values,
                                           placement=self.mgr_locs)
 
+    def _can_hold_element(self, element):
+        return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
+
+    def _try_coerce_result(self, result):
+        if (isinstance(result, np.ndarray) and
+                np.ndim(result) == 1 and
+                not is_sparse(result)):
+            result = SparseArray(result, kind=self.kind,
+                                 fill_value=self.fill_value)
+        return result
+
     def __len__(self):
         try:
             return self.sp_index.length
@@ -3246,6 +3260,63 @@ def sparse_reindex(self, new_index):
         return self.make_block_same_class(values, sparse_index=new_index,
                                           placement=self.mgr_locs)
 
+    def where(self, other, cond, align=True, errors='raise',
+              try_cast=False, axis=0, transpose=False, mgr=None):
+        """
+        evaluate the block; return result block(s) from the result
+
+        Parameters
+        ----------
+        other : a ndarray/object
+        cond : the condition to respect
+        align : boolean, perform alignment on other/cond
+        errors : str, {'raise', 'ignore'}, default 'raise'
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object
+
+        axis : int
+        transpose : boolean
+            Set to True if self is stored with axes reversed
+
+        Returns
+        -------
+        a new sparse block(s), the result of the func
+        """
+        cond = getattr(cond, 'values', cond)
+        # For SparseBlock, self.values is always 1D.
+        # If cond was a frame, its 2D values would incorrectly broadcast
+        # later on.
+        if self.values.ndim == 1 and any(ax == 1 for ax in cond.shape):
+            cond = cond.ravel()
+
+        return super(SparseBlock, self).where(
+            other, cond, align=align, errors=errors, try_cast=try_cast,
+            axis=axis, transpose=transpose, mgr=mgr)
+
+    def putmask(self, mask, new, align=True, inplace=False, axis=0,
+                transpose=False, mgr=None):
+        """
+        putmask the data to the block; we must be a single block and not
+        generate other blocks
+
+        return the resulting block
+
+        Parameters
+        ----------
+        mask : the condition to respect
+        new : a ndarray/object
+        align : boolean, perform alignment on other/cond, default is True
+        inplace : perform inplace modification, default is False
+
+        Returns
+        -------
+        a new block, the result of the putmask
+        """
+        _, _, new, _ = self._try_coerce_args(self.values, new)
+        indexer = mask.to_dense().values.ravel().nonzero()[0]
+        block = self.setitem(indexer, new)
+        return [block]
+
 
 # -----------------------------------------------------------------
 # Constructor Helpers
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4b4fccccda4a0..9e70a476898f0 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1071,9 +1071,7 @@ def set_value(self, label, value, takeable=False):
 
         Returns
         -------
-        series : Series
-            If label is contained, will be reference to calling Series,
-            otherwise a new object
+        self : Series
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
index 6f0ffbff22028..8fba29034aeb3 100644
--- a/pandas/core/sparse/array.py
+++ b/pandas/core/sparse/array.py
@@ -37,6 +37,7 @@
 import pandas.core.algorithms as algos
 import pandas.core.ops as ops
 import pandas.io.formats.printing as printing
+from pandas.errors import PerformanceWarning
 from pandas.util._decorators import Appender
 from pandas.core.indexes.base import _index_shared_docs
 
@@ -369,6 +370,53 @@ def get_values(self, fill=None):
         """ return a dense representation """
         return self.to_dense(fill=fill)
 
+    def set_values(self, indexer, value):
+        """
+        Return SparseArray with indexed values set to `value`.
+
+        Returns
+        -------
+        SparseArray
+            ``self`` (mutated) if the position is stored, else a new array.
+        """
+        # If indexer is not a single int position, easiest to handle via dense
+        if not is_scalar(indexer):
+            warnings.warn(
+                'Setting SparseSeries/Array values is inefficient when '
+                'indexing with multiple keys because the whole series '
+                'is made dense interim.',
+                PerformanceWarning, stacklevel=2)
+
+            values = self.to_dense()
+            values[indexer] = value
+            return SparseArray(values, kind=self.kind,
+                               fill_value=self.fill_value)
+
+        # If position already in sparse index, just switch the value in place
+        idx = self.sp_index.lookup(indexer)
+        if idx != -1:
+            self.sp_values[idx] = value
+            return self
+
+        warnings.warn(
+            'Setting new SparseSeries values is inefficient '
+            '(a copy of data is made).', PerformanceWarning, stacklevel=2)
+
+        # Otherwise, construct a new array, and insert the new value in the
+        # correct position
+        indices = self.sp_index.to_int_index().indices
+        pos = np.searchsorted(indices, indexer)
+
+        indices = np.insert(indices, pos, indexer)
+        sp_values = np.insert(self.sp_values, pos, value)
+
+        # Length can be increased when adding a new value into index
+        length = max(self.sp_index.length, indexer + 1)
+        sp_index = _make_index(length, indices, self.kind)
+
+        return SparseArray(sp_values, sparse_index=sp_index,
+                           fill_value=self.fill_value)
+
     def to_dense(self, fill=None):
         """
         Convert SparseArray to a NumPy array.
@@ -544,6 +592,10 @@ def astype(self, dtype=None, copy=True):
         return self._simple_new(sp_values, self.sp_index,
                                 fill_value=fill_value)
 
+    def tolist(self):
+        """Return *dense* self as list"""
+        return self.values.tolist()
+
     def copy(self, deep=True):
         """
         Make a copy of the SparseArray. Only the actual sparse values need to
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 58e3001bcfe6a..d43f1ce5ffef0 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -330,10 +330,11 @@ def _apply_columns(self, func):
 
         return self._constructor(
             data=new_data, index=self.index, columns=self.columns,
-            default_fill_value=self.default_fill_value).__finalize__(self)
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
-    def astype(self, dtype):
-        return self._apply_columns(lambda x: x.astype(dtype))
+    def astype(self, dtype, **kwargs):
+        return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
 
     def copy(self, deep=True):
         """
@@ -464,44 +465,6 @@ def _get_value(self, index, col, takeable=False):
         return series._get_value(index, takeable=takeable)
     _get_value.__doc__ = get_value.__doc__
 
-    def set_value(self, index, col, value, takeable=False):
-        """
-        Put single value at passed column and index
-
-        .. deprecated:: 0.21.0
-
-        Please use .at[] or .iat[] accessors.
-
-        Parameters
-        ----------
-        index : row label
-        col : column label
-        value : scalar value
-        takeable : interpret the index/col as indexers, default False
-
-        Notes
-        -----
-        This method *always* returns a new object. It is currently not
-        particularly efficient (and potentially very expensive) but is provided
-        for API compatibility with DataFrame
-
-        Returns
-        -------
-        frame : DataFrame
-        """
-        warnings.warn("set_value is deprecated and will be removed "
-                      "in a future release. Please use "
-                      ".at[] or .iat[] accessors instead", FutureWarning,
-                      stacklevel=2)
-        return self._set_value(index, col, value, takeable=takeable)
-
-    def _set_value(self, index, col, value, takeable=False):
-        dense = self.to_dense()._set_value(
-            index, col, value, takeable=takeable)
-        return dense.to_sparse(kind=self._default_kind,
-                               fill_value=self._default_fill_value)
-    _set_value.__doc__ = set_value.__doc__
-
     def _slice(self, slobj, axis=0, kind=None):
         if axis == 0:
             new_index = self.index[slobj]
@@ -576,7 +539,8 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
 
         return self._constructor(data=new_data, index=new_index,
                                  columns=new_columns,
-                                 default_fill_value=new_fill_value
+                                 default_fill_value=new_fill_value,
+                                 default_kind=self.default_kind,
                                  ).__finalize__(self)
 
     def _combine_match_index(self, other, func, level=None):
@@ -605,7 +569,8 @@ def _combine_match_index(self, other, func, level=None):
 
         return self._constructor(
             new_data, index=new_index, columns=self.columns,
-            default_fill_value=fill_value).__finalize__(self)
+            default_fill_value=fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
     def _combine_match_columns(self, other, func, level=None, try_cast=True):
         # patched version of DataFrame._combine_match_columns to account for
@@ -629,7 +594,8 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True):
 
         return self._constructor(
             new_data, index=self.index, columns=union,
-            default_fill_value=self.default_fill_value).__finalize__(self)
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
     def _combine_const(self, other, func, errors='raise', try_cast=True):
         return self._apply_columns(lambda x: func(x, other))
@@ -673,7 +639,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
 
         return self._constructor(
             new_series, index=index, columns=self.columns,
-            default_fill_value=self._default_fill_value).__finalize__(self)
+            default_fill_value=self._default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
     def _reindex_columns(self, columns, method, copy, level, fill_value=None,
                          limit=None, takeable=False):
@@ -693,7 +660,8 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None,
         sdict = {k: v for k, v in compat.iteritems(self) if k in columns}
         return self._constructor(
             sdict, index=self.index, columns=columns,
-            default_fill_value=self._default_fill_value).__finalize__(self)
+            default_fill_value=self._default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
     def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
                                limit=None, copy=False, allow_dups=False):
@@ -725,8 +693,10 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
             else:
                 new_arrays[col] = self[col]
 
-        return self._constructor(new_arrays, index=index,
-                                 columns=columns).__finalize__(self)
+        return self._constructor(
+            new_arrays, index=index, columns=columns,
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)
 
     def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
                      sort=False):
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
index 8ac5d81f23bb2..78558f5e7546d 100644
--- a/pandas/core/sparse/series.py
+++ b/pandas/core/sparse/series.py
@@ -18,7 +18,6 @@
 import pandas.core.common as com
 import pandas.core.indexes.base as ibase
 import pandas.core.ops as ops
-import pandas._libs.index as libindex
 from pandas.util._decorators import Appender
 
 from pandas.core.sparse.array import (
@@ -278,8 +277,15 @@ def __array_wrap__(self, result, context=None):
         else:
             fill_value = self.fill_value
 
+        # Only reuse old sparse index if result size matches
+        # (fails e.g. for ~sparseseries)
+        if np.size(result) == self.sp_index.npoints:
+            sp_index = self.sp_index
+        else:
+            sp_index = None
+
         return self._constructor(result, index=self.index,
-                                 sparse_index=self.sp_index,
+                                 sparse_index=sp_index,
                                  fill_value=fill_value,
                                  copy=False).__finalize__(self)
 
@@ -480,44 +486,30 @@ def set_value(self, label, value, takeable=False):
 
         Returns
         -------
-        series : SparseSeries
+        self : SparseSeries
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
                       ".at[] or .iat[] accessors instead", FutureWarning,
                       stacklevel=2)
+        self._data = self._data.copy()
         return self._set_value(label, value, takeable=takeable)
 
     def _set_value(self, label, value, takeable=False):
-        values = self.to_dense()
-
-        # if the label doesn't exist, we will create a new object here
-        # and possibly change the index
-        new_values = values._set_value(label, value, takeable=takeable)
-        if new_values is not None:
-            values = new_values
-        new_index = values.index
-        values = SparseArray(values, fill_value=self.fill_value,
-                             kind=self.kind)
-        self._data = SingleBlockManager(values, new_index)
-        self._index = new_index
+        if takeable:
+            # ``label`` is already an integer position, not an index label
+            idx = label
+        else:
+            try:
+                idx = self.index.get_loc(label)
+            except KeyError:
+                # unseen label: append it to the index and set the new slot
+                idx = len(self)
+                self._data.axes[0] = self._data.index.append(Index([label]))
+        self._data = self._data.setitem(indexer=idx, value=value)
+        return self
     _set_value.__doc__ = set_value.__doc__
 
-    def _set_values(self, key, value):
-
-        # this might be inefficient as we have to recreate the sparse array
-        # rather than setting individual elements, but have to convert
-        # the passed slice/boolean that's in dense space into a sparse indexer
-        # not sure how to do that!
-        if isinstance(key, Series):
-            key = key.values
-
-        values = self.values.to_dense()
-        values[key] = libindex.convert_scalar(values, value)
-        values = SparseArray(values, fill_value=self.fill_value,
-                             kind=self.kind)
-        self._data = SingleBlockManager(values, self.index)
-
     def to_dense(self, sparse_only=False):
         """
         Convert SparseSeries to a Series.
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
index be5a1710119ee..07b8267ee7510 100644
--- a/pandas/tests/sparse/frame/test_frame.py
+++ b/pandas/tests/sparse/frame/test_frame.py
@@ -457,7 +457,6 @@ def test_iloc(self):
                                  iframe.iloc[:, 0].sp_index)
 
     def test_set_value(self):
-
         # ok, as the index gets converted to object
         frame = self.frame.copy()
         with tm.assert_produces_warning(FutureWarning,
@@ -471,7 +470,6 @@ def test_set_value(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             res = self.frame.set_value('foobar', 'B', 1.5)
-        assert res is not self.frame
         assert res.index[-1] == 'foobar'
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
@@ -480,9 +478,8 @@ def test_set_value(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             res2 = res.set_value('foobar', 'qux', 1.5)
-        assert res2 is not res
         tm.assert_index_equal(res2.columns,
-                              pd.Index(list(self.frame.columns) + ['qux']))
+                              pd.Index(list(self.frame.columns)))
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             assert res2.get_value('foobar', 'qux') == 1.5
@@ -1302,3 +1299,54 @@ def test_assign_with_sparse_frame(self):
 
         for column in res.columns:
             assert type(res[column]) is SparseSeries
+
+
+def _test_assignment(kind, indexer, key=None):
+    arr = np.array([[1, nan],
+                    [nan, 1]])
+    df = DataFrame(arr, copy=True)
+    sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
+
+    def get_indexer(df):
+        return getattr(df, indexer) if indexer else df
+
+    if key is None:
+        key = pd.isnull(sdf).to_sparse()
+
+    get_indexer(sdf)[key] = 2
+
+    get_indexer(df)[key] = 2
+    res = df.to_sparse(kind=kind)
+
+    tm.assert_sp_frame_equal(sdf, res)
+
+
+@pytest.fixture(params=['integer', 'block'])
+def spindex_kind(request):
+    return request.param
+
+
+@pytest.mark.parametrize('indexer', ['iat'])
+@pytest.mark.parametrize('key', [(0, 0)])
+def test_frame_assignment_at(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
+
+
+@pytest.mark.parametrize('indexer', ['at', 'loc', 'iloc'])
+@pytest.mark.parametrize('key', [0,
+                                 [0, 1],
+                                 [True, False]])
+def test_frame_assignment_loc(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
+
+
+@pytest.mark.parametrize('key', [None,
+                                 [True, False]])
+def test_frame_assignment_setitem(spindex_kind, key):
+    _test_assignment(spindex_kind, None, key)
+
+
+@pytest.mark.parametrize('indexer', ['loc', 'at'])
+@pytest.mark.parametrize('key', [3])
+def test_frame_assignment_extend_index(spindex_kind, indexer, key):
+    _test_assignment(spindex_kind, indexer, key)
diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py
index 921c30234660f..916fca5d8182c 100644
--- a/pandas/tests/sparse/series/test_series.py
+++ b/pandas/tests/sparse/series/test_series.py
@@ -484,7 +484,6 @@ def test_get_get_value(self):
                 self.bseries.get_value(10), self.bseries[10])
 
     def test_set_value(self):
-
         idx = self.btseries.index[7]
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
@@ -1456,3 +1455,29 @@ def test_constructor_dict_datetime64_index(datetime_type):
     expected = SparseSeries(values, map(pd.Timestamp, dates))
 
     tm.assert_sp_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('kind', ['integer', 'block'])
+@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
+@pytest.mark.parametrize('key', [0, [0, 1], 2, [2, 3],
+                                 np.r_[True, False, False, False],
+                                 np.r_[False, False, False, True]])
+def test_series_assignment(kind, indexer, key):
+    is_multikey = np.asarray(key).ndim > 0
+    skip_multikey = 'at' in (indexer or '')
+    if is_multikey and skip_multikey:
+        pytest.skip('at/iat only accept scalar keys')
+
+    arr = np.array([0., 0., nan, nan])
+    ss = SparseSeries(arr, kind=kind)
+    assert len(ss.sp_index.to_int_index().indices) == 2
+
+    res = arr.copy()
+    res[key] = 1
+    res = SparseSeries(res, kind=kind)
+
+    ss_setitem = getattr(ss, indexer) if indexer else ss
+
+    ss_setitem[key] = 1
+
+    tm.assert_sp_series_equal(ss, res)
diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py
index 2790464e2f811..f82491e8ac7e1 100644
--- a/pandas/tests/sparse/test_array.py
+++ b/pandas/tests/sparse/test_array.py
@@ -928,3 +928,9 @@ def test_ufunc_args(self):
         sparse = SparseArray([1, -1, 0, -2], fill_value=0)
         result = SparseArray([2, 0, 1, -1], fill_value=1)
         tm.assert_sp_array_equal(np.add(sparse, 1), result)
+
+    def test_tolist(self):
+        sparse = SparseArray([1, np.nan, 2, np.nan, -2])
+        assert isinstance(sparse.tolist(), list)
+        tm.assert_numpy_array_equal(np.array(sparse.tolist()),
+                                    np.array([1, np.nan, 2, np.nan, -2]))
diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py
index d983bd209085a..238d7e2f9be05 100644
--- a/pandas/tests/sparse/test_format.py
+++ b/pandas/tests/sparse/test_format.py
@@ -9,7 +9,6 @@
                            is_platform_32bit)
 from pandas.core.config import option_context
 
-
 use_32bit_repr = is_platform_windows() or is_platform_32bit()
 
 