ENH: SparseDataFrame/SparseSeries value assignment #17785

Closed · wants to merge 6 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -175,6 +175,7 @@ Other Enhancements
(:issue:`21627`)
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
+- :class:`SparseDataFrame` and :class:`SparseSeries` support value assignment (:issue:`21818`)
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
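For context, a minimal sketch of the behavior this whatsnew entry enables (a hypothetical session, assuming a build that includes this branch):

```python
import pandas as pd

s = pd.SparseSeries([1.0, 0.0, 0.0, 2.0], fill_value=0.0)

# Assigning to a position already held in the sparse index swaps
# the value in place.
s[0] = 5.0

# Assigning through a list of keys densifies the data in the interim
# and emits a PerformanceWarning before returning to sparse.
s[[1, 2]] = 3.0
```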
16 changes: 6 additions & 10 deletions pandas/core/frame.py
@@ -785,11 +785,9 @@ def iterrows(self):
iteritems : Iterate over (column name, Series) pairs.

"""
-        columns = self.columns
-        klass = self._constructor_sliced
-        for k, v in zip(self.index, self.values):
-            s = klass(v, index=columns, name=k)
-            yield k, s
+        iloc = self.iloc
+        for i, k in enumerate(self.index):
+            yield k, iloc[i]
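The rewritten loop slices each row through ``iloc`` instead of ``zip(self.index, self.values)``; calling ``.values`` would consolidate the frame into one dense 2D ndarray up front, which defeats sparse storage. A rough illustration of the intent (hypothetical data):

```python
import pandas as pd

sdf = pd.SparseDataFrame({'a': [1.0, 0.0], 'b': [0.0, 2.0]},
                         default_fill_value=0.0)

# Each row is built on demand from a positional slice; the frame is
# never densified as a whole.
for label, row in sdf.iterrows():
    print(label, list(row))
```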

def itertuples(self, index=True, name="Pandas"):
"""
@@ -2550,9 +2548,7 @@ def set_value(self, index, col, value, takeable=False):

Returns
-------
-        frame : DataFrame
-            If label pair is contained, will be reference to calling DataFrame,
-            otherwise a new object
+        self : DataFrame
"""
warnings.warn("set_value is deprecated and will be removed "
"in a future release. Please use "
@@ -2765,7 +2761,7 @@ def _getitem_multilevel(self, key):
return self._get_item_cache(key)

def _getitem_frame(self, key):
-        if key.values.size and not is_bool_dtype(key.values):
+        if key.size and not key.dtypes.map(is_bool_dtype).all():
raise ValueError('Must pass DataFrame with boolean values only')
return self.where(key)

@@ -3153,7 +3149,7 @@ def _setitem_frame(self, key, value):
)
key = self._constructor(key, **self._construct_axes_dict())

-        if key.values.size and not is_bool_dtype(key.values):
+        if key.size and not key.dtypes.map(is_bool_dtype).all():
raise TypeError(
'Must pass DataFrame or 2-d ndarray with boolean values only'
)
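Both mask checks above now test dtypes column by column rather than calling ``key.values``, which would consolidate (and, for sparse frames, densify) the whole key before a single dtype test. A small sketch of the new predicate on hypothetical data:

```python
import pandas as pd
from pandas.api.types import is_bool_dtype

key = pd.DataFrame({'a': [True, False], 'b': [False, True]})

# Per-column boolean check; no consolidated 2D array is materialized.
assert key.size and key.dtypes.map(is_bool_dtype).all()
```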
71 changes: 71 additions & 0 deletions pandas/core/internals/blocks.py
@@ -921,6 +921,9 @@ def _is_empty_indexer(indexer):
if _is_empty_indexer(indexer):
pass

+        elif is_sparse(values):
+            values = values.set_values(indexer, value)

# setting a single element for each dim and with a rhs that could
# be say a list
# GH 6043
@@ -3154,6 +3157,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
return self.make_block_same_class(values=values,
placement=self.mgr_locs)

+    def _can_hold_element(self, element):
+        return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
+
+    def _try_coerce_result(self, result):
+        if (isinstance(result, np.ndarray) and
+                np.ndim(result) == 1 and
+                not is_sparse(result)):
+            result = SparseArray(result, kind=self.kind,
+                                 fill_value=self.fill_value)
+        return result
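``_can_hold_element`` leans on NumPy's casting rules against the dtype of the underlying ``sp_values``; roughly, for a float64-backed block:

```python
import numpy as np

sp_values = np.array([1.0, 2.0])  # stand-in for self.sp_values

# An integer scalar casts safely to float64; a string does not.
assert np.can_cast(np.asarray(3).dtype, sp_values.dtype)
assert not np.can_cast(np.asarray('x').dtype, sp_values.dtype)
```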

def __len__(self):
try:
return self.sp_index.length
@@ -3246,6 +3260,63 @@ def sparse_reindex(self, new_index):
return self.make_block_same_class(values, sparse_index=new_index,
placement=self.mgr_locs)

+    def where(self, other, cond, align=True, errors='raise',
+              try_cast=False, axis=0, transpose=False, mgr=None):
+        """
+        evaluate the block; return result block(s) from the result
+
+        Parameters
+        ----------
+        other : a ndarray/object
+        cond : the condition to respect
+        align : boolean, perform alignment on other/cond
+        errors : str, {'raise', 'ignore'}, default 'raise'
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object
+        axis : int
+        transpose : boolean
+            Set to True if self is stored with axes reversed
+
+        Returns
+        -------
+        a new sparse block(s), the result of the func
+        """
+        cond = getattr(cond, 'values', cond)
+        # For SparseBlock, self.values is always 1D.
+        # If cond was a frame, its 2D values would incorrectly broadcast
+        # later on.
+        if self.values.ndim == 1 and any(ax == 1 for ax in cond.shape):
+            cond = cond.ravel()
+
+        return super(SparseBlock, self).where(
+            other, cond, align=align, errors=errors, try_cast=try_cast,
+            axis=axis, transpose=transpose, mgr=mgr)
+
+    def putmask(self, mask, new, align=True, inplace=False, axis=0,
+                transpose=False, mgr=None):
+        """
+        putmask the data to the block; we must be a single block and not
+        generate other blocks
+
+        return the resulting block
+
+        Parameters
+        ----------
+        mask : the condition to respect
+        new : a ndarray/object
+        align : boolean, perform alignment on other/cond, default is True
+        inplace : perform inplace modification, default is False
+
+        Returns
+        -------
+        a new block, the result of the putmask
+        """
+        _, _, new, _ = self._try_coerce_args(self.values, new)
+        indexer = mask.to_dense().values.ravel().nonzero()[0]
+        block = self.setitem(indexer, new)
+        return [block]
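``putmask`` reduces to ``setitem`` by first turning the boolean mask into positional indices; a sketch of that conversion with a hypothetical sparse mask:

```python
import pandas as pd

mask = pd.SparseSeries([False, True, True], fill_value=False)

# Densify the mask and collect the positions to assign, mirroring
# the indexer line above.
indexer = mask.to_dense().values.ravel().nonzero()[0]
print(indexer)  # [1 2]
```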


# -----------------------------------------------------------------
# Constructor Helpers
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/series.py
@@ -1071,9 +1071,7 @@ def set_value(self, label, value, takeable=False):

Returns
-------
-        series : Series
-            If label is contained, will be reference to calling Series,
-            otherwise a new object
+        self : Series
"""
warnings.warn("set_value is deprecated and will be removed "
"in a future release. Please use "
52 changes: 52 additions & 0 deletions pandas/core/sparse/array.py
@@ -37,6 +37,7 @@
import pandas.core.algorithms as algos
import pandas.core.ops as ops
import pandas.io.formats.printing as printing
+from pandas.errors import PerformanceWarning
from pandas.util._decorators import Appender
from pandas.core.indexes.base import _index_shared_docs

@@ -369,6 +370,53 @@ def get_values(self, fill=None):
""" return a dense representation """
return self.to_dense(fill=fill)

+    def set_values(self, indexer, value):
+        """
+        Return new SparseArray with indexed values set to `value`.
+
+        Returns
+        -------
+        SparseArray
+            A new sparse array with indexer positions filled with value.
+        """
+        # If indexer is not a single int position, easiest to handle via dense
+        if not is_scalar(indexer):
+            warnings.warn(
+                'Setting SparseSeries/Array values is inefficient when '
+                'indexing with multiple keys because the whole series '
+                'is made dense in the interim.',
+                PerformanceWarning, stacklevel=2)
+
+            values = self.to_dense()
+            values[indexer] = value
+            return SparseArray(values, kind=self.kind,
+                               fill_value=self.fill_value)
+
+        # If the position is already in the sparse index, just set the
+        # value in place
+        idx = self.sp_index.lookup(indexer)
+        if idx != -1:
+            self.sp_values[idx] = value
+            return self
+
+        warnings.warn(
+            'Setting new SparseSeries values is inefficient '
+            '(a copy of data is made).', PerformanceWarning, stacklevel=2)
+
+        # Otherwise, construct a new array and insert the new value in the
+        # correct position
+        indices = self.sp_index.to_int_index().indices
+        pos = np.searchsorted(indices, indexer)
+
+        indices = np.insert(indices, pos, indexer)
+        sp_values = np.insert(self.sp_values, pos, value)
+
+        # Length can be increased when adding a new value into index
+        length = max(self.sp_index.length, indexer + 1)
+        sp_index = _make_index(length, indices, self.kind)
+
+        return SparseArray(sp_values, sparse_index=sp_index,
+                           fill_value=self.fill_value)
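A usage sketch covering the three paths (in-place hit on the sparse index, ``searchsorted`` insertion of a new position, and the dense round-trip for array indexers), assuming a build with this branch:

```python
import numpy as np
from pandas import SparseArray

arr = SparseArray([1.0, np.nan, 3.0])  # positions 0 and 2 are held

# Position 0 is already in the sparse index: value swapped in place.
arr = arr.set_values(0, 9.0)

# Position 1 holds the fill value: a new index entry is inserted at
# the right spot via np.searchsorted; a new SparseArray is returned.
arr = arr.set_values(1, 2.0)

# Array indexers densify in the interim and emit PerformanceWarning.
arr = arr.set_values([0, 2], 0.0)
```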

def to_dense(self, fill=None):
"""
Convert SparseArray to a NumPy array.
@@ -544,6 +592,10 @@ def astype(self, dtype=None, copy=True):
return self._simple_new(sp_values, self.sp_index,
fill_value=fill_value)

+    def tolist(self):
+        """Return *dense* self as list"""
+        return self.values.tolist()
Review comment (Contributor): test for this?

Reply (Author): Added.
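The added test might look roughly like this (a sketch in the style of the pandas sparse tests; names are illustrative):

```python
import numpy as np
from pandas import SparseArray

def test_tolist():
    arr = SparseArray([1.0, np.nan, 3.0])
    # tolist densifies first, so fill values come back as well.
    result = arr.tolist()
    assert result[0] == 1.0
    assert np.isnan(result[1])
    assert result[2] == 3.0
```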


def copy(self, deep=True):
"""
Make a copy of the SparseArray. Only the actual sparse values need to
66 changes: 18 additions & 48 deletions pandas/core/sparse/frame.py
@@ -330,10 +330,11 @@ def _apply_columns(self, func):

return self._constructor(
data=new_data, index=self.index, columns=self.columns,
-            default_fill_value=self.default_fill_value).__finalize__(self)
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)

-    def astype(self, dtype):
-        return self._apply_columns(lambda x: x.astype(dtype))
+    def astype(self, dtype, **kwargs):
+        return self._apply_columns(lambda x: x.astype(dtype, **kwargs))

def copy(self, deep=True):
"""
@@ -464,44 +465,6 @@ def _get_value(self, index, col, takeable=False):
return series._get_value(index, takeable=takeable)
_get_value.__doc__ = get_value.__doc__

-    def set_value(self, index, col, value, takeable=False):
-        """
-        Put single value at passed column and index
-
-        .. deprecated:: 0.21.0
-
-        Please use .at[] or .iat[] accessors.
-
-        Parameters
-        ----------
-        index : row label
-        col : column label
-        value : scalar value
-        takeable : interpret the index/col as indexers, default False
-
-        Notes
-        -----
-        This method *always* returns a new object. It is currently not
-        particularly efficient (and potentially very expensive) but is provided
-        for API compatibility with DataFrame
-
-        Returns
-        -------
-        frame : DataFrame
-        """
-        warnings.warn("set_value is deprecated and will be removed "
-                      "in a future release. Please use "
-                      ".at[] or .iat[] accessors instead", FutureWarning,
-                      stacklevel=2)
-        return self._set_value(index, col, value, takeable=takeable)
-
-    def _set_value(self, index, col, value, takeable=False):
-        dense = self.to_dense()._set_value(
-            index, col, value, takeable=takeable)
-        return dense.to_sparse(kind=self._default_kind,
-                               fill_value=self._default_fill_value)
-    _set_value.__doc__ = set_value.__doc__

def _slice(self, slobj, axis=0, kind=None):
if axis == 0:
new_index = self.index[slobj]
@@ -576,7 +539,8 @@ def _combine_frame(self, other, func, fill_value=None, level=None):

return self._constructor(data=new_data, index=new_index,
columns=new_columns,
-                                 default_fill_value=new_fill_value
+                                 default_fill_value=new_fill_value,
+                                 default_kind=self.default_kind,
).__finalize__(self)

def _combine_match_index(self, other, func, level=None):
@@ -605,7 +569,8 @@ def _combine_match_index(self, other, func, level=None):

return self._constructor(
new_data, index=new_index, columns=self.columns,
-            default_fill_value=fill_value).__finalize__(self)
+            default_fill_value=fill_value,
+            default_kind=self.default_kind).__finalize__(self)

def _combine_match_columns(self, other, func, level=None, try_cast=True):
# patched version of DataFrame._combine_match_columns to account for
@@ -629,7 +594,8 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True):

return self._constructor(
new_data, index=self.index, columns=union,
-            default_fill_value=self.default_fill_value).__finalize__(self)
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)

def _combine_const(self, other, func, errors='raise', try_cast=True):
return self._apply_columns(lambda x: func(x, other))
@@ -673,7 +639,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,

return self._constructor(
new_series, index=index, columns=self.columns,
-            default_fill_value=self._default_fill_value).__finalize__(self)
+            default_fill_value=self._default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)

def _reindex_columns(self, columns, method, copy, level, fill_value=None,
limit=None, takeable=False):
@@ -693,7 +660,8 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None,
sdict = {k: v for k, v in compat.iteritems(self) if k in columns}
return self._constructor(
sdict, index=self.index, columns=columns,
-            default_fill_value=self._default_fill_value).__finalize__(self)
+            default_fill_value=self._default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)

def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
limit=None, copy=False, allow_dups=False):
@@ -725,8 +693,10 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
else:
new_arrays[col] = self[col]

-        return self._constructor(new_arrays, index=index,
-                                 columns=columns).__finalize__(self)
+        return self._constructor(
+            new_arrays, index=index, columns=columns,
+            default_fill_value=self.default_fill_value,
+            default_kind=self.default_kind).__finalize__(self)

def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
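The constructor calls in this file now thread ``default_kind`` (and, where it was missing, ``default_fill_value``) through to the result, so derived frames keep the parent's sparse representation. A quick check of that intent (hypothetical):

```python
import pandas as pd

sdf = pd.SparseDataFrame({'a': [1.0, 0.0]},
                         default_fill_value=0.0,
                         default_kind='integer')

# Rebuilt frames should preserve both defaults.
out = sdf.astype(float)
assert out.default_kind == sdf.default_kind
assert out.default_fill_value == sdf.default_fill_value
```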