Skip to content

Commit 291a5ff

Browse files
committed
ENH: Allow SparseDataFrame/SparseSeries values assignment
Also fix .where for sparse blocks. The discrepancy comes from the shape of the block values: `dense_frame._data.blocks[0].values` is 2D even for a 1D block, whereas `sparse_frame._data.blocks[0].values` is always 1D. I'm sure this had worked before and was unneeded in Oct 2017.
1 parent 3092629 commit 291a5ff

File tree

11 files changed

+202
-95
lines changed

11 files changed

+202
-95
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ Other Enhancements
8080
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
8181
(:issue:`21627`)
8282
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
83+
- :class:`SparseDataFrame` and :class:`SparseSeries` support value assignment (:issue:`21818`)
8384
- :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
8485
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
8586
- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)

pandas/core/frame.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -2641,9 +2641,7 @@ def set_value(self, index, col, value, takeable=False):
26412641
26422642
Returns
26432643
-------
2644-
frame : DataFrame
2645-
If label pair is contained, will be reference to calling DataFrame,
2646-
otherwise a new object
2644+
self : DataFrame
26472645
"""
26482646
warnings.warn("set_value is deprecated and will be removed "
26492647
"in a future release. Please use "

pandas/core/internals.py

+24
Original file line numberDiff line numberDiff line change
@@ -924,6 +924,9 @@ def _is_empty_indexer(indexer):
924924
if _is_empty_indexer(indexer):
925925
pass
926926

927+
elif is_sparse(values):
928+
values = values.set_values(indexer, value)
929+
927930
# setting a single element for each dim and with a rhs that could
928931
# be say a list
929932
# GH 6043
@@ -1497,6 +1500,11 @@ def where(self, other, cond, align=True, errors='raise',
14971500
raise ValueError("where must have a condition that is ndarray "
14981501
"like")
14991502

1503+
# For SparseBlock, self.values is always 1D. If cond was a frame,
1504+
# its 2D values would incorrectly broadcast later on.
1505+
if values.ndim == 1 and any(ax == 1 for ax in cond.shape):
1506+
cond = cond.ravel()
1507+
15001508
# our where function
15011509
def func(cond, values, other):
15021510
if cond.ravel().all():
@@ -1808,6 +1816,11 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
18081816
new_values = self.values if inplace else self.copy().values
18091817
new_values, _, new, _ = self._try_coerce_args(new_values, new)
18101818

1819+
if is_sparse(new_values):
1820+
indexer = mask.to_dense().values.ravel().nonzero()[0]
1821+
block = self.setitem(indexer, new)
1822+
return [block]
1823+
18111824
if isinstance(new, np.ndarray) and len(new) == len(mask):
18121825
new = new[mask]
18131826

@@ -3060,6 +3073,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
30603073
return self.make_block_same_class(values=values,
30613074
placement=self.mgr_locs)
30623075

3076+
def _can_hold_element(self, element):
3077+
return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
3078+
3079+
def _try_coerce_result(self, result):
3080+
if (isinstance(result, np.ndarray) and
3081+
np.ndim(result) == 1 and
3082+
not is_sparse(result)):
3083+
result = SparseArray(result, kind=self.kind,
3084+
fill_value=self.fill_value)
3085+
return result
3086+
30633087
def __len__(self):
30643088
try:
30653089
return self.sp_index.length

pandas/core/series.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -1067,9 +1067,7 @@ def set_value(self, label, value, takeable=False):
10671067
10681068
Returns
10691069
-------
1070-
series : Series
1071-
If label is contained, will be reference to calling Series,
1072-
otherwise a new object
1070+
self : Series
10731071
"""
10741072
warnings.warn("set_value is deprecated and will be removed "
10751073
"in a future release. Please use "

pandas/core/sparse/array.py

+52
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import pandas.core.algorithms as algos
3838
import pandas.core.ops as ops
3939
import pandas.io.formats.printing as printing
40+
from pandas.errors import PerformanceWarning
4041
from pandas.util._decorators import Appender
4142
from pandas.core.indexes.base import _index_shared_docs
4243

@@ -369,6 +370,53 @@ def get_values(self, fill=None):
369370
""" return a dense representation """
370371
return self.to_dense(fill=fill)
371372

373+
def set_values(self, indexer, value):
374+
"""
375+
Return new SparseArray with indexed values set to `value`.
376+
377+
Returns
378+
-------
379+
SparseArray
380+
A new sparse array with indexer positions filled with value.
381+
"""
382+
# If indexer is not a single int position, easiest to handle via dense
383+
if not is_scalar(indexer):
384+
warnings.warn(
385+
'Setting SparseSeries/Array values is particularly '
386+
'inefficient when indexing with multiple keys because the '
387+
'whole series is made dense interim.',
388+
PerformanceWarning, stacklevel=2)
389+
390+
values = self.to_dense()
391+
values[indexer] = value
392+
return SparseArray(values, kind=self.kind,
393+
fill_value=self.fill_value)
394+
395+
warnings.warn(
396+
'Setting SparseSeries/Array values is inefficient '
397+
'(a copy of data is made).', PerformanceWarning, stacklevel=2)
398+
399+
# If label already in sparse index, just switch the value on a copy
400+
idx = self.sp_index.lookup(indexer)
401+
if idx != -1:
402+
obj = self.copy()
403+
obj.sp_values[idx] = value
404+
return obj
405+
406+
# Otherwise, construct a new array, and insert the new value in the
407+
# correct position
408+
indices = self.sp_index.to_int_index().indices
409+
pos = np.searchsorted(indices, indexer)
410+
411+
indices = np.insert(indices, pos, indexer)
412+
sp_values = np.insert(self.sp_values, pos, value)
413+
# Length can be increased when adding a new value into index
414+
length = max(self.sp_index.length, indexer + 1)
415+
sp_index = _make_index(length, indices, self.kind)
416+
417+
return SparseArray(sp_values, sparse_index=sp_index,
418+
fill_value=self.fill_value)
419+
372420
def to_dense(self, fill=None):
373421
"""
374422
Convert SparseArray to a NumPy array.
@@ -544,6 +592,10 @@ def astype(self, dtype=None, copy=True):
544592
return self._simple_new(sp_values, self.sp_index,
545593
fill_value=fill_value)
546594

595+
def tolist(self):
596+
"""Return *dense* self as list"""
597+
return self.values.tolist()
598+
547599
def copy(self, deep=True):
548600
"""
549601
Make a copy of the SparseArray. Only the actual sparse values need to

pandas/core/sparse/frame.py

+2-40
Original file line numberDiff line numberDiff line change
@@ -332,8 +332,8 @@ def _apply_columns(self, func):
332332
default_fill_value=self.default_fill_value,
333333
default_kind=self.default_kind).__finalize__(self)
334334

335-
def astype(self, dtype):
336-
return self._apply_columns(lambda x: x.astype(dtype))
335+
def astype(self, dtype, **kwargs):
336+
return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
337337

338338
def copy(self, deep=True):
339339
"""
@@ -464,44 +464,6 @@ def _get_value(self, index, col, takeable=False):
464464
return series._get_value(index, takeable=takeable)
465465
_get_value.__doc__ = get_value.__doc__
466466

467-
def set_value(self, index, col, value, takeable=False):
468-
"""
469-
Put single value at passed column and index
470-
471-
.. deprecated:: 0.21.0
472-
473-
Please use .at[] or .iat[] accessors.
474-
475-
Parameters
476-
----------
477-
index : row label
478-
col : column label
479-
value : scalar value
480-
takeable : interpret the index/col as indexers, default False
481-
482-
Notes
483-
-----
484-
This method *always* returns a new object. It is currently not
485-
particularly efficient (and potentially very expensive) but is provided
486-
for API compatibility with DataFrame
487-
488-
Returns
489-
-------
490-
frame : DataFrame
491-
"""
492-
warnings.warn("set_value is deprecated and will be removed "
493-
"in a future release. Please use "
494-
".at[] or .iat[] accessors instead", FutureWarning,
495-
stacklevel=2)
496-
return self._set_value(index, col, value, takeable=takeable)
497-
498-
def _set_value(self, index, col, value, takeable=False):
499-
dense = self.to_dense()._set_value(
500-
index, col, value, takeable=takeable)
501-
return dense.to_sparse(kind=self._default_kind,
502-
fill_value=self._default_fill_value)
503-
_set_value.__doc__ = set_value.__doc__
504-
505467
def _slice(self, slobj, axis=0, kind=None):
506468
if axis == 0:
507469
new_index = self.index[slobj]

pandas/core/sparse/series.py

+15-30
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from pandas.core import generic
1818
import pandas.core.common as com
1919
import pandas.core.ops as ops
20-
import pandas._libs.index as libindex
2120
from pandas.util._decorators import Appender
2221

2322
from pandas.core.sparse.array import (
@@ -277,8 +276,13 @@ def __array_wrap__(self, result, context=None):
277276
else:
278277
fill_value = self.fill_value
279278

279+
# Assumption (unverified): if the result size matches the number of stored sparse points, the old sparse index is still valid
280+
if np.size(result) == self.sp_index.npoints:
281+
sp_index = self.sp_index
282+
else:
283+
sp_index = None
280284
return self._constructor(result, index=self.index,
281-
sparse_index=self.sp_index,
285+
sparse_index=sp_index,
282286
fill_value=fill_value,
283287
copy=False).__finalize__(self)
284288

@@ -479,7 +483,7 @@ def set_value(self, label, value, takeable=False):
479483
480484
Returns
481485
-------
482-
series : SparseSeries
486+
self : SparseSeries
483487
"""
484488
warnings.warn("set_value is deprecated and will be removed "
485489
"in a future release. Please use "
@@ -488,35 +492,16 @@ def set_value(self, label, value, takeable=False):
488492
return self._set_value(label, value, takeable=takeable)
489493

490494
def _set_value(self, label, value, takeable=False):
491-
values = self.to_dense()
492-
493-
# if the label doesn't exist, we will create a new object here
494-
# and possibly change the index
495-
new_values = values._set_value(label, value, takeable=takeable)
496-
if new_values is not None:
497-
values = new_values
498-
new_index = values.index
499-
values = SparseArray(values, fill_value=self.fill_value,
500-
kind=self.kind)
501-
self._data = SingleBlockManager(values, new_index)
502-
self._index = new_index
495+
self._data = self._data.copy()
496+
try:
497+
idx = self.index.get_loc(label)
498+
except KeyError:
499+
idx = len(self)
500+
self._data.axes[0] = self._data.index.append(Index([label]))
501+
self._data = self._data.setitem(indexer=idx, value=value)
502+
return self
503503
_set_value.__doc__ = set_value.__doc__
504504

505-
def _set_values(self, key, value):
506-
507-
# this might be inefficient as we have to recreate the sparse array
508-
# rather than setting individual elements, but have to convert
509-
# the passed slice/boolean that's in dense space into a sparse indexer
510-
# not sure how to do that!
511-
if isinstance(key, Series):
512-
key = key.values
513-
514-
values = self.values.to_dense()
515-
values[key] = libindex.convert_scalar(values, value)
516-
values = SparseArray(values, fill_value=self.fill_value,
517-
kind=self.kind)
518-
self._data = SingleBlockManager(values, self.index)
519-
520505
def to_dense(self, sparse_only=False):
521506
"""
522507
Convert SparseSeries to a Series.

0 commit comments

Comments
 (0)