Skip to content

Commit c86a189

Browse files
committed
ENH: Allow SparseDataFrame/SparseSeries values assignment
1 parent a7d61be commit c86a189

File tree

8 files changed

+174
-84
lines changed

8 files changed

+174
-84
lines changed

pandas/core/frame.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -752,9 +752,9 @@ def iterrows(self):
752752
iteritems : Iterate over (column name, Series) pairs.
753753
754754
"""
755-
iloc = self.iloc
755+
row_at = self.iloc.__getitem__
756756
for i, k in enumerate(self.index):
757-
yield k, iloc[i]
757+
yield k, row_at(i)
758758

759759
def itertuples(self, index=True, name="Pandas"):
760760
"""
@@ -2068,9 +2068,7 @@ def set_value(self, index, col, value, takeable=False):
20682068
20692069
Returns
20702070
-------
2071-
frame : DataFrame
2072-
If label pair is contained, will be reference to calling DataFrame,
2073-
otherwise a new object
2071+
self : DataFrame
20742072
"""
20752073
warnings.warn("set_value is deprecated and will be removed "
20762074
"in a future release. Please use "

pandas/core/internals.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,9 @@ def _is_empty_indexer(indexer):
913913
if _is_empty_indexer(indexer):
914914
pass
915915

916+
elif is_sparse(values):
917+
values = values.set_values(indexer, value)
918+
916919
# setting a single element for each dim and with a rhs that could
917920
# be say a list
918921
# GH 6043
@@ -1795,10 +1798,15 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
17951798
new_values = self.values if inplace else self.copy().values
17961799
new_values, _, new, _ = self._try_coerce_args(new_values, new)
17971800

1801+
if is_sparse(new_values):
1802+
indexer = mask.to_dense().values.ravel().nonzero()[0]
1803+
block = self.setitem(indexer, new)
1804+
return [block]
1805+
17981806
if isinstance(new, np.ndarray) and len(new) == len(mask):
17991807
new = new[mask]
18001808

1801-
mask = _safe_reshape(mask, new_values.shape)
1809+
mask = _safe_reshape(np.asarray(mask), new_values.shape)
18021810

18031811
new_values[mask] = new
18041812
new_values = self._try_coerce_result(new_values)
@@ -2947,6 +2955,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
29472955
return self.make_block_same_class(values=values,
29482956
placement=self.mgr_locs)
29492957

2958+
def _can_hold_element(self, element):
2959+
return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
2960+
2961+
def _try_coerce_result(self, result):
2962+
if (isinstance(result, np.ndarray) and
2963+
np.ndim(result) > 0
2964+
and not is_sparse(result)):
2965+
result = SparseArray(result, kind=self.kind,
2966+
fill_value=self.fill_value, dtype=self.dtype)
2967+
return result
2968+
29502969
def __len__(self):
29512970
try:
29522971
return self.sp_index.length

pandas/core/series.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -965,9 +965,7 @@ def set_value(self, label, value, takeable=False):
965965
966966
Returns
967967
-------
968-
series : Series
969-
If label is contained, will be reference to calling Series,
970-
otherwise a new object
968+
self : Series
971969
"""
972970
warnings.warn("set_value is deprecated and will be removed "
973971
"in a future release. Please use "

pandas/core/sparse/array.py

+48
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import pandas.core.algorithms as algos
3838
import pandas.core.ops as ops
3939
import pandas.io.formats.printing as printing
40+
from pandas.errors import PerformanceWarning
4041
from pandas.util._decorators import Appender
4142
from pandas.core.indexes.base import _index_shared_docs
4243

@@ -373,6 +374,53 @@ def get_values(self, fill=None):
373374
""" return a dense representation """
374375
return self.to_dense(fill=fill)
375376

377+
def set_values(self, indexer, value):
378+
"""
379+
Return new SparseArray with indexed values set to `value`.
380+
381+
Returns
382+
-------
383+
SparseArray
384+
A new sparse array with indexer positions filled with value.
385+
"""
386+
# If indexer is not a single int position, easiest to handle via dense
387+
if not is_scalar(indexer):
388+
warnings.warn(
389+
'Setting SparseSeries/Array values is particularly '
390+
'inefficient when indexing with multiple keys because the '
391+
'whole series is made dense interim.',
392+
PerformanceWarning, stacklevel=2)
393+
394+
values = self.to_dense()
395+
values[indexer] = value
396+
return SparseArray(values, kind=self.kind,
397+
fill_value=self.fill_value)
398+
399+
warnings.warn(
400+
'Setting SparseSeries/Array values is inefficient '
401+
'(a copy of data is made).', PerformanceWarning, stacklevel=2)
402+
403+
# If label already in sparse index, just switch the value on a copy
404+
idx = self.sp_index.lookup(indexer)
405+
if idx != -1:
406+
obj = self.copy()
407+
obj.sp_values[idx] = value
408+
return obj
409+
410+
# Otherwise, construct a new array, and insert the new value in the
411+
# correct position
412+
indices = self.sp_index.to_int_index().indices
413+
pos = np.searchsorted(indices, indexer)
414+
415+
indices = np.insert(indices, pos, indexer)
416+
sp_values = np.insert(self.sp_values, pos, value)
417+
# Length can be increased when adding a new value into index
418+
length = max(self.sp_index.length, indexer + 1)
419+
sp_index = _make_index(length, indices, self.kind)
420+
421+
return SparseArray(sp_values, sparse_index=sp_index,
422+
fill_value=self.fill_value)
423+
376424
def to_dense(self, fill=None):
377425
"""
378426
Convert SparseArray to a NumPy array.

pandas/core/sparse/frame.py

+2-40
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,8 @@ def _apply_columns(self, func):
326326
default_fill_value=self.default_fill_value,
327327
default_kind=self.default_kind).__finalize__(self)
328328

329-
def astype(self, dtype):
330-
return self._apply_columns(lambda x: x.astype(dtype))
329+
def astype(self, dtype, **kwargs):
330+
return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
331331

332332
def copy(self, deep=True):
333333
"""
@@ -469,44 +469,6 @@ def _get_value(self, index, col, takeable=False):
469469
return series._get_value(index, takeable=takeable)
470470
_get_value.__doc__ = get_value.__doc__
471471

472-
def set_value(self, index, col, value, takeable=False):
473-
"""
474-
Put single value at passed column and index
475-
476-
.. deprecated:: 0.21.0
477-
478-
Please use .at[] or .iat[] accessors.
479-
480-
Parameters
481-
----------
482-
index : row label
483-
col : column label
484-
value : scalar value
485-
takeable : interpret the index/col as indexers, default False
486-
487-
Notes
488-
-----
489-
This method *always* returns a new object. It is currently not
490-
particularly efficient (and potentially very expensive) but is provided
491-
for API compatibility with DataFrame
492-
493-
Returns
494-
-------
495-
frame : DataFrame
496-
"""
497-
warnings.warn("set_value is deprecated and will be removed "
498-
"in a future release. Please use "
499-
".at[] or .iat[] accessors instead", FutureWarning,
500-
stacklevel=2)
501-
return self._set_value(index, col, value, takeable=takeable)
502-
503-
def _set_value(self, index, col, value, takeable=False):
504-
dense = self.to_dense()._set_value(
505-
index, col, value, takeable=takeable)
506-
return dense.to_sparse(kind=self._default_kind,
507-
fill_value=self._default_fill_value)
508-
_set_value.__doc__ = set_value.__doc__
509-
510472
def _slice(self, slobj, axis=0, kind=None):
511473
if axis == 0:
512474
new_index = self.index[slobj]

pandas/core/sparse/series.py

+20-32
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import warnings
1010

1111
from pandas.core.dtypes.missing import isna, notna
12+
from pandas.core.dtypes.common import is_sparse
1213

1314
from pandas.compat.numpy import function as nv
1415
from pandas.core.index import Index, _ensure_index, InvalidIndexError
@@ -17,7 +18,6 @@
1718
from pandas.core import generic
1819
import pandas.core.common as com
1920
import pandas.core.ops as ops
20-
import pandas._libs.index as libindex
2121
from pandas.util._decorators import Appender
2222

2323
from pandas.core.sparse.array import (
@@ -279,8 +279,13 @@ def __array_wrap__(self, result, context=None):
279279
else:
280280
fill_value = self.fill_value
281281

282+
# Assume: If result size matches, old sparse index is valid (ok???)
283+
if np.size(result) == self.sp_index.npoints:
284+
sp_index = self.sp_index
285+
else:
286+
sp_index = None
282287
return self._constructor(result, index=self.index,
283-
sparse_index=self.sp_index,
288+
sparse_index=sp_index,
284289
fill_value=fill_value,
285290
copy=False).__finalize__(self)
286291

@@ -481,7 +486,7 @@ def set_value(self, label, value, takeable=False):
481486
482487
Returns
483488
-------
484-
series : SparseSeries
489+
self : SparseSeries
485490
"""
486491
warnings.warn("set_value is deprecated and will be removed "
487492
"in a future release. Please use "
@@ -490,35 +495,16 @@ def set_value(self, label, value, takeable=False):
490495
return self._set_value(label, value, takeable=takeable)
491496

492497
def _set_value(self, label, value, takeable=False):
493-
values = self.to_dense()
494-
495-
# if the label doesn't exist, we will create a new object here
496-
# and possibly change the index
497-
new_values = values._set_value(label, value, takeable=takeable)
498-
if new_values is not None:
499-
values = new_values
500-
new_index = values.index
501-
values = SparseArray(values, fill_value=self.fill_value,
502-
kind=self.kind)
503-
self._data = SingleBlockManager(values, new_index)
504-
self._index = new_index
498+
self._data = self._data.copy()
499+
try:
500+
idx = self.index.get_loc(label)
501+
except KeyError:
502+
idx = len(self)
503+
self._data.axes[0] = self._data.index.append(Index([label]))
504+
self._data = self._data.setitem(indexer=idx, value=value)
505+
return self
505506
_set_value.__doc__ = set_value.__doc__
506507

507-
def _set_values(self, key, value):
508-
509-
# this might be inefficient as we have to recreate the sparse array
510-
# rather than setting individual elements, but have to convert
511-
# the passed slice/boolean that's in dense space into a sparse indexer
512-
# not sure how to do that!
513-
if isinstance(key, Series):
514-
key = key.values
515-
516-
values = self.values.to_dense()
517-
values[key] = libindex.convert_scalar(values, value)
518-
values = SparseArray(values, fill_value=self.fill_value,
519-
kind=self.kind)
520-
self._data = SingleBlockManager(values, self.index)
521-
522508
def to_dense(self, sparse_only=False):
523509
"""
524510
Convert SparseSeries to a Series.
@@ -544,8 +530,10 @@ def to_dense(self, sparse_only=False):
544530
index = self.index.take(int_index.indices)
545531
return Series(self.sp_values, index=index, name=self.name)
546532
else:
547-
return Series(self.values.to_dense(), index=self.index,
548-
name=self.name)
533+
values = self.values
534+
if is_sparse(values):
535+
values = values.to_dense()
536+
return Series(values, index=self.index, name=self.name)
549537

550538
@property
551539
def density(self):

pandas/tests/sparse/frame/test_frame.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from pandas import Series, DataFrame, bdate_range, Panel
1212
from pandas.core.indexes.datetimes import DatetimeIndex
13+
from pandas.errors import PerformanceWarning
1314
from pandas.tseries.offsets import BDay
1415
from pandas.util import testing as tm
1516
from pandas.compat import lrange
@@ -459,7 +460,6 @@ def test_set_value(self):
459460
with tm.assert_produces_warning(FutureWarning,
460461
check_stacklevel=False):
461462
res = self.frame.set_value('foobar', 'B', 1.5)
462-
assert res is not self.frame
463463
assert res.index[-1] == 'foobar'
464464
with tm.assert_produces_warning(FutureWarning,
465465
check_stacklevel=False):
@@ -468,9 +468,8 @@ def test_set_value(self):
468468
with tm.assert_produces_warning(FutureWarning,
469469
check_stacklevel=False):
470470
res2 = res.set_value('foobar', 'qux', 1.5)
471-
assert res2 is not res
472471
tm.assert_index_equal(res2.columns,
473-
pd.Index(list(self.frame.columns) + ['qux']))
472+
pd.Index(list(self.frame.columns)))
474473
with tm.assert_produces_warning(FutureWarning,
475474
check_stacklevel=False):
476475
assert res2.get_value('foobar', 'qux') == 1.5
@@ -1268,3 +1267,54 @@ def test_assign_with_sparse_frame(self):
12681267

12691268
for column in res.columns:
12701269
assert type(res[column]) is SparseSeries
1270+
1271+
1272+
def _test_assignment(kind, indexer, key=None):
1273+
arr = np.array([[1, nan],
1274+
[nan, 1]])
1275+
df = DataFrame(arr, copy=True)
1276+
sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1277+
1278+
def get_indexer(df):
1279+
return getattr(df, indexer) if indexer else df
1280+
1281+
if key is None:
1282+
key = pd.isnull(sdf).to_sparse()
1283+
1284+
get_indexer(sdf)[key] = 2
1285+
1286+
get_indexer(df)[key] = 2
1287+
res = df.to_sparse(kind=kind)
1288+
1289+
tm.assert_sp_frame_equal(sdf, res)
1290+
1291+
1292+
@pytest.fixture(params=['integer', 'block'])
1293+
def spindex_kind(request):
1294+
return request.param
1295+
1296+
1297+
@pytest.mark.parametrize('indexer', ['iat'])
1298+
@pytest.mark.parametrize('key', [(0, 0)])
1299+
def test_frame_assignment_at(spindex_kind, indexer, key):
1300+
_test_assignment(spindex_kind, indexer, key)
1301+
1302+
1303+
@pytest.mark.parametrize('indexer', ['at', 'loc', 'iloc'])
1304+
@pytest.mark.parametrize('key', [0,
1305+
[0, 1],
1306+
[True, False]])
1307+
def test_frame_assignment_loc(spindex_kind, indexer, key):
1308+
_test_assignment(spindex_kind, indexer, key)
1309+
1310+
1311+
@pytest.mark.parametrize('key', [None,
1312+
[True, False]])
1313+
def test_frame_assignment_setitem(spindex_kind, key):
1314+
_test_assignment(spindex_kind, None, key)
1315+
1316+
1317+
@pytest.mark.parametrize('indexer', ['loc', 'at'])
1318+
@pytest.mark.parametrize('key', [3])
1319+
def test_frame_assignment_extend_index(spindex_kind, indexer, key):
1320+
_test_assignment(spindex_kind, indexer, key)

0 commit comments

Comments
 (0)