From b4889c85ef0dbef8ae91edd09aee4d460615820b Mon Sep 17 00:00:00 2001 From: Kernc Date: Thu, 15 Jun 2017 17:46:23 +0200 Subject: [PATCH 1/6] PERF: frame: avoid unnecessary .values calls Besides hstacking cols (data copy), this densified SparseDataFrame. --- pandas/core/frame.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 700562386c838..bf77747ad4efe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -785,11 +785,9 @@ def iterrows(self): iteritems : Iterate over (column name, Series) pairs. """ - columns = self.columns - klass = self._constructor_sliced - for k, v in zip(self.index, self.values): - s = klass(v, index=columns, name=k) - yield k, s + iloc = self.iloc + for i, k in enumerate(self.index): + yield k, iloc[i] def itertuples(self, index=True, name="Pandas"): """ @@ -2765,7 +2763,7 @@ def _getitem_multilevel(self, key): return self._get_item_cache(key) def _getitem_frame(self, key): - if key.values.size and not is_bool_dtype(key.values): + if key.size and not key.dtypes.map(is_bool_dtype).all(): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) @@ -3153,7 +3151,7 @@ def _setitem_frame(self, key, value): ) key = self._constructor(key, **self._construct_axes_dict()) - if key.values.size and not is_bool_dtype(key.values): + if key.size and not key.dtypes.map(is_bool_dtype).all(): raise TypeError( 'Must pass DataFrame or 2-d ndarray with boolean values only' ) From 22b034638a355537afd6c6d566c0a6487a2873d2 Mon Sep 17 00:00:00 2001 From: Kernc Date: Tue, 3 Oct 2017 03:37:58 +0200 Subject: [PATCH 2/6] SparseDataFrame: inherit default_kind and default_fill_value --- pandas/core/sparse/frame.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 58e3001bcfe6a..c27caf7d31f74 100644 --- a/pandas/core/sparse/frame.py +++ 
b/pandas/core/sparse/frame.py @@ -330,7 +330,8 @@ def _apply_columns(self, func): return self._constructor( data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + default_fill_value=self.default_fill_value, + default_kind=self.default_kind).__finalize__(self) def astype(self, dtype): return self._apply_columns(lambda x: x.astype(dtype)) @@ -576,7 +577,8 @@ def _combine_frame(self, other, func, fill_value=None, level=None): return self._constructor(data=new_data, index=new_index, columns=new_columns, - default_fill_value=new_fill_value + default_fill_value=new_fill_value, + default_kind=self.default_kind, ).__finalize__(self) def _combine_match_index(self, other, func, level=None): @@ -605,7 +607,8 @@ def _combine_match_index(self, other, func, level=None): return self._constructor( new_data, index=new_index, columns=self.columns, - default_fill_value=fill_value).__finalize__(self) + default_fill_value=fill_value, + default_kind=self.default_kind).__finalize__(self) def _combine_match_columns(self, other, func, level=None, try_cast=True): # patched version of DataFrame._combine_match_columns to account for @@ -629,7 +632,8 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): return self._constructor( new_data, index=self.index, columns=union, - default_fill_value=self.default_fill_value).__finalize__(self) + default_fill_value=self.default_fill_value, + default_kind=self.default_kind).__finalize__(self) def _combine_const(self, other, func, errors='raise', try_cast=True): return self._apply_columns(lambda x: func(x, other)) @@ -673,7 +677,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return self._constructor( new_series, index=index, columns=self.columns, - default_fill_value=self._default_fill_value).__finalize__(self) + default_fill_value=self._default_fill_value, + default_kind=self.default_kind).__finalize__(self) def _reindex_columns(self, 
columns, method, copy, level, fill_value=None, limit=None, takeable=False): @@ -693,7 +698,8 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, sdict = {k: v for k, v in compat.iteritems(self) if k in columns} return self._constructor( sdict, index=self.index, columns=columns, - default_fill_value=self._default_fill_value).__finalize__(self) + default_fill_value=self._default_fill_value, + default_kind=self.default_kind).__finalize__(self) def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None, copy=False, allow_dups=False): @@ -725,8 +731,10 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, else: new_arrays[col] = self[col] - return self._constructor(new_arrays, index=index, - columns=columns).__finalize__(self) + return self._constructor( + new_arrays, index=index, columns=columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind).__finalize__(self) def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): From 8748339c896b771d92fd03dba076122da88b2100 Mon Sep 17 00:00:00 2001 From: Kernc Date: Thu, 15 Jun 2017 17:49:23 +0200 Subject: [PATCH 3/6] ENH: Allow SparseDataFrame/SparseSeries values assignment Also fix .where for sparse blocks. Discrepancy comes from: dense_frame._data.blocks[0].values # this is 2D even for 1D block sparse_frame._data.blocks[0].values # this is always 1D I'm sure this had worked before and was unneeded in Oct 2017. 
--- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/frame.py | 4 +- pandas/core/internals/blocks.py | 24 +++++++ pandas/core/series.py | 4 +- pandas/core/sparse/array.py | 52 +++++++++++++++ pandas/core/sparse/frame.py | 42 +------------ pandas/core/sparse/series.py | 45 +++++-------- pandas/tests/sparse/frame/test_frame.py | 77 +++++++++++++++++++---- pandas/tests/sparse/series/test_series.py | 33 +++++++++- pandas/tests/sparse/test_format.py | 9 +-- pandas/util/testing.py | 6 +- 11 files changed, 202 insertions(+), 95 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9b70bda82e247..fa20a110133ce 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -175,6 +175,7 @@ Other Enhancements (:issue:`21627`) - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) - :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :class:`SparseDataFrame` and :class:`SparseSeries` support value assignment (:issue:`21818`) - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) - :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf77747ad4efe..47f07c01b7785 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2548,9 +2548,7 @@ def set_value(self, index, col, value, takeable=False): Returns ------- - frame : DataFrame - If label pair is contained, will be reference to calling DataFrame, - otherwise a new object + self : DataFrame """ warnings.warn("set_value is deprecated and will be removed " "in a future release. Please use " diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0635014b166b..42ef9b707b948 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -921,6 +921,9 @@ def _is_empty_indexer(indexer): if _is_empty_indexer(indexer): pass + elif is_sparse(values): + values = values.set_values(indexer, value) + # setting a single element for each dim and with a rhs that could # be say a list # GH 6043 @@ -1494,6 +1497,11 @@ def where(self, other, cond, align=True, errors='raise', raise ValueError("where must have a condition that is ndarray " "like") + # For SparseBlock, self.values is always 1D. If cond was a frame, + # it's 2D values would incorrectly broadcast later on. 
+ if values.ndim == 1 and any(ax == 1 for ax in cond.shape): + cond = cond.ravel() + # our where function def func(cond, values, other): if cond.ravel().all(): @@ -1844,6 +1852,11 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_values = self.values if inplace else self.copy().values new_values, _, new, _ = self._try_coerce_args(new_values, new) + if is_sparse(new_values): + indexer = mask.to_dense().values.ravel().nonzero()[0] + block = self.setitem(indexer, new) + return [block] + if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -3154,6 +3167,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self.make_block_same_class(values=values, placement=self.mgr_locs) + def _can_hold_element(self, element): + return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype) + + def _try_coerce_result(self, result): + if (isinstance(result, np.ndarray) and + np.ndim(result) == 1 and + not is_sparse(result)): + result = SparseArray(result, kind=self.kind, + fill_value=self.fill_value) + return result + def __len__(self): try: return self.sp_index.length diff --git a/pandas/core/series.py b/pandas/core/series.py index 4b4fccccda4a0..9e70a476898f0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1071,9 +1071,7 @@ def set_value(self, label, value, takeable=False): Returns ------- - series : Series - If label is contained, will be reference to calling Series, - otherwise a new object + self : Series """ warnings.warn("set_value is deprecated and will be removed " "in a future release. 
Please use " diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 6f0ffbff22028..5f56bb2925dcb 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -37,6 +37,7 @@ import pandas.core.algorithms as algos import pandas.core.ops as ops import pandas.io.formats.printing as printing +from pandas.errors import PerformanceWarning from pandas.util._decorators import Appender from pandas.core.indexes.base import _index_shared_docs @@ -369,6 +370,53 @@ def get_values(self, fill=None): """ return a dense representation """ return self.to_dense(fill=fill) + def set_values(self, indexer, value): + """ + Return new SparseArray with indexed values set to `value`. + + Returns + ------- + SparseArray + A new sparse array with indexer positions filled with value. + """ + # If indexer is not a single int position, easiest to handle via dense + if not is_scalar(indexer): + warnings.warn( + 'Setting SparseSeries/Array values is particularly ' + 'inefficient when indexing with multiple keys because the ' + 'whole series is made dense interim.', + PerformanceWarning, stacklevel=2) + + values = self.to_dense() + values[indexer] = value + return SparseArray(values, kind=self.kind, + fill_value=self.fill_value) + + warnings.warn( + 'Setting SparseSeries/Array values is inefficient ' + '(a copy of data is made).', PerformanceWarning, stacklevel=2) + + # If label already in sparse index, just switch the value on a copy + idx = self.sp_index.lookup(indexer) + if idx != -1: + obj = self.copy() + obj.sp_values[idx] = value + return obj + + # Otherwise, construct a new array, and insert the new value in the + # correct position + indices = self.sp_index.to_int_index().indices + pos = np.searchsorted(indices, indexer) + + indices = np.insert(indices, pos, indexer) + sp_values = np.insert(self.sp_values, pos, value) + # Length can be increased when adding a new value into index + length = max(self.sp_index.length, indexer + 1) + sp_index = 
_make_index(length, indices, self.kind) + + return SparseArray(sp_values, sparse_index=sp_index, + fill_value=self.fill_value) + def to_dense(self, fill=None): """ Convert SparseArray to a NumPy array. @@ -544,6 +592,10 @@ def astype(self, dtype=None, copy=True): return self._simple_new(sp_values, self.sp_index, fill_value=fill_value) + def tolist(self): + """Return *dense* self as list""" + return self.values.tolist() + def copy(self, deep=True): """ Make a copy of the SparseArray. Only the actual sparse values need to diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index c27caf7d31f74..d43f1ce5ffef0 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -333,8 +333,8 @@ def _apply_columns(self, func): default_fill_value=self.default_fill_value, default_kind=self.default_kind).__finalize__(self) - def astype(self, dtype): - return self._apply_columns(lambda x: x.astype(dtype)) + def astype(self, dtype, **kwargs): + return self._apply_columns(lambda x: x.astype(dtype, **kwargs)) def copy(self, deep=True): """ @@ -465,44 +465,6 @@ def _get_value(self, index, col, takeable=False): return series._get_value(index, takeable=takeable) _get_value.__doc__ = get_value.__doc__ - def set_value(self, index, col, value, takeable=False): - """ - Put single value at passed column and index - - .. deprecated:: 0.21.0 - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - index : row label - col : column label - value : scalar value - takeable : interpret the index/col as indexers, default False - - Notes - ----- - This method *always* returns a new object. It is currently not - particularly efficient (and potentially very expensive) but is provided - for API compatibility with DataFrame - - Returns - ------- - frame : DataFrame - """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) - return self._set_value(index, col, value, takeable=takeable) - - def _set_value(self, index, col, value, takeable=False): - dense = self.to_dense()._set_value( - index, col, value, takeable=takeable) - return dense.to_sparse(kind=self._default_kind, - fill_value=self._default_fill_value) - _set_value.__doc__ = set_value.__doc__ - def _slice(self, slobj, axis=0, kind=None): if axis == 0: new_index = self.index[slobj] diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 8ac5d81f23bb2..e6c4e1278c35a 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -18,7 +18,6 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase import pandas.core.ops as ops -import pandas._libs.index as libindex from pandas.util._decorators import Appender from pandas.core.sparse.array import ( @@ -278,8 +277,13 @@ def __array_wrap__(self, result, context=None): else: fill_value = self.fill_value + # Assume: If result size matches, old sparse index is valid (ok???) + if np.size(result) == self.sp_index.npoints: + sp_index = self.sp_index + else: + sp_index = None return self._constructor(result, index=self.index, - sparse_index=self.sp_index, + sparse_index=sp_index, fill_value=fill_value, copy=False).__finalize__(self) @@ -480,7 +484,7 @@ def set_value(self, label, value, takeable=False): Returns ------- - series : SparseSeries + self : SparseSeries """ warnings.warn("set_value is deprecated and will be removed " "in a future release. 
Please use " @@ -489,35 +493,16 @@ def set_value(self, label, value, takeable=False): return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): - values = self.to_dense() - - # if the label doesn't exist, we will create a new object here - # and possibly change the index - new_values = values._set_value(label, value, takeable=takeable) - if new_values is not None: - values = new_values - new_index = values.index - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) - self._data = SingleBlockManager(values, new_index) - self._index = new_index + self._data = self._data.copy() + try: + idx = self.index.get_loc(label) + except KeyError: + idx = len(self) + self._data.axes[0] = self._data.index.append(Index([label])) + self._data = self._data.setitem(indexer=idx, value=value) + return self _set_value.__doc__ = set_value.__doc__ - def _set_values(self, key, value): - - # this might be inefficient as we have to recreate the sparse array - # rather than setting individual elements, but have to convert - # the passed slice/boolean that's in dense space into a sparse indexer - # not sure how to do that! - if isinstance(key, Series): - key = key.values - - values = self.values.to_dense() - values[key] = libindex.convert_scalar(values, value) - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) - self._data = SingleBlockManager(values, self.index) - def to_dense(self, sparse_only=False): """ Convert SparseSeries to a Series. 
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index be5a1710119ee..67f1c463e820a 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -10,6 +10,7 @@ from pandas import Series, DataFrame, bdate_range, Panel from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.errors import PerformanceWarning from pandas.tseries.offsets import BDay from pandas.util import testing as tm from pandas.compat import lrange @@ -457,11 +458,11 @@ def test_iloc(self): iframe.iloc[:, 0].sp_index) def test_set_value(self): - # ok, as the index gets converted to object frame = self.frame.copy() with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): res = frame.set_value('foobar', 'B', 1.5) assert res.index.dtype == 'object' @@ -469,22 +470,24 @@ def test_set_value(self): res.index = res.index.astype(object) with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): res = self.frame.set_value('foobar', 'B', 1.5) - assert res is not self.frame assert res.index[-1] == 'foobar' with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): assert res.get_value('foobar', 'B') == 1.5 with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): res2 = res.set_value('foobar', 'qux', 1.5) - assert res2 is not res tm.assert_index_equal(res2.columns, - pd.Index(list(self.frame.columns) + ['qux'])) + pd.Index(list(self.frame.columns))) with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): assert res2.get_value('foobar', 'qux') == 1.5 def test_fancy_index_misc(self): @@ -591,8 +594,9 @@ def test_setitem_chained_no_consolidate(self): # issuecomment-361696418 # chained setitem used to 
cause consolidation sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - with pd.option_context('mode.chained_assignment', None): - sdf[0][1] = 2 + with tm.assert_produces_warning(PerformanceWarning): + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 assert len(sdf._data.blocks) == 2 def test_delitem(self): @@ -1302,3 +1306,54 @@ def test_assign_with_sparse_frame(self): for column in res.columns: assert type(res[column]) is SparseSeries + + +def _test_assignment(kind, indexer, key=None): + arr = np.array([[1, nan], + [nan, 1]]) + df = DataFrame(arr, copy=True) + sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind) + + def get_indexer(df): + return getattr(df, indexer) if indexer else df + + if key is None: + key = pd.isnull(sdf).to_sparse() + + get_indexer(sdf)[key] = 2 + + get_indexer(df)[key] = 2 + res = df.to_sparse(kind=kind) + + tm.assert_sp_frame_equal(sdf, res) + + +@pytest.fixture(params=['integer', 'block']) +def spindex_kind(request): + return request.param + + +@pytest.mark.parametrize('indexer', ['iat']) +@pytest.mark.parametrize('key', [(0, 0)]) +def test_frame_assignment_at(spindex_kind, indexer, key): + _test_assignment(spindex_kind, indexer, key) + + +@pytest.mark.parametrize('indexer', ['at', 'loc', 'iloc']) +@pytest.mark.parametrize('key', [0, + [0, 1], + [True, False]]) +def test_frame_assignment_loc(spindex_kind, indexer, key): + _test_assignment(spindex_kind, indexer, key) + + +@pytest.mark.parametrize('key', [None, + [True, False]]) +def test_frame_assignment_setitem(spindex_kind, key): + _test_assignment(spindex_kind, None, key) + + +@pytest.mark.parametrize('indexer', ['loc', 'at']) +@pytest.mark.parametrize('key', [3]) +def test_frame_assignment_extend_index(spindex_kind, indexer, key): + _test_assignment(spindex_kind, indexer, key) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 921c30234660f..9d21ab65bbccd 100644 --- 
a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -484,15 +484,16 @@ def test_get_get_value(self): self.bseries.get_value(10), self.bseries[10]) def test_set_value(self): - idx = self.btseries.index[7] with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): self.btseries.set_value(idx, 0) assert self.btseries[idx] == 0 with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + ignore_extra=True): self.iseries.set_value('foobar', 0) assert self.iseries.index[-1] == 'foobar' assert self.iseries['foobar'] == 0 @@ -1456,3 +1457,29 @@ def test_constructor_dict_datetime64_index(datetime_type): expected = SparseSeries(values, map(pd.Timestamp, dates)) tm.assert_sp_series_equal(result, expected) + + +@pytest.mark.parametrize('kind', ['integer', 'block']) +@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat']) +@pytest.mark.parametrize('key', [0, [0, 1], 2, [2, 3], + np.r_[True, False, False, False], + np.r_[False, False, False, True]]) +def test_series_assignment(kind, indexer, key): + is_multikey = np.asarray(key).ndim > 0 + skip_multikey = 'at' in (indexer or '') + if is_multikey and skip_multikey: + return + + arr = np.array([0., 0., nan, nan]) + ss = SparseSeries(arr, kind=kind) + assert len(ss.sp_index.to_int_index().indices) == 2 + + res = arr.copy() + res[key] = 1 + res = SparseSeries(res, kind=kind) + + ss_setitem = getattr(ss, indexer) if indexer else ss + + ss_setitem[key] = 1 + + tm.assert_sp_series_equal(ss, res) diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index d983bd209085a..91dc30b1cdf4e 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -8,7 +8,7 @@ from pandas.compat import (is_platform_windows, is_platform_32bit) from pandas.core.config import option_context - +from pandas.errors import PerformanceWarning 
use_32bit_repr = is_platform_windows() or is_platform_32bit() @@ -124,9 +124,10 @@ def test_sparse_repr_after_set(self): sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) res = sdf.copy() - # Ignore the warning - with pd.option_context('mode.chained_assignment', None): - sdf[0][1] = 2 # This line triggers the bug + with tm.assert_produces_warning(PerformanceWarning): + # Ignore the warning + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 # This line triggers the bug repr(sdf) tm.assert_sp_frame_equal(sdf, res) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 39ab498d080bf..ab67bc4ad7f04 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2493,7 +2493,8 @@ def exception_matches(self, exc_type, exc_value, trace_back): @contextmanager def assert_produces_warning(expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True): + clear=None, check_stacklevel=True, + ignore_extra=False): """ Context manager for running code expected to either raise a specific warning, or not raise any warnings. Verifies that the code raises the @@ -2530,6 +2531,8 @@ class for all warnings. To check that no warning is returned, If True, displays the line that called the function containing the warning to show were the function is called. Otherwise, the line that implements the function is displayed. + ignore_extra : bool, default False + If False, any extra, unexpected warnings are raised as errors. Examples -------- @@ -2596,6 +2599,7 @@ class for all warnings. To check that no warning is returned, msg = "Did not see expected warning of class {name!r}.".format( name=expected_warning.__name__) assert saw_warning, msg + if not ignore_extra: assert not extra_warnings, ("Caused unexpected warning(s): {extra!r}." ).format(extra=extra_warnings) From 35e8c614fbb8d4e9aa0c2a942fe5aa11086374be Mon Sep 17 00:00:00 2001 From: Kernc Date: Thu, 12 Jul 2018 03:56:04 +0200 Subject: [PATCH 4/6] fixup! 
ENH: Allow SparseDataFrame/SparseSeries values assignment --- pandas/core/internals/blocks.py | 67 ++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 42ef9b707b948..1404792546a46 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1497,11 +1497,6 @@ def where(self, other, cond, align=True, errors='raise', raise ValueError("where must have a condition that is ndarray " "like") - # For SparseBlock, self.values is always 1D. If cond was a frame, - # it's 2D values would incorrectly broadcast later on. - if values.ndim == 1 and any(ax == 1 for ax in cond.shape): - cond = cond.ravel() - # our where function def func(cond, values, other): if cond.ravel().all(): @@ -1852,11 +1847,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_values = self.values if inplace else self.copy().values new_values, _, new, _ = self._try_coerce_args(new_values, new) - if is_sparse(new_values): - indexer = mask.to_dense().values.ravel().nonzero()[0] - block = self.setitem(indexer, new) - return [block] - if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -3270,6 +3260,63 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) + def where(self, other, cond, align=True, errors='raise', + try_cast=False, axis=0, transpose=False, mgr=None): + """ + evaluate the block; return result block(s) from the result + + Parameters + ---------- + other : a ndarray/object + cond : the condition to respect + align : boolean, perform alignment on other/cond + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. 
On error return original object + +        axis : int +        transpose : boolean +            Set to True if self is stored with axes reversed + +        Returns +        ------- +        a new sparse block(s), the result of the func + """ +        cond = getattr(cond, 'values', cond) +        # For SparseBlock, self.values is always 1D. +        # If cond was a frame, its 2D values would incorrectly broadcast +        # later on. +        if self.values.ndim == 1 and any(ax == 1 for ax in cond.shape): +            cond = cond.ravel() + +        return super(SparseBlock, self).where( +            other, cond, align=align, errors=errors, try_cast=try_cast, +            axis=axis, transpose=transpose, mgr=mgr) + +    def putmask(self, mask, new, align=True, inplace=False, axis=0, +                transpose=False, mgr=None): +        """ +        putmask the data to the block; we must be a single block and not +        generate other blocks + +        return the resulting block + +        Parameters +        ---------- +        mask : the condition to respect +        new : a ndarray/object +        align : boolean, perform alignment on other/cond, default is True +        inplace : perform inplace modification, default is False + +        Returns +        ------- +        a new block, the result of the putmask +        """ +        _, _, new, _ = self._try_coerce_args(self.values, new) +        indexer = mask.to_dense().values.ravel().nonzero()[0] +        block = self.setitem(indexer, new) +        return [block] + # ----------------------------------------------------------------- # Constructor Helpers From 83c1f2bc2783d6d2e47b634c2150ccba2672e885 Mon Sep 17 00:00:00 2001 From: Kernc Date: Tue, 14 Aug 2018 03:13:08 +0200 Subject: [PATCH 5/6] fixup! 
ENH: Allow SparseDataFrame/SparseSeries values assignment --- pandas/core/sparse/array.py | 20 ++++++++++---------- pandas/core/sparse/series.py | 6 ++++-- pandas/tests/sparse/frame/test_frame.py | 21 +++++++-------------- pandas/tests/sparse/series/test_series.py | 6 ++---- pandas/tests/sparse/test_format.py | 8 +++----- pandas/util/testing.py | 6 +----- 6 files changed, 27 insertions(+), 40 deletions(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 5f56bb2925dcb..8fba29034aeb3 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -382,9 +382,9 @@ def set_values(self, indexer, value): # If indexer is not a single int position, easiest to handle via dense if not is_scalar(indexer): warnings.warn( - 'Setting SparseSeries/Array values is particularly ' - 'inefficient when indexing with multiple keys because the ' - 'whole series is made dense interim.', + 'Setting SparseSeries/Array values is inefficient when ' + 'indexing with multiple keys because the whole series ' + 'is made dense interim.', PerformanceWarning, stacklevel=2) values = self.to_dense() @@ -392,16 +392,15 @@ def set_values(self, indexer, value): return SparseArray(values, kind=self.kind, fill_value=self.fill_value) - warnings.warn( - 'Setting SparseSeries/Array values is inefficient ' - '(a copy of data is made).', PerformanceWarning, stacklevel=2) - # If label already in sparse index, just switch the value on a copy idx = self.sp_index.lookup(indexer) if idx != -1: - obj = self.copy() - obj.sp_values[idx] = value - return obj + self.sp_values[idx] = value + return self + + warnings.warn( + 'Setting new SparseSeries values is inefficient ' + '(a copy of data is made).', PerformanceWarning, stacklevel=2) # Otherwise, construct a new array, and insert the new value in the # correct position @@ -410,6 +409,7 @@ def set_values(self, indexer, value): indices = np.insert(indices, pos, indexer) sp_values = np.insert(self.sp_values, pos, value) + # 
Length can be increased when adding a new value into index length = max(self.sp_index.length, indexer + 1) sp_index = _make_index(length, indices, self.kind) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index e6c4e1278c35a..78558f5e7546d 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -277,11 +277,13 @@ def __array_wrap__(self, result, context=None): else: fill_value = self.fill_value - # Assume: If result size matches, old sparse index is valid (ok???) + # Only reuse old sparse index if result size matches + # (fails e.g. for ~sparseseries) if np.size(result) == self.sp_index.npoints: sp_index = self.sp_index else: sp_index = None + return self._constructor(result, index=self.index, sparse_index=sp_index, fill_value=fill_value, @@ -490,10 +492,10 @@ def set_value(self, label, value, takeable=False): "in a future release. Please use " ".at[] or .iat[] accessors instead", FutureWarning, stacklevel=2) + self._data = self._data.copy() return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): - self._data = self._data.copy() try: idx = self.index.get_loc(label) except KeyError: diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 67f1c463e820a..07b8267ee7510 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -10,7 +10,6 @@ from pandas import Series, DataFrame, bdate_range, Panel from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.errors import PerformanceWarning from pandas.tseries.offsets import BDay from pandas.util import testing as tm from pandas.compat import lrange @@ -461,8 +460,7 @@ def test_set_value(self): # ok, as the index gets converted to object frame = self.frame.copy() with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): res = frame.set_value('foobar', 'B', 1.5) 
assert res.index.dtype == 'object' @@ -470,24 +468,20 @@ def test_set_value(self): res.index = res.index.astype(object) with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): res = self.frame.set_value('foobar', 'B', 1.5) assert res.index[-1] == 'foobar' with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): assert res.get_value('foobar', 'B') == 1.5 with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): res2 = res.set_value('foobar', 'qux', 1.5) tm.assert_index_equal(res2.columns, pd.Index(list(self.frame.columns))) with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): assert res2.get_value('foobar', 'qux') == 1.5 def test_fancy_index_misc(self): @@ -594,9 +588,8 @@ def test_setitem_chained_no_consolidate(self): # issuecomment-361696418 # chained setitem used to cause consolidation sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - with tm.assert_produces_warning(PerformanceWarning): - with pd.option_context('mode.chained_assignment', None): - sdf[0][1] = 2 + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 assert len(sdf._data.blocks) == 2 def test_delitem(self): diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 9d21ab65bbccd..916fca5d8182c 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -486,14 +486,12 @@ def test_get_get_value(self): def test_set_value(self): idx = self.btseries.index[7] with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - ignore_extra=True): + check_stacklevel=False): self.btseries.set_value(idx, 0) assert self.btseries[idx] == 0 with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - 
ignore_extra=True): + check_stacklevel=False): self.iseries.set_value('foobar', 0) assert self.iseries.index[-1] == 'foobar' assert self.iseries['foobar'] == 0 diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 91dc30b1cdf4e..238d7e2f9be05 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -8,7 +8,6 @@ from pandas.compat import (is_platform_windows, is_platform_32bit) from pandas.core.config import option_context -from pandas.errors import PerformanceWarning use_32bit_repr = is_platform_windows() or is_platform_32bit() @@ -124,10 +123,9 @@ def test_sparse_repr_after_set(self): sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) res = sdf.copy() - with tm.assert_produces_warning(PerformanceWarning): - # Ignore the warning - with pd.option_context('mode.chained_assignment', None): - sdf[0][1] = 2 # This line triggers the bug + # Ignore the warning + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 # This line triggers the bug repr(sdf) tm.assert_sp_frame_equal(sdf, res) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index ab67bc4ad7f04..39ab498d080bf 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2493,8 +2493,7 @@ def exception_matches(self, exc_type, exc_value, trace_back): @contextmanager def assert_produces_warning(expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True, - ignore_extra=False): + clear=None, check_stacklevel=True): """ Context manager for running code expected to either raise a specific warning, or not raise any warnings. Verifies that the code raises the @@ -2531,8 +2530,6 @@ class for all warnings. To check that no warning is returned, If True, displays the line that called the function containing the warning to show were the function is called. Otherwise, the line that implements the function is displayed. 
- ignore_extra : bool, default False - If False, any extra, unexpected warnings are raised as errors. Examples -------- @@ -2599,7 +2596,6 @@ class for all warnings. To check that no warning is returned, msg = "Did not see expected warning of class {name!r}.".format( name=expected_warning.__name__) assert saw_warning, msg - if not ignore_extra: assert not extra_warnings, ("Caused unexpected warning(s): {extra!r}." ).format(extra=extra_warnings) From 4779c367abd6e008ec51bf7e355d8cf4ab5b57ea Mon Sep 17 00:00:00 2001 From: Kernc Date: Tue, 14 Aug 2018 03:19:40 +0200 Subject: [PATCH 6/6] TST: SparseArray.tolist() --- pandas/tests/sparse/test_array.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 2790464e2f811..f82491e8ac7e1 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -928,3 +928,9 @@ def test_ufunc_args(self): sparse = SparseArray([1, -1, 0, -2], fill_value=0) result = SparseArray([2, 0, 1, -1], fill_value=1) tm.assert_sp_array_equal(np.add(sparse, 1), result) + + def test_tolist(self): + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + assert isinstance(sparse.tolist(), list) + tm.assert_numpy_array_equal(np.array(sparse.tolist()), + np.array([1, np.nan, 2, np.nan, -2]))