From d518404dd60b145fed34468f12f1f220abcce586 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Mar 2019 16:04:16 -0500 Subject: [PATCH 01/22] Squashed commit of the following: commit 8b136bfd0629d96f84ddc4de643298f78defad40 Merge: 3005aed6b 01d3dc2e3 Author: Tom Augspurger Date: Fri Mar 15 16:03:23 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 3005aed6b9877217a3a45785aa246c2a8b67fd22 Author: Tom Augspurger Date: Thu Mar 14 06:26:32 2019 -0500 isort? commit 318c06f2ceed262056e7a0c3c029bc153e6e52a1 Merge: 0922296d5 79205ea8a Author: Tom Augspurger Date: Thu Mar 14 06:25:45 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 0922296d5852956cc4bbe6f3995507daadb32257 Author: Tom Augspurger Date: Wed Mar 13 21:35:51 2019 -0500 updates commit f433be85712e7e0753beaa99c074d56a16552359 Author: Tom Augspurger Date: Wed Mar 13 20:54:07 2019 -0500 lint commit 6696f280547bc4193c0da3769e2b2511abc43548 Merge: 534a3793a 101738219 Author: Tom Augspurger Date: Wed Mar 13 20:53:13 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 534a3793a65b33dd7549a4ec2009bfd0dc721ddc Merge: 94a7bafc3 5c341dc13 Author: Tom Augspurger Date: Tue Mar 12 14:37:27 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 94a7bafc33af00f0202c46eb07248c5e397aa77f Author: Tom Augspurger Date: Tue Mar 12 14:22:48 2019 -0500 fixups commit 6f619b5b8aee971f4d20e5dfce6f592c29ef9776 Author: Tom Augspurger Date: Tue Mar 12 13:38:48 2019 -0500 32-bit compat commit 24f48c3b380f6ec0a21944d553d1fa78ddd9d107 Author: Tom Augspurger Date: Mon Mar 11 22:05:46 2019 -0500 API: DataFrame.sparse accessor Closes https://github.com/pandas-dev/pandas/issues/25681 --- doc/source/reference/frame.rst | 23 ++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/sparse.py | 261 +++++++++++++++++++- pandas/core/frame.py | 2 + pandas/core/sparse/frame.py | 79 +----- pandas/tests/arrays/sparse/test_accessor.py | 85 +++++++ pandas/tests/arrays/sparse/test_array.py | 22 ++ 7 files changed, 391 insertions(+), 82 deletions(-) create mode 100644 pandas/tests/arrays/sparse/test_accessor.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 568acd5207bd1..4f256e86710ae 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.``. DataFrame.boxplot DataFrame.hist + +.. _api.frame.sparse: + +Sparse Accessor +~~~~~~~~~~~~~~~ + +Sparse-dtype specific methods and attributes are provided under the +``DataFrame.sparse`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + DataFrame.sparse.density + +.. autosummary:: + :toctree: api/ + + DataFrame.sparse.from_spmatrix + DataFrame.sparse.to_coo + DataFrame.sparse.to_dense + + Serialization / IO / Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 20d4f46348be6..6bf4280201ab5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -33,6 +33,7 @@ Other Enhancements - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. 
``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) +- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 2eb33d6d2c50f..7a4b8a6fa27da 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -689,6 +689,55 @@ def _simple_new( new._dtype = dtype return new + @classmethod + def from_spmatrix(cls, data): + """ + Create a SparseArray from a scipy.sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.sp_matrix + This should be a SciPy sparse matrix where the size + of the second dimension is 1. In other words, a + sparse matrix with a single column. + + Returns + ------- + SparseArray + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.coo_matrix((4, 1)) + >>> pd.SparseArray.from_spmatrix(mat) + [0.0, 0.0, 0.0, 0.0] + Fill: 0.0 + IntIndex + Indices: array([], dtype=int32) + """ + length, ncol = data.shape + + if ncol != 1: + raise ValueError( + "'data' must have a single column, not '{}'".format(ncol) + ) + + # our sparse index classes require that the positions be strictly + # increasing. So we need to sort loc, and arr accordingly. + arr = data.data + idx, _ = data.nonzero() + loc = np.argsort(idx) + arr = arr.take(loc) + idx.sort() + + zero = np.array(0, dtype=arr.dtype).item() + dtype = SparseDtype(arr.dtype, zero) + index = IntIndex(length, idx) + + return cls._simple_new(arr, index, dtype) + def __array__(self, dtype=None, copy=True): fill_value = self.fill_value @@ -1898,27 +1947,32 @@ def _make_index(length, indices, kind): # ---------------------------------------------------------------------------- # Accessor + +class BaseAccessor(object): + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + @delegate_names(SparseArray, ['npoints', 'density', 'fill_value', 'sp_values'], typ='property') -class SparseAccessor(PandasDelegate): +class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for SparseSparse from other sparse matrix data types. """ - def __init__(self, data=None): - self._validate(data) - # Store the Series since we need that for to_coo - self._parent = data - - @staticmethod - def _validate(data): + def _validate(self, data): if not isinstance(data.dtype, SparseDtype): - msg = "Can only use the '.sparse' accessor with Sparse data." 
- raise AttributeError(msg) + raise AttributeError(self._validation_msg) def _delegate_property_get(self, name, *args, **kwargs): - return getattr(self._parent.values, name) + return getattr(self._parent.array, name) def _delegate_method(self, name, *args, **kwargs): if name == 'from_coo': @@ -2032,3 +2086,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): column_levels, sort_labels=sort_labels) return A, rows, columns + + def to_dense(self): + """ + Convert a Series from sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series: + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ + from pandas import Series + return Series(self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name) + + +class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + .. versionadded :: 0.25.0 + """ + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + """ + Create a new DataFrame from a scipy sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + Each column of the DataFrame is stored as a + :class:`SparseArray`. + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + sparrays = [ + SparseArray.from_spmatrix(data[:, i]) + for i in range(data.shape[1]) + ] + data = dict(zip(columns, sparrays)) + return DataFrame(data, index=index) + + def to_dense(self): + """ + Convert a DataFrame with sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + DataFrame + A DataFrame with the same values stored as dense arrays. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() + for k, v in compat.iteritems(self._parent)} + return DataFrame(data, + index=self._parent.index, + columns=self._parent.columns) + + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.20.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + and uint64 will result in a float64 dtype. 
+ """ + try: + from scipy.sparse import coo_matrix + except ImportError: + raise ImportError('Scipy is not installed') + + dtype = find_common_type(self._parent.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, datas = [], [], [] + for col, name in enumerate(self._parent): + s = self._parent[name] + row = s.array.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.array.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) + + @property + def density(self): + """ + Ratio of non-sparse points to total (dense) data points + represented in the DataFrame. + """ + return np.mean([column.array.density + for _, column in self._parent.iteritems()]) + + @staticmethod + def _prep_index(data, index, columns): + import pandas.core.indexes.base as ibase + + N, K = data.shape + if index is None: + index = ibase.default_index(N) + if columns is None: + columns = ibase.default_index(K) + + if len(columns) != K: + raise ValueError('Column length mismatch: {columns} vs. {K}' + .format(columns=len(columns), K=K)) + if len(index) != N: + raise ValueError('Index length mismatch: {index} vs. {N}' + .format(index=len(index), N=N)) + return index, columns diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 48922ee870b8e..c0f7adbc7b95f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -33,6 +33,7 @@ from pandas.compat import PY36, lmap, lzip, raise_with_traceback from pandas.compat.numpy import function as nv +from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -8023,6 +8024,7 @@ def isin(self, values): plot = CachedAccessor("plot", gfx.FramePlotMethods) hist = gfx.hist_frame boxplot = gfx.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 31e94b4770b0e..5093b9cc36477 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -11,17 +11,16 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import find_common_type, maybe_upcast +from pandas.core.dtypes.cast import maybe_upcast from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseArray, SparseFrameAccessor import pandas.core.common as com from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index -import pandas.core.indexes.base as ibase from pandas.core.internals import ( BlockManager, create_block_manager_from_arrays) from pandas.core.internals.construction import extract_index, prep_ndarray @@ -192,7 +191,7 @@ def _init_matrix(self, data, index, columns, dtype=None): Init self from ndarray or list of lists. 
""" data = prep_ndarray(data, copy=False) - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) @@ -201,7 +200,7 @@ def _init_spmatrix(self, data, index, columns, dtype=None, """ Init self from scipy.sparse matrix. """ - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = data.tocoo() N = len(index) @@ -228,64 +227,9 @@ def _init_spmatrix(self, data, index, columns, dtype=None, return self._init_dict(sdict, index, columns, dtype) - def _prep_index(self, data, index, columns): - N, K = data.shape - if index is None: - index = ibase.default_index(N) - if columns is None: - columns = ibase.default_index(K) - - if len(columns) != K: - raise ValueError('Column length mismatch: {columns} vs. {K}' - .format(columns=len(columns), K=K)) - if len(index) != N: - raise ValueError('Index length mismatch: {index} vs. {N}' - .format(index=len(index), N=N)) - return index, columns - + @Appender(SparseFrameAccessor.to_coo.__doc__) def to_coo(self): - """ - Return the contents of the frame as a sparse SciPy COO matrix. - - .. versionadded:: 0.20.0 - - Returns - ------- - coo_matrix : scipy.sparse.spmatrix - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - Notes - ----- - The dtype will be the lowest-common-denominator type (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. By numpy.find_common_type convention, mixing int64 and - and uint64 will result in a float64 dtype. 
- """ - try: - from scipy.sparse import coo_matrix - except ImportError: - raise ImportError('Scipy is not installed') - - dtype = find_common_type(self.dtypes) - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - - cols, rows, datas = [], [], [] - for col, name in enumerate(self): - s = self[name] - row = s.sp_index.to_int_index().indices - cols.append(np.repeat(col, len(row))) - rows.append(row) - datas.append(s.sp_values.astype(dtype, copy=False)) - - cols = np.concatenate(cols) - rows = np.concatenate(rows) - datas = np.concatenate(datas) - return coo_matrix((datas, (rows, cols)), shape=self.shape) + return SparseFrameAccessor(self).to_coo() def __array_wrap__(self, result): return self._constructor( @@ -326,16 +270,9 @@ def _unpickle_sparse_frame_compat(self, state): self._default_fill_value = fv self._default_kind = kind + @Appender(SparseFrameAccessor.to_dense.__doc__) def to_dense(self): - """ - Convert to dense DataFrame - - Returns - ------- - df : DataFrame - """ - data = {k: v.to_dense() for k, v in self.items()} - return DataFrame(data, index=self.index, columns=self.columns) + return SparseFrameAccessor(self).to_dense() def _apply_columns(self, func): """ diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py new file mode 100644 index 0000000000000..d43addc45082c --- /dev/null +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -0,0 +1,85 @@ +import string + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas.util.testing as tm + + +class TestSeriesAccessor(object): + # TODO: collect other Series accessor tests + def test_to_dense(self): + s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') + result = s.sparse.to_dense() + expected = pd.Series([0, 1, 0, 10]) + tm.assert_series_equal(result, expected) + + +class TestFrameAccessor(object): + + def test_accessor_raises(self): + df = pd.DataFrame({"A": [0, 1]}) + with pytest.raises(AttributeError, match='sparse'): + df.sparse + + @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) + @pytest.mark.parametrize("labels", [ + None, + list(string.ascii_letters[:10]), + ]) + @pytest.mark.parametrize('dtype', ['float64', 'int64']) + @td.skip_if_no_scipy + def test_from_spmatrix(self, format, labels, dtype): + import scipy.sparse + + sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + + mat = scipy.sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix( + mat, index=labels, columns=labels + ) + expected = pd.DataFrame( + np.eye(10, dtype=dtype), + index=labels, + columns=labels, + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + def test_to_coo(self): + import scipy.sparse + + df = pd.DataFrame({ + "A": [0, 1, 0], + "B": [1, 0, 0], + }, dtype='Sparse[int64, 0]') + result = df.sparse.to_coo() + expected = scipy.sparse.coo_matrix(np.asarray(df)) + assert (result != expected).nnz == 0 + + def test_to_dense(self): + df = pd.DataFrame({ + "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)), + "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)), + "C": pd.SparseArray([1., 0.], + dtype=pd.SparseDtype('float64', 0.0)), + }, index=['b', 'a']) + result = df.sparse.to_dense() + expected = pd.DataFrame({ + 'A': [1, 0], + 'B': [1, 0], + 'C': [1.0, 0.0], + }, index=['b', 'a']) + tm.assert_frame_equal(result, expected) + + def test_density(self): + df = pd.DataFrame({ + 'A': pd.SparseArray([1, 0, 2, 1], fill_value=0), 
+ 'B': pd.SparseArray([0, 1, 1, 1], fill_value=0), + }) + res = df.sparse.density + expected = 0.75 + assert res == expected diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 88758c5d5c959..eef7bf78c54b2 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -171,6 +171,28 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value + @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) + @pytest.mark.parametrize('size', [0, 10]) + def test_from_spmatrix(self, size, format): + pytest.importorskip('scipy') + import scipy.sparse + + mat = scipy.sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + def test_from_spmatrix_raises(self): + pytest.importorskip('scipy') + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format='csc') + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + @pytest.mark.parametrize('scalar,dtype', [ (False, SparseDtype(bool, False)), (0.0, SparseDtype('float64', 0)), From c32e5fff329a1180ccf787adee8c9592fb209c47 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Mar 2019 14:22:48 -0500 Subject: [PATCH 02/22] DEPR: Deprecate SparseSeries and SparseDataFrame --- doc/source/user_guide/sparse.rst | 131 ++++++++++++++++++ pandas/core/sparse/frame.py | 6 + pandas/core/sparse/series.py | 8 ++ .../tests/arrays/sparse/test_arithmetics.py | 1 + pandas/tests/arrays/sparse/test_array.py | 3 + pandas/tests/frame/test_alter_axes.py | 2 + pandas/tests/frame/test_indexing.py | 1 + pandas/tests/frame/test_subclass.py | 2 + pandas/tests/io/json/test_pandas.py | 1 + pandas/tests/io/test_packers.py | 2 + pandas/tests/io/test_pytables.py | 3 + pandas/tests/reshape/test_reshape.py | 1 + pandas/tests/series/test_api.py | 6 +- pandas/tests/series/test_combine_concat.py | 1 + pandas/tests/series/test_missing.py | 15 +- pandas/tests/series/test_subclass.py | 2 + pandas/tests/sparse/frame/test_analytics.py | 2 + pandas/tests/sparse/frame/test_apply.py | 13 +- pandas/tests/sparse/frame/test_frame.py | 33 +++-- pandas/tests/sparse/frame/test_to_csv.py | 1 + .../tests/sparse/frame/test_to_from_scipy.py | 5 + pandas/tests/sparse/series/test_series.py | 42 ++++-- pandas/tests/sparse/test_combine_concat.py | 24 +++- pandas/tests/sparse/test_format.py | 3 + pandas/tests/sparse/test_groupby.py | 2 + pandas/tests/sparse/test_indexing.py | 4 + pandas/tests/sparse/test_pivot.py | 2 + pandas/tests/sparse/test_reshape.py | 3 + pandas/util/testing.py | 3 +- 29 files changed, 290 insertions(+), 32 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 20962749e2040..a7fdc05ef5640 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -6,6 +6,12 @@ Sparse data structures ********************** +.. note:: + + ``SparseSeries`` and ``SparseDataFrame`` have been deprecated. Their purpose + is served equally well by a :class:`Series` or :class:`DataFrame` with + sparse values. See :ref:`sparse.migration` for tips on migrating. + We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse in the typical "mostly 0". 
Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value
@@ -162,6 +168,80 @@ It raises if any value cannot be coerced to specified dtype.
 Out[2]: ValueError: unable to coerce current fill_value nan to int64 dtype
+
+
+We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse
+in the typical "mostly 0". Rather, you can view these objects as being "compressed"
+where any data matching a specific value (``NaN`` / missing value, though any value
+can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been
+"sparsified". This will make much more sense with an example. All of the standard pandas
+data structures have a ``to_sparse`` method:
+
+.. ipython:: python
+
+   ts = pd.Series(np.random.randn(10))
+   ts[2:-2] = np.nan
+   sts = ts.to_sparse()
+   sts
+
+The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see
+below) and a ``fill_value``. So if we had a mostly zero ``Series``, we could
+convert it to sparse with ``fill_value=0``:
+
+.. ipython:: python
+
+   ts.fillna(0).to_sparse(fill_value=0)
+
+The sparse objects exist for memory efficiency reasons. Suppose you had a
+large, mostly NA ``DataFrame``:
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.random.randn(10000, 4))
+   df.iloc[:9998] = np.nan
+   sdf = df.to_sparse()
+   sdf
+   sdf.density
+
+As you can see, the density (% of values that have not been "compressed") is
+extremely low. This sparse object takes up much less memory on disk (pickled)
+and in the Python interpreter. Functionally, their behavior should be nearly
+identical to their dense counterparts.
+
+Any sparse object can be converted back to the standard dense form by calling
+``to_dense``:
+
+.. ipython:: python
+
+   sts.to_dense()
+
+.. _sparse.accessor:
+
+Sparse Accessor
+---------------
+
+.. versionadded:: 0.24.0
+
+Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
+for categorical data, and ``.dt`` for datetime-like data. This namespace provides
+attributes and methods that are specific to sparse data.
+
+.. ipython:: python
+
+   s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
+   s.sparse.density
+   s.sparse.fill_value
+
+This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
+class itself for creating a Series with sparse data from a scipy COO matrix.
+
+
+.. versionadded:: 0.25.0
+
+A ``.sparse`` accessor has been added for :class:`DataFrame` as well.
+See :ref:`api.frame.sparse` for more.
+
+
 .. _sparse.calculation:
 Sparse Calculation
@@ -291,3 +371,54 @@ row and columns coordinates of the matrix. Note that this will consume a signifi
 ss_dense = pd.SparseSeries.from_coo(A, dense_index=True)
 ss_dense
+
+
+.. _sparse.migration:
+
+Migrating from SparseSeries and SparseDataFrame
+-----------------------------------------------
+
+:class:`SparseArray` is the building block for all of ``Series``, ``SparseSeries``,
+``DataFrame``, and ``SparseDataFrame``. To simplify the pandas API and lower maintenance burden,
+we've deprecated the ``SparseSeries`` and ``SparseDataFrame`` classes.
+
+**There's no performance or memory penalty to using a Series or DataFrame with sparse values,
+rather than a SparseSeries or SparseDataFrame**.
+
+**Construction**
+
+Use the regular :class:`Series` or :class:`DataFrame` constructors with :class:`SparseArray` values
+
+.. 
ipython:: python
+
+   pd.DataFrame({"A": pd.SparseArray([0, 1])})
+
+Or use :meth:`DataFrame.sparse.from_spmatrix`
+
+.. ipython:: python
+
+   from scipy import sparse
+   mat = sparse.eye(3)
+   df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C'])
+   df
+
+**Conversion**
+
+Use the ``.sparse`` accessors
+
+.. ipython:: python
+
+   df.sparse.to_dense()
+   df.sparse.to_coo()
+   df['A']
+
+**Sparse Properties**
+
+Sparse-specific properties, like ``density``, are available on the ``.sparse`` accessor.
+
+.. ipython:: python
+
+   df.sparse.density
+
+The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes
+have no replacement.
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 5093b9cc36477..cfd0911ffc434 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -29,6 +29,11 @@ from pandas.core.sparse.series import SparseSeries
 _shared_doc_kwargs = dict(klass='SparseDataFrame')
+depr_msg = """\
+SparseDataFrame is deprecated.
+
+See ... for more.
+"""
 class SparseDataFrame(DataFrame):
@@ -57,6 +62,7 @@ class SparseDataFrame(DataFrame):
     def __init__(self, data=None, index=None, columns=None, default_kind=None, default_fill_value=None, dtype=None, copy=False):
+        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
         # pick up the defaults from the Sparse structures
         if isinstance(data, SparseDataFrame):
             if index is None:
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
index 11231ce90b6b9..7a7936f9cda11 100644
--- a/pandas/core/sparse/series.py
+++ b/pandas/core/sparse/series.py
@@ -32,6 +32,13 @@ optional_labels='', optional_axis='')
+depr_msg = """\
+SparseSeries is deprecated.
+
+See ... for more.
+"""
+
+
 class SparseSeries(Series):
     """Data structure for labeled, sparse floating point data
@@ -60,6 +67,7 @@ class SparseSeries(Series):
     def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False):
+        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
         # TODO: Most of this should be refactored and shared with Series
         # 1. BlockManager -> array
        # 2. 
Series.index, Series.name, index, name reconciliation diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 42a29654b44d5..7b2401494419f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -8,6 +8,7 @@ import pandas.util.testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseArrayArithmetics(object): _base = np.array diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index eef7bf78c54b2..8e63a28982150 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -209,6 +209,7 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype): assert exp.dtype == dtype @pytest.mark.parametrize("fill", [1, np.nan, 0]) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip(self, kind, fill): # see gh-13999 arr = SparseArray([np.nan, 1, np.nan, 2, 3], @@ -225,6 +226,7 @@ def test_sparse_series_round_trip(self, kind, fill): tm.assert_sp_array_equal(arr, res) @pytest.mark.parametrize("fill", [True, False, np.nan]) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip2(self, kind, fill): # see gh-13999 arr = SparseArray([True, False, True, True], dtype=np.bool, @@ -1093,6 +1095,7 @@ def test_npoints(self): assert arr.npoints == 1 +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestAccessor(object): @pytest.mark.parametrize('attr', [ diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index f2da432e9d135..eb15b1f600bc5 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -17,6 +17,7 @@ import pandas.util.testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestDataFrameAlterAxes(): def test_set_index_directly(self, float_string_frame): @@ -1376,6 +1377,7 @@ def test_droplevel(self): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestIntervalIndex(object): def test_setitem(self): diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index f58fe85cad258..926ffad7ab185 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2076,6 +2076,7 @@ def test_loc_duplicates(self): df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_iloc_sparse_propegate_fill_value(self): from pandas.core.sparse.api import SparseDataFrame df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 73c5c43cb709d..267f79b855f14 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -192,6 +192,7 @@ def test_subclass_iterrows(self): assert isinstance(row, tm.SubclassedSeries) tm.assert_series_equal(row, df.loc[i]) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_subclass_sparse_slice(self): rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] ssdf = tm.SubclassedSparseDataFrame(rows) @@ -216,6 +217,7 @@ def test_subclass_sparse_slice(self): check_names=False, check_kind=False) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_subclass_sparse_transpose(self): ossdf = 
tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0b1b7879910e4..1888f028839d2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1003,6 +1003,7 @@ def test_datetime_tz(self): s_naive = Series(tz_naive) assert stz.to_json() == s_naive.to_json() + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 91dbe5c78acf3..4e1ae3a9fc9ca 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -551,6 +551,7 @@ def test_dataframe_duplicate_column_names(self): assert_frame_equal(result_3, expected_3) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): @@ -841,6 +842,7 @@ def legacy_packer(request, datapath): @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestMsgpack(object): """ How to add msgpack tests: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ed070ce549081..9a573250fc7ad 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2244,6 +2244,7 @@ def test_series(self): self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series(self): s = tm.makeStringSeries() @@ -2260,6 +2261,7 @@ def test_sparse_series(self): self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_frame(self): s = tm.makeDataFrame() @@ -3742,6 +3744,7 @@ def test_start_stop_multiple(self): expected = df.loc[[0], ['foo', 'bar']] tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index ca083bbde8428..593a0ee1deac2 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -14,6 +14,7 @@ from pandas.util.testing import assert_frame_equal +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestGetDummies(object): @pytest.fixture diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 23e39182dd779..c4f8d35cdd121 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -125,6 +125,7 @@ def test_sort_index_name(self): result = self.ts.sort_index(ascending=False) assert result.name == self.ts.name + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_to_sparse_pass_name(self): result = self.ts.to_sparse() assert result.name == self.ts.name @@ -197,9 +198,12 @@ def test_constructor_dict_timedelta_index(self): ) self._assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_array_deprecated(self): - with tm.assert_produces_warning(FutureWarning): + # multiple FutureWarnings, so can't assert stacklevel + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): self.series_klass.from_array([1, 2, 3]) def 
test_sparse_accessor_updates_on_inplace(self): diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 3aa98db171a46..5a69a82ea7d1a 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -211,6 +211,7 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): exp = pd.Series(exp_vals, name='ser1') assert_series_equal(exp, result) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_concat_empty_series_dtypes(self): # booleans diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 94b643900ee0f..e625a9cab4c37 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -781,6 +781,7 @@ def test_series_fillna_limit(self): expected[:3] = np.nan assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_fillna_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) @@ -788,7 +789,8 @@ def test_sparse_series_fillna_limit(self): ss = s[:2].reindex(index).to_sparse() # TODO: what is this test doing? why are result an expected # the same call to fillna? - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): # TODO: release-note fillna performance warning result = ss.fillna(method='pad', limit=5) expected = ss.fillna(method='pad', limit=5) @@ -798,7 +800,8 @@ def test_sparse_series_fillna_limit(self): assert_series_equal(result, expected) ss = s[-2:].reindex(index).to_sparse() - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): result = ss.fillna(method='backfill', limit=5) expected = ss.fillna(method='backfill') expected = expected.to_dense() @@ -806,13 +809,15 @@ def test_sparse_series_fillna_limit(self): expected = expected.to_sparse() assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) s = s.to_sparse() result = s[:2].reindex(index, method='pad', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = s[:2].reindex(index).fillna(method='pad') expected = expected.to_dense() expected[-3:] = np.nan @@ -820,13 +825,15 @@ def test_sparse_series_pad_backfill_limit(self): assert_series_equal(result, expected) result = s[-2:].reindex(index, method='backfill', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = s[-2:].reindex(index).fillna(method='backfill') expected = expected.to_dense() expected[:3] = np.nan expected = expected.to_sparse() assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index deb09a8a9dac3..08807c1b9df32 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,5 +1,6 @@ # coding=utf-8 import numpy as np +import pytest import pandas as pd from pandas import SparseDtype @@ 
-40,6 +41,7 @@ def test_subclass_unstack(self): tm.assert_frame_equal(res, exp) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesSubclassing(object): def test_subclass_sparse_slice(self): diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index 95c1c8c453d0a..ae97682f297ad 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -5,6 +5,7 @@ from pandas.util import testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') def test_quantile(): # GH 17386 @@ -22,6 +23,7 @@ def test_quantile(): tm.assert_sp_series_equal(result, sparse_expected) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') def test_quantile_multi(): # GH 17386 diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index b5ea0a5c90e1a..afb54a9fa6264 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -37,17 +37,21 @@ def fill_frame(frame): index=frame.index) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply(frame): applied = frame.apply(np.sqrt) assert isinstance(applied, SparseDataFrame) tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) # agg / broadcast - with tm.assert_produces_warning(FutureWarning): + # two FutureWarnings, so we can't check stacklevel properly. + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): broadcasted = frame.apply(np.sum, broadcast=True) assert isinstance(broadcasted, SparseDataFrame) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): exp = frame.to_dense().apply(np.sum, broadcast=True) tm.assert_frame_equal(broadcasted.to_dense(), exp) @@ -56,15 +60,18 @@ def test_apply(frame): frame.to_dense().apply(nanops.nansum).to_sparse()) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_fill(fill_frame): applied = fill_frame.apply(np.sqrt) assert applied['A'].fill_value == np.sqrt(2) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_empty(empty): assert empty.apply(np.sqrt) is empty +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_nonuq(): orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) @@ -88,12 +95,14 @@ def test_apply_nonuq(): # tm.assert_series_equal(res.to_dense(), exp) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_applymap(frame): # just test that it works result = frame.applymap(lambda x: x * 2) assert isinstance(result, SparseDataFrame) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_keep_sparse_dtype(): # GH 23744 sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 22868030308d7..7e04f6c33e5dc 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -20,6 +20,12 @@ from pandas.tseries.offsets import BDay +def test_deprecated(): + with tm.assert_produces_warning(FutureWarning): + pd.SparseDataFrame({"A": [1, 2]}) + + +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrame(SharedWithSparse): klass = SparseDataFrame @@ -669,7 +675,8 
@@ def test_append(self, float_frame): a = float_frame.iloc[:5, :3] b = float_frame.iloc[5:] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, + raise_on_extra_warnings=False): # Stacklevel is set for pd.concat, not append appended = a.append(b) tm.assert_sp_frame_equal(appended.iloc[:, :3], float_frame.iloc[:, :3], @@ -684,12 +691,12 @@ def test_append(self, float_frame): "A": [None, None, 2, 3], "D": [None, None, 5, None], }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=False) tm.assert_frame_equal(appended, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=True) tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']], @@ -810,7 +817,8 @@ def test_sparse_frame_pad_backfill_limit(self): result = sdf[:2].reindex(index, method='pad', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = sdf[:2].reindex(index).fillna(method='pad') expected = expected.to_dense() expected.values[-3:] = np.nan @@ -819,7 +827,8 @@ def test_sparse_frame_pad_backfill_limit(self): result = sdf[-2:].reindex(index, method='backfill', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = sdf[-2:].reindex(index).fillna(method='backfill') expected = expected.to_dense() expected.values[:3] = np.nan @@ -832,10 +841,12 @@ def test_sparse_frame_fillna_limit(self): sdf = df.to_sparse() result = sdf[:2].reindex(index) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): result = result.fillna(method='pad', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = sdf[:2].reindex(index).fillna(method='pad') expected = expected.to_dense() expected.values[-3:] = np.nan @@ -843,10 +854,12 @@ def test_sparse_frame_fillna_limit(self): tm.assert_frame_equal(result, expected) result = sdf[-2:].reindex(index) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): result = result.fillna(method='backfill', limit=5) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): expected = sdf[-2:].reindex(index).fillna(method='backfill') expected = expected.to_dense() expected.values[:3] = np.nan @@ -1283,6 +1296,7 @@ def test_default_fill_value_with_no_data(self): tm.assert_frame_equal(expected, result) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameArithmetic(object): def test_numeric_op_scalar(self): @@ -1312,6 +1326,7 @@ def test_comparison_op_scalar(self): tm.assert_frame_equal(res.to_dense(), df != 0) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameAnalytics(object): def test_cumsum(self, float_frame): diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py index ed19872f8a7ef..5ade0457f8db6 
100644 --- a/pandas/tests/sparse/frame/test_to_csv.py +++ b/pandas/tests/sparse/frame/test_to_csv.py @@ -5,6 +5,7 @@ from pandas.util import testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameToCsv(object): fill_values = [np.nan, 0, None, 1] diff --git a/pandas/tests/sparse/frame/test_to_from_scipy.py b/pandas/tests/sparse/frame/test_to_from_scipy.py index a80a51a66017e..269d67976b567 100644 --- a/pandas/tests/sparse/frame/test_to_from_scipy.py +++ b/pandas/tests/sparse/frame/test_to_from_scipy.py @@ -19,6 +19,7 @@ @pytest.mark.parametrize('fill_value', [None, 0, np.nan]) @pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) @ignore_matrix_warning +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # GH 4343 # Make one ndarray and from it one sparse matrix, both to be used for @@ -69,6 +70,7 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): @pytest.mark.parametrize('fill_value', [None, 0, np.nan]) # noqa: F811 @ignore_matrix_warning @pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning") +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy_object(spmatrix, fill_value): # GH 4343 dtype = object @@ -117,6 +119,7 @@ def test_from_to_scipy_object(spmatrix, fill_value): @ignore_matrix_warning +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_scipy_correct_ordering(spmatrix): # GH 16179 arr = np.arange(1, 5).reshape(2, 2) @@ -136,6 +139,7 @@ def test_from_scipy_correct_ordering(spmatrix): @ignore_matrix_warning +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_scipy_fillna(spmatrix): # GH 16112 arr = np.eye(3) @@ -169,6 +173,7 @@ def test_from_scipy_fillna(spmatrix): tm.assert_sp_frame_equal(sdf, expected) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_index_names_multiple_nones(): # https://github.com/pandas-dev/pandas/pull/24092 sparse = pytest.importorskip("scipy.sparse") diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 35ca5e1ec58fa..3446990b7a0e8 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -21,6 +21,11 @@ from pandas.tseries.offsets import BDay +def test_deprecated(): + with tm.assert_produces_warning(FutureWarning): + pd.SparseSeries([0, 1]) + + def _test_data1(): # nan-based arr = np.arange(20, dtype=float) @@ -55,6 +60,7 @@ def _test_data2_zero(): return arr, index +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeries(SharedWithSparse): series_klass = SparseSeries @@ -532,10 +538,13 @@ def _compare(idx): exp = pd.Series(np.repeat(nan, 5)) tm.assert_series_equal(sp.take([0, 1, 2, 3, 4]), exp.to_sparse()) - with tm.assert_produces_warning(FutureWarning): + # multiple FutureWarnings, can't check stacklevel + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): sp.take([1, 5], convert=True) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): sp.take([1, 5], convert=False) def test_numpy_take(self): @@ -1032,6 +1041,7 @@ def test_memory_usage_deep(self, deep, fill_value): assert sparse_usage < dense_usage +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseHandlingMultiIndexes(object): def setup_method(self, method): @@ -1062,6 +1072,7 @@ def 
test_round_trip_preserve_multiindex_names(self): @pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning" ) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesScipyInteraction(object): # Issue 8048: add SparseSeries coo methods @@ -1253,13 +1264,15 @@ def test_concat_different_fill(self): sparse1 = pd.SparseSeries(val1, name='x', kind=kind) sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) exp = pd.SparseSeries(exp, kind=kind, fill_value=0) @@ -1285,13 +1298,15 @@ def test_concat_different_kind(self): sparse1 = pd.SparseSeries(val1, name='x', kind='integer') sparse2 = pd.SparseSeries(val2, name='y', kind='block', fill_value=0) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind='integer') tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) exp = pd.SparseSeries(exp, kind='block', fill_value=0) @@ -1425,6 +1440,7 @@ def _dense_series_compare(s, f): tm.assert_series_equal(result.to_dense(), dense_result) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesAnalytics(object): def setup_method(self, method): @@ -1484,16 +1500,20 @@ def test_deprecated_numpy_func_call(self): for func in funcs: for series in ('bseries', 'zbseries'): with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): getattr(np, func)(getattr(self, series)) with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): getattr(getattr(self, series), func)() def test_deprecated_reindex_axis(self): # https://github.com/pandas-dev/pandas/issues/17833 - with tm.assert_produces_warning(FutureWarning) as m: + # Multiple FutureWarnings, can't check stacklevel + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) as m: self.bseries.reindex_axis([0, 1, 2]) assert 'reindex' in str(m[0].message) @@ -1502,6 +1522,7 @@ def test_deprecated_reindex_axis(self): 'datetime_type', (np.datetime64, pd.Timestamp, lambda x: datetime.strptime(x, '%Y-%m-%d'))) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_constructor_dict_datetime64_index(datetime_type): # GH 9456 dates = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] @@ -1513,6 +1534,7 @@ def test_constructor_dict_datetime64_index(datetime_type): tm.assert_sp_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_to_sparse(): # https://github.com/pandas-dev/pandas/issues/22389 arr = 
pd.SparseArray([1, 2, None, 3]) @@ -1521,12 +1543,14 @@ def test_to_sparse(): tm.assert_sp_array_equal(result.values, arr, check_kind=False) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_constructor_mismatched_raises(): msg = "Length of passed values is 2, index implies 3" with pytest.raises(ValueError, match=msg): SparseSeries([1, 2], index=[1, 2, 3]) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_block_deprecated(): s = SparseSeries([1]) with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 51875148daadb..23e12e15075ff 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -35,6 +35,7 @@ def test_uses_first_kind(self, kind): assert result.kind == kind +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesConcat(object): @pytest.mark.parametrize('kind', [ @@ -82,14 +83,16 @@ def test_concat_different_fill(self): sparse1 = pd.SparseSeries(val1, name='x', kind=kind) sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) @@ -176,6 +179,7 @@ def test_concat_sparse_dense(self, kind): tm.assert_series_equal(res, exp) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameConcat(object): def setup_method(self, method): @@ -245,12 +249,14 @@ def test_concat_different_fill_value(self): sparse = self.dense1.to_sparse() sparse2 = self.dense2.to_sparse(fill_value=0) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse, sparse2]) exp = pd.concat([self.dense1, self.dense2]).to_sparse() tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(PerformanceWarning, + raise_on_extra_warnings=False): res = pd.concat([sparse2, sparse]) exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) exp._default_fill_value = np.nan @@ -260,9 +266,15 @@ def test_concat_different_columns_sort_warns(self): sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse() - with tm.assert_produces_warning(FutureWarning): + # stacklevel is wrong since we have two FutureWarnings, + # one for depr, one for sorting. 
+ with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False, + raise_on_extra_warnings=False): res = pd.concat([sparse, sparse3]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False, + raise_on_extra_warnings=False,): exp = pd.concat([self.dense1, self.dense3]) exp = exp.to_sparse() diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 5f44266620f86..e310c02ff3b05 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import numpy as np +import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -10,6 +11,7 @@ use_32bit_repr = is_platform_windows() or is_platform_32bit() +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesFormatting(object): @property @@ -106,6 +108,7 @@ def test_sparse_int(self): assert result == exp +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameFormatting(object): def test_sparse_frame(self): diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index d0ff2a02c4046..5146741cc5e44 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -6,6 +6,7 @@ import pandas.util.testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseGroupBy(object): def setup_method(self, method): @@ -60,6 +61,7 @@ def test_aggfuncs(self): @pytest.mark.parametrize("fill_value", [0, np.nan]) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_groupby_includes_fill_value(fill_value): # https://github.com/pandas-dev/pandas/issues/5078 df = pd.DataFrame({'a': [fill_value, 1, fill_value, fill_value], diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index fb6cae3ad6deb..b2aaf0a8db43d 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -6,6 +6,7 @@ import pandas.util.testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesIndexing(object): def setup_method(self, method): @@ -454,6 +455,7 @@ def tests_indexing_with_sparse(self, kind, fill): s.iloc[indexer] +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): def setup_method(self, method): @@ -599,6 +601,7 @@ def test_reindex(self): assert sparse is not res +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseDataFrameIndexing(object): def test_getitem(self): @@ -976,6 +979,7 @@ def test_reindex_fill_value(self): tm.assert_sp_frame_equal(res, exp) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestMultitype(object): def setup_method(self, method): diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index af7de43ec0f8a..2288ac671f724 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -1,9 +1,11 @@ import numpy as np +import pytest import pandas as pd import pandas.util.testing as tm +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestPivotTable(object): def setup_method(self, method): diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py index 6830e40ce6533..37ec0bba2621d 100644 --- a/pandas/tests/sparse/test_reshape.py +++ b/pandas/tests/sparse/test_reshape.py @@ -15,12 +15,14 @@ def 
multi_index3(): return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_frame_stack(sparse_df, multi_index3): ss = sparse_df.stack() expected = pd.SparseSeries(np.ones(3), index=multi_index3) tm.assert_sp_series_equal(ss, expected) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_frame_unstack(sparse_df): mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) sparse_df.index = mi @@ -33,6 +35,7 @@ def test_sparse_frame_unstack(sparse_df): tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_unstack(sparse_df, multi_index3): frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack() diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9659cb33686d0..ef740f4a57c74 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2422,7 +2422,8 @@ class for all warnings. To check that no warning is returned, pass saw_warning = False - warnings.simplefilter(filter_level) + if filter_level: + warnings.simplefilter(filter_level) yield w extra_warnings = [] From c0d6cf23009042aab7389c647b1305e37d43ad6a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 10:12:52 -0500 Subject: [PATCH 03/22] fixup --- pandas/core/arrays/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index f35d645bdce02..66ccd6848864d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1956,7 +1956,7 @@ def _make_index(length, indices, kind): # Accessor -class BaseAccessor(object): +class BaseAccessor: _validation_msg = "Can only use the '.sparse' accessor with Sparse data." 
def __init__(self, data=None): From 8f06d882c0a22b3d43152928044375c7199855c2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 14:47:35 -0500 Subject: [PATCH 04/22] fixup --- pandas/tests/dtypes/test_common.py | 28 ++++++++++++++++++++++++---- pandas/tests/dtypes/test_dtypes.py | 7 ++++++- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c48fae5c26301..c7a62dfe77c37 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -15,6 +15,10 @@ from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm +ignore_sparse_warning = pytest.mark.filterwarnings( + "ignore:Sparse:FutureWarning" +) + # EA & Actual Dtypes def to_ea_dtypes(dtypes): @@ -146,6 +150,7 @@ def test_is_object(): @pytest.mark.parametrize("check_scipy", [ False, pytest.param(True, marks=td.skip_if_no_scipy) ]) +@ignore_sparse_warning def test_is_sparse(check_scipy): assert com.is_sparse(pd.SparseArray([1, 2, 3])) assert com.is_sparse(pd.SparseSeries([1, 2, 3])) @@ -158,6 +163,7 @@ def test_is_sparse(check_scipy): @td.skip_if_no_scipy +@ignore_sparse_warning def test_is_scipy_sparse(): from scipy.sparse import bsr_matrix assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) @@ -529,6 +535,7 @@ def test_is_bool_dtype(): @pytest.mark.parametrize("check_scipy", [ False, pytest.param(True, marks=td.skip_if_no_scipy) ]) +@ignore_sparse_warning def test_is_extension_type(check_scipy): assert not com.is_extension_type([1, 2, 3]) assert not com.is_extension_type(np.array([1, 2, 3])) @@ -595,8 +602,6 @@ def test_is_offsetlike(): (pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')), (' Date: Tue, 14 May 2019 14:50:29 -0500 Subject: [PATCH 05/22] fixup --- pandas/tests/dtypes/test_generic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index e0590591c6899..142ed2f9fc24d 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -1,4 +1,4 @@ -from warnings import catch_warnings +from warnings import catch_warnings, simplefilter import numpy as np @@ -17,9 +17,12 @@ class TestABCClasses: categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1]) categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index) - sparse_series = pd.Series([1, 2, 3]).to_sparse() + with catch_warnings(): + simplefilter('ignore', FutureWarning) + sparse_series = pd.Series([1, 2, 3]).to_sparse() + sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]}) + sparse_array = pd.SparseArray(np.random.randn(10)) - sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]}) datetime_array = pd.core.arrays.DatetimeArray(datetime_index) timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) From 21569e2c92b50761999416ac5ca2e8658432d2b5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 14:54:13 -0500 Subject: [PATCH 06/22] fixup --- doc/source/user_guide/sparse.rst | 4 ++-- pandas/core/sparse/frame.py | 6 ++++-- pandas/core/sparse/series.py | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index a7fdc05ef5640..09e02e597c39e 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -375,8 +375,8 @@ row and columns coordinates of the matrix. 
Note that this will consume a signifi .. _sparse.migration: -Migrating from SparseSeries and SparseDataFrame ------------------------------------------------ +Migrating +--------- :class:`SparseArray` is the building block for all of ``Series``, ``SparseSeries``, ``DataFrame``, and ``SparseDataFrame``. To simplify the pandas API and lower maintenance burden, diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index a0c63fc0ff02f..400af483815f5 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -29,9 +29,11 @@ _shared_doc_kwargs = dict(klass='SparseDataFrame') depr_msg = """\ -SparseDataFrame is deprecated. +SparseDataFrame is deprecated and will be removed in a future version. +Use a DataFrame with sparse values instead. -See ... for more. +See http://pandas.pydata.org/pandas-docs/stable/\ +user_guide/sparse.html#migrating for more. """ diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index ee633c75bb425..7b196282f42bc 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -33,9 +33,11 @@ depr_msg = """\ -SparseSeries is deprecated. +SparseSeries is deprecated and will be removed in a future version. +Use a Series with sparse values instead. -See ... for more. +See http://pandas.pydata.org/pandas-docs/stable/\ +user_guide/sparse.html#migrating for more. """ From 6a818371b40fde197f1ae467e66e2b4f85a77737 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 15:20:34 -0500 Subject: [PATCH 07/22] docs --- doc/source/user_guide/sparse.rst | 328 ++++++++++++------------------- 1 file changed, 128 insertions(+), 200 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 09e02e597c39e..f5f49a59d2a4e 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -12,27 +12,22 @@ Sparse data structures is served equally well by a :class:`Series` or :class:`DataFrame` with sparse values. See :ref:`sparse.migration` for tips on migrating. -We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse -in the typical "mostly 0". Rather, you can view these objects as being "compressed" -where any data matching a specific value (``NaN`` / missing value, though any value -can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been -"sparsified". This will make much more sense with an example. All of the standard pandas -data structures have a ``to_sparse`` method: +Pandas provides data structures for efficiently storing sparse data. +These are not necessarily sparse in the typical "mostly 0". Rather, you can view these +objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value +can be chosen, including 0) is omitted. A special ``SparseIndex`` object tracks where data has been +"sparsified". For example, .. ipython:: python - ts = pd.Series(np.random.randn(10)) - ts[2:-2] = np.nan - sts = ts.to_sparse() - sts - -The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see -below) and a ``fill_value``. So if we had a mostly zero ``Series``, we could -convert it to sparse with ``fill_value=0``: - -.. ipython:: python + arr = np.random.randn(10) + arr[2:-2] = np.nan + ts = pd.Series(pd.SparseArray(arr)) + ts - ts.fillna(0).to_sparse(fill_value=0) +Notice the dtype, ``Sparse[float64, nan]``. 
The ``nan`` means that elements in the +array that are ``nan`` aren't actually stored, only the non-``nan`` elements are. +Those non-``nan`` elements have a ``float64`` dtype. The sparse objects exist for memory efficiency reasons. Suppose you had a large, mostly NA ``DataFrame``: @@ -41,21 +36,64 @@ large, mostly NA ``DataFrame``: df = pd.DataFrame(np.random.randn(10000, 4)) df.iloc[:9998] = np.nan - sdf = df.to_sparse() + sdf = df.astype(pd.SparseDtype("float", np.nan)) sdf - sdf.density + sdf.sparse.density As you can see, the density (% of values that have not been "compressed") is extremely low. This sparse object takes up much less memory on disk (pickled) and in the Python interpreter. Functionally, their behavior should be nearly identical to their dense counterparts. -Any sparse object can be converted back to the standard dense form by calling -``to_dense``: +.. _sparse.array: + +SparseArray +----------- + +:class:`SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` +for storing an array of sparse values (see :ref:`basics.dtypes` for more +on extension arrays). It is a 1-dimensional ndarray-like object storing +only values distinct from the ``fill_value``: + +.. ipython:: python + + arr = np.random.randn(10) + arr[2:5] = np.nan + arr[7:8] = np.nan + sparr = pd.SparseArray(arr) + sparr + +A sparse array can be converted to a regular (dense) ndarray with :meth:`numpy.asarray` + +.. ipython:: python + + np.asarray(sparr) + +The :attr:`SparseArray.dtype` property stores two pieces of information + +1. The dtype of the non-sparse values +2. The scalar fill value + +A :class:`SparseDtype` may be constructed by passing each of these + +.. ipython:: python + + pd.SparseDtype(np.dtype('datetime64[ns]')) + +The default fill value for a given NumPy dtype is the "missing" value for that dtype, +though it may be overridden. + +.. ipython:: python + + pd.SparseDtype(np.dtype('datetime64[ns]'), + fill_value=pd.Timestamp('2017-01-01')) + +Finally, the string alias ``'Sparse[dtype]'`` may be used to specify a sparse dtype +in many places .. ipython:: python - sts.to_dense() + pd.array([1, 0, 0, 2], dtype='Sparse[int]') .. _sparse.accessor: @@ -77,30 +115,11 @@ attributes and methods that are specific to sparse data. This accessor is available only on data with ``SparseDtype``, and on the :class:`Series` class itself for creating a Series with sparse data from a scipy COO matrix with. -.. _sparse.array: - -SparseArray ------------ - -``SparseArray`` is the base layer for all of the sparse indexed data -structures. It is a 1-dimensional ndarray-like object storing only values -distinct from the ``fill_value``: - -.. ipython:: python - - arr = np.random.randn(10) - arr[2:5] = np.nan - arr[7:8] = np.nan - sparr = pd.SparseArray(arr) - sparr - -Like the indexed objects (SparseSeries, SparseDataFrame), a ``SparseArray`` -can be converted back to a regular ndarray by calling ``to_dense``: -.. ipython:: python - - sparr.to_dense() +.. versionadded:: 0.25.0 +A ``.sparse`` accessor has been added for :class:`DataFrame` as well. +See :ref:`api.dataframe.sparse` for more. SparseIndex objects ------------------- @@ -111,158 +130,115 @@ keeps an arrays of all of the locations where the data are not equal to the fill value. The ``block`` format tracks only the locations and sizes of blocks of data. -.. _sparse.dtype: - -Sparse Dtypes -------------- +.. _sparse.calculation: -Sparse data should have the same dtype as its dense representation. 
Currently, -``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original -dtype, ``fill_value`` default changes: +Sparse Calculation +------------------ -* ``float64``: ``np.nan`` -* ``int64``: ``0`` -* ``bool``: ``False`` +You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result. .. ipython:: python - s = pd.Series([1, np.nan, np.nan]) - s - s.to_sparse() - - s = pd.Series([1, 0, 0]) - s - s.to_sparse() - - s = pd.Series([True, False, True]) - s - s.to_sparse() + arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + np.abs(arr) -You can change the dtype using ``.astype()``, the result is also sparse. Note that -``.astype()`` also affects to the ``fill_value`` to keep its dense representation. +The *ufunc* is also applied to ``fill_value``. This is needed to get +the correct dense result. .. ipython:: python - s = pd.Series([1, 0, 0, 0, 0]) - s - ss = s.to_sparse() - ss - ss.astype(np.float64) - -It raises if any value cannot be coerced to specified dtype. - -.. code-block:: ipython - - In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse() - Out[1]: - 0 1.0 - 1 NaN - 2 NaN - dtype: float64 - BlockIndex - Block locations: array([0], dtype=int32) - Block lengths: array([1], dtype=int32) - - In [2]: ss.astype(np.int64) - Out[2]: - ValueError: unable to coerce current fill_value nan to int64 dtype + arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + np.abs(arr) + np.abs(arr).to_dense() +.. _sparse.migration: +Migrating +--------- -We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse -in the typical "mostly 0". Rather, you can view these objects as being "compressed" -where any data matching a specific value (``NaN`` / missing value, though any value -can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been -"sparsified". This will make much more sense with an example. All of the standard pandas -data structures have a ``to_sparse`` method: +In older versions of pandas, the ``SparseSeries`` and ``SparseDataFrame`` classes (documented below) +were the preferred way to work with sparse data. With the advent of extension arrays, these subclasses +are no longer needed. Their purpose is better served by using a regular Series or DataFrame with +sparse values instead. -.. ipython:: python +**There's no performance or memory penalty to using a Series or DataFrame with sparse values, +rather than a SparseSeries or SparseDataFrame**. - ts = pd.Series(np.random.randn(10)) - ts[2:-2] = np.nan - sts = ts.to_sparse() - sts +This section provides some guidance on migrating your code to the new style. As a reminder, you can +use the python warnings module to control warnings. If you wish to ignore the warnings, -The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see -below) and a ``fill_value``. So if we had a mostly zero ``Series``, we could -convert it to sparse with ``fill_value=0``: +.. code-block:: python -.. ipython:: python + >>> import warnings - ts.fillna(0).to_sparse(fill_value=0) + >>> warnings.filterwarnings('ignore', 'Sparse', FutureWarning + >>> pd.SparseSeries() # No warning message + Series([], dtype: Sparse[float64, nan]) + BlockIndex + Block locations: array([], dtype=int32) + Block lengths: array([], dtype=int32) -The sparse objects exist for memory efficiency reasons. Suppose you had a -large, mostly NA ``DataFrame``: +But we recommend modifying your code, rather than ignoring the warning.l -.. 
ipython:: python +**Construction** - df = pd.DataFrame(np.random.randn(10000, 4)) - df.iloc[:9998] = np.nan - sdf = df.to_sparse() - sdf - sdf.density +From an array-like, use the regular :class:`Series` or +:class:`DataFrame` constructors with :class:`SparseArray` values. -As you can see, the density (% of values that have not been "compressed") is -extremely low. This sparse object takes up much less memory on disk (pickled) -and in the Python interpreter. Functionally, their behavior should be nearly -identical to their dense counterparts. +.. code-block:: python -Any sparse object can be converted back to the standard dense form by calling -``to_dense``: + # Old way + >>> pd.SparseDataFrame({"A": [0, 1]}) .. ipython:: python - sts.to_dense() - -.. _sparse.accessor: + # New way + pd.DataFrame({"A": pd.SparseArray([0, 1])}) -Sparse Accessor ---------------- +From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, -.. versionadded:: 0.24.0 +.. code-block:: python -Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` -for categorical data, and ``.dt`` for datetime-like data. This namespace provides -attributes and methods that are specific to sparse data. + # Old way + df = pd.SparseDataFrame(sp_matrix, columns=['A', 'B', 'C']) .. ipython:: python - s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]") - s.sparse.density - s.sparse.fill_value - -This accessor is available only on data with ``SparseDtype``, and on the :class:`Series` -class itself for creating a Series with sparse data from a scipy COO matrix with. - - -.. versionadded:: 0.25.0 + # New way + from scipy import sparse + mat = sparse.eye(3) + df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C']) + df -A ``.sparse`` accessor has been added for :class:`DataFrame` as well. -See :ref:`api.dataframe.sparse` for more. +**Conversion** +From sparse to dense, use the ``.sparse`` accessors -.. _sparse.calculation: +.. ipython:: python -Sparse Calculation ------------------- + df.sparse.to_dense() + df.sparse.to_coo() + df['A'] -You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result. +From dense to sparse, use :meth:`DataFrame.astype` with a :class:`SparseDtype`. .. ipython:: python - arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) - np.abs(arr) + dense = pd.DataFrame({"A": [1, 0, 0, 1]}) + dtype = pd.SparseDtype(int, fill_value=0) + dense.astype(dtype)['A +**Sparse Properties** -The *ufunc* is also applied to ``fill_value``. This is needed to get -the correct dense result. +Sparse-specific properties, like ``density``, are available on the ``.sparse`` accssor. .. ipython:: python - arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) - np.abs(arr) - np.abs(arr).to_dense() + df.sparse.density + +The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes +have no replacement. .. _sparse.scipysparse: @@ -277,6 +253,7 @@ SparseDataFrame Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices. .. ipython:: python + :okwarning: from scipy.sparse import csr_matrix @@ -373,52 +350,3 @@ row and columns coordinates of the matrix. Note that this will consume a signifi ss_dense -.. _sparse.migration: - -Migrating ---------- - -:class:`SparseArray` is the building block for all of ``Series``, ``SparseSeries``, -``DataFrame``, and ``SparseDataFrame``. To simplify the pandas API and lower maintenance burden, -we've deprecated the ``SparseSeries`` and ``SparseDataFrame`` classes. 
- -**There's no performance or memory penalty to using a Series or DataFrame with sparse values, -rather than a SparseSeries or SparseDataFrame**. - -**Construction** - -Use the regular :class:`Series` or :class:`DataFrame` constructors with :class:`SparseArray` values - -.. ipython:: python - - pd.DataFrame({"A": pd.SparseArray([0, 1])}) - -Or use :meth:`DataFrame.sparse.from_spmatrix` - -.. ipython:: python - - from scipy import sparse - mat = sparse.eye(3) - df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C']) - df - -**Conversion** - -Use the ``.sparse`` accessors - -.. ipython:: python - - df.sparse.to_dense() - df.sparse.to_coo() - df['A'] - -**Sparse Properties** - -Sparse-specific properties, like ``density``, are available on the ``.sparse`` accssor. - -.. ipython:: python - - df.sparse.density - -The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes -have no replacement. From 12a8329e65c2205ee4da784b0ef47adfe3c8edb4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 16:05:43 -0500 Subject: [PATCH 08/22] remove change --- pandas/util/testing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9b301e5b04456..9084ebc736599 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2431,8 +2431,7 @@ class for all warnings. To check that no warning is returned, pass saw_warning = False - if filter_level: - warnings.simplefilter(filter_level) + warnings.simplefilter(filter_level) yield w extra_warnings = [] From 01c7710f0cff260ab49cf8491489c26c98ae2101 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 16:10:13 -0500 Subject: [PATCH 09/22] fixed merge conflict --- pandas/tests/arrays/sparse/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 525656c638865..dbb9131f858e4 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -172,6 +172,7 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value + @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) @pytest.mark.parametrize('size', [ pytest.param(0, marks=pytest.mark.skipif(_np_version_under1p16, From e9b9b295ccccc65871dffa513b5647b14e71bfec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 16:11:43 -0500 Subject: [PATCH 10/22] pickle --- pandas/tests/io/test_pickle.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 6acf54ab73b2d..b115a08d3b0d3 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -196,6 +196,7 @@ def legacy_pickle(request, datapath): # --------------------- # tests # --------------------- +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -206,6 +207,7 @@ def test_pickles(current_pickle_data, legacy_pickle): compare(current_pickle_data, legacy_pickle, version) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_round_trip_current(current_pickle_data): def python_pickler(obj, path): From b295ce1035cb0373f55fbdd4b9461f4a5e6500eb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 22:02:13 -0500 Subject: [PATCH 11/22] fixups --- 
doc/source/user_guide/sparse.rst | 4 ++-- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/sparse/test_indexing.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index f5f49a59d2a4e..9d747767cbc6c 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -172,14 +172,14 @@ use the python warnings module to control warnings. If you wish to ignore the wa >>> import warnings - >>> warnings.filterwarnings('ignore', 'Sparse', FutureWarning + >>> warnings.filterwarnings('ignore', 'Sparse', FutureWarning) >>> pd.SparseSeries() # No warning message Series([], dtype: Sparse[float64, nan]) BlockIndex Block locations: array([], dtype=int32) Block lengths: array([], dtype=int32) -But we recommend modifying your code, rather than ignoring the warning.l +But we recommend modifying your code, rather than ignoring the warning. **Construction** diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c2bc68a2b953c..1e6861c9f686e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -226,6 +226,7 @@ Deprecations - Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`) - The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`) +- The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Use a ``DataFrame`` or ``Series`` with sparse values instead. See :ref:`sparse.migration` for more (:issue:`19239`). .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index f77a58eb16789..21c303fa2a064 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -602,7 +602,7 @@ def test_reindex(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -class TestSparseDataFrameIndexing(object): +class TestSparseDataFrameIndexing: def test_getitem(self): orig = pd.DataFrame([[1, np.nan, np.nan], From ccf71db85f2c85d35961a863509dd11dbfbb2586 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 May 2019 22:06:49 -0500 Subject: [PATCH 12/22] fixups --- doc/source/user_guide/sparse.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 9d747767cbc6c..8655846de3296 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -227,7 +227,7 @@ From dense to sparse, use :meth:`DataFrame.astype` with a :class:`SparseDtype`. dense = pd.DataFrame({"A": [1, 0, 0, 1]}) dtype = pd.SparseDtype(int, fill_value=0) - dense.astype(dtype)['A + dense.astype(dtype) **Sparse Properties** @@ -281,6 +281,7 @@ A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseS The method requires a ``MultiIndex`` with two or more levels. .. ipython:: python + :okwarning: s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), @@ -336,6 +337,7 @@ The default behaviour (with ``dense_index=False``) simply returns a ``SparseSeri only the non-null entries. .. 
ipython:: python + :okwarning: ss = pd.SparseSeries.from_coo(A) ss @@ -345,6 +347,7 @@ row and columns coordinates of the matrix. Note that this will consume a signifi (relative to ``dense_index=False``) if the sparse matrix is large (and sparse) enough. .. ipython:: python + :okwarning: ss_dense = pd.SparseSeries.from_coo(A, dense_index=True) ss_dense From 7e6fbd6371be1077c6815088fef079b69e940895 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 08:23:18 -0500 Subject: [PATCH 13/22] doc lint --- doc/source/user_guide/sparse.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 8655846de3296..b058a315b92ef 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -201,7 +201,9 @@ From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, .. code-block:: python # Old way - df = pd.SparseDataFrame(sp_matrix, columns=['A', 'B', 'C']) + >>> from scipy import sparse + >>> mat = sparse.eye(3) + >>> df = pd.SparseDataFrame(mat, columns=['A', 'B', 'C']) .. ipython:: python From 865f1aaadc1666c7bbdf2e8f95194df4741de425 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 10:23:08 -0500 Subject: [PATCH 14/22] fix pytables --- pandas/tests/io/test_pytables.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 2e274280850ee..8b5907b920cca 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -50,6 +50,7 @@ ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) +ignore_sparse = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") # contextmanager to ensure the file cleanup @@ -2243,7 +2244,7 @@ def test_series(self): self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @ignore_sparse def test_sparse_series(self): s = tm.makeStringSeries() @@ -2260,7 +2261,7 @@ def test_sparse_series(self): self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @ignore_sparse def test_sparse_frame(self): s = tm.makeDataFrame() @@ -2599,6 +2600,7 @@ def test_overwrite_node(self): tm.assert_series_equal(store['a'], ts) + @ignore_sparse def test_sparse_with_compression(self): # GH 2931 @@ -3743,7 +3745,7 @@ def test_start_stop_multiple(self): expected = df.loc[[0], ['foo', 'bar']] tm.assert_frame_equal(result, expected) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @ignore_sparse def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: From 9915c48ac6d0003142ab9f6cd8dc99f9d1eff7db Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 10:23:49 -0500 Subject: [PATCH 15/22] temp set error --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 160784a8b5b65..b232490aa3615 100644 --- a/setup.cfg +++ b/setup.cfg @@ -69,6 +69,8 @@ markers = doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL addopts = --strict-data-files xfail_strict = True +filterwarnings = + error:Sparse:FutureWarning [coverage:run] branch = False From 30f36705a0d4f099a677195560e2e160accdaf96 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 12:06:42 -0500 Subject: [PATCH 16/22] skip doctests --- 
pandas/core/frame.py | 6 +++--- pandas/core/generic.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6ec36c62f0be8..0ffded8b624a0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1920,13 +1920,13 @@ def to_sparse(self, fill_value=None, kind='block'): >>> type(df) - >>> sdf = df.to_sparse() - >>> sdf + >>> sdf = df.to_sparse() # doctest: +SKIP + >>> sdf # doctest: +SKIP 0 1 0 NaN NaN 1 1.0 NaN 2 NaN 1.0 - >>> type(sdf) + >>> type(sdf) # doctest: +SKIP """ from pandas.core.sparse.api import SparseDataFrame diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 876465d96e6fe..7909b59476a58 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5589,7 +5589,7 @@ def ftypes(self): 3 float64:dense dtype: object - >>> pd.SparseDataFrame(arr).ftypes + >>> pd.SparseDataFrame(arr).ftypes # doctest: +SKIP 0 float64:sparse 1 float64:sparse 2 float64:sparse From 706c5dc596d173927bcefda258458f819670c36e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 14:02:13 -0500 Subject: [PATCH 17/22] fixups --- doc/source/user_guide/sparse.rst | 52 ++++++++++++++++++++++++++------ pandas/core/sparse/frame.py | 2 +- pandas/core/sparse/series.py | 2 ++ 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 9da40fc45eaaa..6ced46dda21e4 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -36,12 +36,20 @@ large, mostly NA ``DataFrame``: df = pd.DataFrame(np.random.randn(10000, 4)) df.iloc[:9998] = np.nan sdf = df.astype(pd.SparseDtype("float", np.nan)) - sdf + sdf.head() + sdf.dtypes sdf.sparse.density As you can see, the density (% of values that have not been "compressed") is extremely low. This sparse object takes up much less memory on disk (pickled) -and in the Python interpreter. Functionally, their behavior should be nearly +and in the Python interpreter. + +.. ipython:: python + + print('dense : {:0.2f} bytes'.format(df.memory_usage().sum() / 1e3)) + print('sparse: {:0.2f} bytes'.format(sdf.memory_usage().sum() / 1e3)) + +Functionally, their behavior should be nearly identical to their dense counterparts. .. _sparse.array: @@ -73,6 +81,12 @@ The :attr:`SparseArray.dtype` property stores two pieces of information 1. The dtype of the non-sparse values 2. The scalar fill value + +.. ipython:: python + + sparr.dtype + + A :class:`SparseDtype` may be constructed by passing each of these .. ipython:: python @@ -118,7 +132,7 @@ class itself for creating a Series with sparse data from a scipy COO matrix with .. versionadded:: 0.25.0 A ``.sparse`` accessor has been added for :class:`DataFrame` as well. -See :ref:`api.dataframe.sparse` for more. +See :ref:`api.frame.sparse` for more. .. _sparse.calculation: @@ -160,11 +174,6 @@ This section provides some guidance on migrating your code to the new style. As you can use the python warnings module to control warnings. But we recommend modifying your code, rather than ignoring the warning. -**General Differences** - -In a SparseDataFrame, *all* columns were sparse. A :class:`DataFrame` can have a mixture of -sparse and dense columns. 
- **Construction** From an array-like, use the regular :class:`Series` or @@ -195,7 +204,7 @@ From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, from scipy import sparse mat = sparse.eye(3) df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C']) - df + df.dtypes **Conversion** @@ -205,7 +214,6 @@ From sparse to dense, use the ``.sparse`` accessors df.sparse.to_dense() df.sparse.to_coo() - df['A'] From dense to sparse, use :meth:`DataFrame.astype` with a :class:`SparseDtype`. @@ -223,6 +231,30 @@ Sparse-specific properties, like ``density``, are available on the ``.sparse`` a df.sparse.density +**General Differences** + +In a SparseDataFrame, *all* columns were sparse. A :class:`DataFrame` can have a mixture of +sparse and dense columns. As a consequence, assigning new columns to a DataFrame with sparse +values will not automatically convert the input to be sparse. + +.. code-block:: + + # Previous Way + df = pd.SparseDataFrame({"A": [0, 1]}) + df['B'] = [0, 0] # implicitly becomes Sparse + df['B'].dtype + Sparse[int64, nan] + +Instead, you'll need to ensure that the values being assigned are sparse + +.. ipython:: python + + df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) + df['B'] = [0, 0] # remains dense + df['B'].dtype + df['B'] = pd.SparseArray([0, 0]) + df['B'].dtype + The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes have no replacement. diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 167df5c4198e6..47a10d2259acc 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -30,7 +30,7 @@ _shared_doc_kwargs = dict(klass='SparseDataFrame') depr_msg = """\ SparseDataFrame is deprecated and will be removed in a future version. -Use a DataFrame with sparse values instead. +Use a regular DataFrame whose columns are SparseArrays instead. See http://pandas.pydata.org/pandas-docs/stable/\ user_guide/sparse.html#migrating for more. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index e1d451465fd18..b486b6d3f4460 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -36,6 +36,8 @@ SparseSeries is deprecated and will be removed in a future version. Use a Series with sparse values instead. + >>> series = pd.Series(pd.SparseArray(...)) + See http://pandas.pydata.org/pandas-docs/stable/\ user_guide/sparse.html#migrating for more. """ From 13d30d2c25342f62ad7047b598fbb23ea00644f2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 15:31:40 -0500 Subject: [PATCH 18/22] fixup --- doc/source/user_guide/sparse.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 6ced46dda21e4..26aa6838eeed5 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -237,12 +237,12 @@ In a SparseDataFrame, *all* columns were sparse. A :class:`DataFrame` can have a sparse and dense columns. As a consequence, assigning new columns to a DataFrame with sparse values will not automatically convert the input to be sparse. -.. code-block:: +.. 
code-block:: python # Previous Way - df = pd.SparseDataFrame({"A": [0, 1]}) - df['B'] = [0, 0] # implicitly becomes Sparse - df['B'].dtype + >>> df = pd.SparseDataFrame({"A": [0, 1]}) + >>> df['B'] = [0, 0] # implicitly becomes Sparse + >>> df['B'].dtype Sparse[int64, nan] Instead, you'll need to ensure that the values being assigned are sparse From c5fa3fb984dd3d999c7909f5d3bd8ff8ec521533 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 16:15:26 -0500 Subject: [PATCH 19/22] updates --- doc/source/user_guide/sparse.rst | 58 ++++++++++----------- pandas/core/arrays/sparse.py | 3 +- pandas/core/series.py | 1 - pandas/core/sparse/scipy_sparse.py | 9 +++- pandas/tests/arrays/sparse/test_accessor.py | 18 +++++++ 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 26aa6838eeed5..5b3c4f09bab8a 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -263,15 +263,11 @@ have no replacement. Interaction with scipy.sparse ----------------------------- -SparseDataFrame -~~~~~~~~~~~~~~~ +Use :meth:`DataFrame.sparse.from_coo` to create a ``DataFrame`` with sparse values from a sparse matrix. -.. versionadded:: 0.20.0 - -Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices. +.. versionadded:: 0.25.0 .. ipython:: python - :okwarning: from scipy.sparse import csr_matrix @@ -281,25 +277,22 @@ Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matric sp_arr = csr_matrix(arr) sp_arr - sdf = pd.SparseDataFrame(sp_arr) - sdf + sdf = pd.DataFrame.sparse.from_spmatrix(sp_arr) + sdf.head() + sdf.dtypes All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. -To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method: +To convert back to sparse SciPy matrix in COO format, you can use the :meth:`DataFrame.sparse.to_coo` method: .. ipython:: python - sdf.to_coo() + sdf.sparse.to_coo() -SparseSeries -~~~~~~~~~~~~ - -A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. +:meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sparse values indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. The method requires a ``MultiIndex`` with two or more levels. .. ipython:: python - :okwarning: s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), @@ -309,19 +302,17 @@ The method requires a ``MultiIndex`` with two or more levels. (2, 1, 'b', 0), (2, 1, 'b', 1)], names=['A', 'B', 'C', 'D']) - s - # SparseSeries - ss = s.to_sparse() + ss = s.astype('Sparse') ss -In the example below, we transform the ``SparseSeries`` to a sparse representation of a 2-d array by specifying that the first and second ``MultiIndex`` levels define labels for the rows and the third and fourth levels define labels for the columns. We also specify that the column and row labels should be sorted in the final sparse representation. +In the example below, we transform the ``Series`` to a sparse representation of a 2-d array by specifying that the first and second ``MultiIndex`` levels define labels for the rows and the third and fourth levels define labels for the columns. 
We also specify that the column and row labels should be sorted in the final sparse representation. .. ipython:: python - A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) + A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], + column_levels=['C', 'D'], + sort_labels=True) A A.todense() @@ -332,16 +323,16 @@ Specifying different row and column labels (and not sorting them) yields a diffe .. ipython:: python - A, rows, columns = ss.to_coo(row_levels=['A', 'B', 'C'], - column_levels=['D'], - sort_labels=False) + A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B', 'C'], + column_levels=['D'], + sort_labels=False) A A.todense() rows columns -A convenience method :meth:`SparseSeries.from_coo` is implemented for creating a ``SparseSeries`` from a ``scipy.sparse.coo_matrix``. +A convenience method :meth:`Series.sparse.from_coo` is implemented for creating a ``Series`` with sparse values from a ``scipy.sparse.coo_matrix``. .. ipython:: python @@ -351,13 +342,12 @@ A convenience method :meth:`SparseSeries.from_coo` is implemented for creating a A A.todense() -The default behaviour (with ``dense_index=False``) simply returns a ``SparseSeries`` containing +The default behaviour (with ``dense_index=False``) simply returns a ``Series`` containing only the non-null entries. .. ipython:: python - :okwarning: - ss = pd.SparseSeries.from_coo(A) + ss = pd.Series.sparse.from_coo(A) ss Specifying ``dense_index=True`` will result in an index that is the Cartesian product of the @@ -365,9 +355,15 @@ row and columns coordinates of the matrix. Note that this will consume a signifi (relative to ``dense_index=False``) if the sparse matrix is large (and sparse) enough. .. ipython:: python - :okwarning: - ss_dense = pd.SparseSeries.from_coo(A, dense_index=True) + ss_dense = pd.Series.sparse.from_coo(A, dense_index=True) ss_dense +.. _sparse.subclasses: + +Sparse Subclasses +----------------- + +The :class:`SparseSeries` and :class:`SparseDataFrame` classes are deprecated. Visit their +API pages for usage. diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 66ccd6848864d..aa4d888a33664 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2033,7 +2033,8 @@ def from_coo(cls, A, dense_index=False): from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series - result = _coo_to_sparse_series(A, dense_index=dense_index) + result = _coo_to_sparse_series(A, dense_index=dense_index, + sparse_series=False) # SparseSeries -> Series[sparse] result = Series(result.values, index=result.index, copy=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index f0b674596656a..e752901ce7cb9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1589,7 +1589,6 @@ def to_sparse(self, kind='block', fill_value=None): SparseSeries Sparse representation of the Series. 
""" - # TODO: deprecate from pandas.core.sparse.series import SparseSeries values = SparseArray(self, kind=kind, fill_value=fill_value) diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 40b4452caa8dc..910d7cd099d4c 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -116,14 +116,19 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index=False): +def _coo_to_sparse_series(A, dense_index=False, sparse_series=True): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. Use the defaults given in the SparseSeries constructor. """ + from pandas import SparseDtype + s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) s = s.sort_index() - s = s.to_sparse() # TODO: specify kind? + if sparse_series: + s = s.to_sparse() # TODO: specify kind? + else: + s = s.astype(SparseDtype(s.dtype)) if dense_index: # is there a better constructor method to use here? i = range(A.shape[0]) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 676f578dd2acc..370d222c1ab4e 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -101,3 +101,21 @@ def test_density(self): res = df.sparse.density expected = 0.75 assert res == expected + + @pytest.mark.parametrize("dtype", ['int64', 'float64']) + @pytest.mark.parametrize("dense_index", [True, False]) + @td.skip_if_no_scipy + def test_series_from_coo(self, dtype, dense_index): + import scipy.sparse + + A = scipy.sparse.eye(3, format='coo', dtype=dtype) + result = pd.Series.sparse.from_coo(A, dense_index=dense_index) + index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) + expected = pd.Series(pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), + index=index) + if dense_index: + expected = expected.reindex( + pd.MultiIndex.from_product(index.levels) + ) + + tm.assert_series_equal(result, expected) From b76745f2b600881814f0fdda96cfc0db67515a76 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 14:07:00 -0500 Subject: [PATCH 20/22] fixups --- pandas/core/arrays/sparse.py | 3 +-- pandas/core/sparse/scipy_sparse.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index e2fb99d643734..b95f8a75f906c 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2035,8 +2035,7 @@ def from_coo(cls, A, dense_index=False): result = _coo_to_sparse_series(A, dense_index=dense_index, sparse_series=False) - # SparseSeries -> Series[sparse] - result = Series(result.values, index=result.index, copy=False) + result = Series(result.array, index=result.index, copy=False) return result diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 910d7cd099d4c..df6ebce055b25 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -116,10 +116,16 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index=False, sparse_series=True): +def _coo_to_sparse_series(A, dense_index: bool = False, + sparse_series: bool = True): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. - Use the defaults given in the SparseSeries constructor. 
+ + Parameters + ---------- + A : scipy.sparse.coo.coo_matrix + dense_index : bool, default False + sparse_series : bool, default True """ from pandas import SparseDtype From f15340068dcea25431d61ca76f824ae3bddd7b95 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 14:08:46 -0500 Subject: [PATCH 21/22] return --- pandas/core/sparse/scipy_sparse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index df6ebce055b25..deafd10063fe9 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -126,6 +126,10 @@ def _coo_to_sparse_series(A, dense_index: bool = False, A : scipy.sparse.coo.coo_matrix dense_index : bool, default False sparse_series : bool, default True + + Returns + ------- + Series or SparseSeries """ from pandas import SparseDtype From 1903f673221ab914165240d5c0a8c4b149de8a6c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 28 May 2019 09:08:55 -0500 Subject: [PATCH 22/22] fixups --- doc/source/user_guide/sparse.rst | 8 ++++---- pandas/core/sparse/scipy_sparse.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index eeff2f603d8c9..8fed29d7a6316 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -231,7 +231,7 @@ From dense to sparse, use :meth:`DataFrame.astype` with a :class:`SparseDtype`. **Sparse Properties** -Sparse-specific properties, like ``density``, are available on the ``.sparse`` accssor. +Sparse-specific properties, like ``density``, are available on the ``.sparse`` accessor. .. ipython:: python @@ -239,8 +239,8 @@ Sparse-specific properties, like ``density``, are available on the ``.sparse`` a **General Differences** -In a SparseDataFrame, *all* columns were sparse. A :class:`DataFrame` can have a mixture of -sparse and dense columns. As a consequence, assigning new columns to a DataFrame with sparse +In a ``SparseDataFrame``, *all* columns were sparse. A :class:`DataFrame` can have a mixture of +sparse and dense columns. As a consequence, assigning new columns to a ``DataFrame`` with sparse values will not automatically convert the input to be sparse. .. code-block:: python @@ -294,7 +294,7 @@ To convert back to sparse SciPy matrix in COO format, you can use the :meth:`Dat sdf.sparse.to_coo() -:meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sparse values indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. +meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sparse values indexed by a :class:`MultiIndex` to a :class:`scipy.sparse.coo_matrix`. The method requires a ``MultiIndex`` with two or more levels. diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index deafd10063fe9..7630983421ff9 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -136,6 +136,9 @@ def _coo_to_sparse_series(A, dense_index: bool = False, s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) s = s.sort_index() if sparse_series: + # TODO(SparseSeries): remove this and the sparse_series keyword. + # This is just here to avoid a DeprecationWarning when + # _coo_to_sparse_series is called via Series.sparse.from_coo s = s.to_sparse() # TODO: specify kind? else: s = s.astype(SparseDtype(s.dtype))
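The patch series above replaces the deprecated ``SparseSeries``/``SparseDataFrame`` classes with regular ``Series``/``DataFrame`` objects holding sparse values, reached through the ``.sparse`` accessor. The sketch below strings together only the APIs documented in the patched ``doc/source/user_guide/sparse.rst`` (``DataFrame.sparse.from_spmatrix``, ``SparseDtype``, ``.sparse.to_dense``, ``.sparse.to_coo``, ``Series.sparse.from_coo``); it assumes pandas 0.25 or later with SciPy installed, and the variable names are illustrative rather than part of the patches.

.. code-block:: python

    import pandas as pd
    from scipy import sparse

    # Old way (deprecated): pd.SparseDataFrame(mat, columns=['A', 'B', 'C'])
    mat = sparse.eye(3)
    df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C'])
    df.dtypes           # each column is a Sparse[float64, 0.0] extension dtype
    df.sparse.density   # 3 stored values out of 9 -> ~0.33

    # Dense -> sparse: astype with a SparseDtype instead of .to_sparse()
    dense = pd.DataFrame({'A': [1, 0, 0, 1]})
    sdf = dense.astype(pd.SparseDtype(int, fill_value=0))

    # Sparse -> dense values, or back out to a SciPy COO matrix
    sdf.sparse.to_dense()
    coo = df.sparse.to_coo()

    # Series round-trip through scipy.sparse.coo_matrix; this exercises the
    # _coo_to_sparse_series(..., sparse_series=False) path added above
    ss = pd.Series.sparse.from_coo(coo)
    ss.dtype            # Sparse[float64, nan]

The final round-trip corresponds to the ``sparse_series=False`` branch added to ``pandas/core/sparse/scipy_sparse.py`` in the last few patches, which avoids the deprecated ``to_sparse()`` call when ``Series.sparse.from_coo`` is used.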