diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index cd3e6594f1d1e..dfa475684c834 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -311,6 +311,29 @@ specific plotting methods of the form ``DataFrame.plot.``. DataFrame.boxplot DataFrame.hist + +.. _api.frame.sparse: + +Sparse Accessor +~~~~~~~~~~~~~~~ + +Sparse-dtype specific methods and attributes are provided under the +``DataFrame.sparse`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + DataFrame.sparse.density + +.. autosummary:: + :toctree: api/ + + DataFrame.sparse.from_spmatrix + DataFrame.sparse.to_coo + DataFrame.sparse.to_dense + + Serialization / IO / Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index dacd433f112a5..39845e7a7e2e1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -35,6 +35,7 @@ Other Enhancements - :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. 
``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) +- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9081c81d0a453..66ccd6848864d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -697,6 +697,55 @@ def _simple_new( new._dtype = dtype return new + @classmethod + def from_spmatrix(cls, data): + """ + Create a SparseArray from a scipy.sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.spmatrix + This should be a SciPy sparse matrix where the size + of the second dimension is 1. In other words, a + sparse matrix with a single column. + + Returns + ------- + SparseArray + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.coo_matrix((4, 1)) + >>> pd.SparseArray.from_spmatrix(mat) + [0.0, 0.0, 0.0, 0.0] + Fill: 0.0 + IntIndex + Indices: array([], dtype=int32) + """ + length, ncol = data.shape + + if ncol != 1: + raise ValueError( + "'data' must have a single column, not '{}'".format(ncol) + ) + + # our sparse index classes require that the positions be strictly + # increasing.
So we need to sort loc, and arr accordingly. + arr = data.data + idx, _ = data.nonzero() + loc = np.argsort(idx) + arr = arr.take(loc) + idx.sort() + + zero = np.array(0, dtype=arr.dtype).item() + dtype = SparseDtype(arr.dtype, zero) + index = IntIndex(length, idx) + + return cls._simple_new(arr, index, dtype) + def __array__(self, dtype=None, copy=True): fill_value = self.fill_value @@ -1906,27 +1955,32 @@ def _make_index(length, indices, kind): # ---------------------------------------------------------------------------- # Accessor + +class BaseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + @delegate_names(SparseArray, ['npoints', 'density', 'fill_value', 'sp_values'], typ='property') -class SparseAccessor(PandasDelegate): +class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for SparseSparse from other sparse matrix data types. """ - def __init__(self, data=None): - self._validate(data) - # Store the Series since we need that for to_coo - self._parent = data - - @staticmethod - def _validate(data): + def _validate(self, data): if not isinstance(data.dtype, SparseDtype): - msg = "Can only use the '.sparse' accessor with Sparse data." - raise AttributeError(msg) + raise AttributeError(self._validation_msg) def _delegate_property_get(self, name, *args, **kwargs): - return getattr(self._parent.values, name) + return getattr(self._parent.array, name) def _delegate_method(self, name, *args, **kwargs): if name == 'from_coo': @@ -2040,3 +2094,190 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): column_levels, sort_labels=sort_labels) return A, rows, columns + + def to_dense(self): + """ + Convert a Series from sparse values to dense. + + .. 
versionadded:: 0.25.0 + + Returns + ------- + Series + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ + from pandas import Series + return Series(self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name) + + +class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + .. versionadded:: 0.25.0 + """ + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + """ + Create a new DataFrame from a scipy sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + Each column of the DataFrame is stored as a + :class:`SparseArray`. + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + sparrays = [ + SparseArray.from_spmatrix(data[:, i]) + for i in range(data.shape[1]) + ] + data = dict(enumerate(sparrays)) + result = DataFrame(data, index=index) + result.columns = columns + return result + + def to_dense(self): + """ + Convert a DataFrame with sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + DataFrame + A DataFrame with the same values stored as dense arrays.
+ + Examples + -------- + >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() + for k, v in self._parent.items()} + return DataFrame(data, + index=self._parent.index, + columns=self._parent.columns) + + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.25.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + uint64 will result in a float64 dtype. + """ + try: + from scipy.sparse import coo_matrix + except ImportError: + raise ImportError('Scipy is not installed') + + dtype = find_common_type(self._parent.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, datas = [], [], [] + for col, name in enumerate(self._parent): + s = self._parent[name] + row = s.array.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.array.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) + + @property + def density(self) -> float: + """ + Ratio of non-sparse points to total (dense) data points + represented in the DataFrame.
+ """ + return np.mean([column.array.density + for _, column in self._parent.items()]) + + @staticmethod + def _prep_index(data, index, columns): + import pandas.core.indexes.base as ibase + + N, K = data.shape + if index is None: + index = ibase.default_index(N) + if columns is None: + columns = ibase.default_index(K) + + if len(columns) != K: + raise ValueError('Column length mismatch: {columns} vs. {K}' + .format(columns=len(columns), K=K)) + if len(index) != N: + raise ValueError('Index length mismatch: {index} vs. {N}' + .format(index=len(index), N=N)) + return index, columns diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 63daae83916c2..6ec36c62f0be8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -33,6 +33,7 @@ from pandas.compat import PY36, raise_with_traceback from pandas.compat.numpy import function as nv +from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -8027,6 +8028,7 @@ def isin(self, values): plot = CachedAccessor("plot", gfx.FramePlotMethods) hist = gfx.hist_frame boxplot = gfx.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 3adeefd6ffd4e..79b3a622ad72e 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -10,17 +10,16 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import find_common_type, maybe_upcast +from pandas.core.dtypes.cast import maybe_upcast from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseArray, SparseFrameAccessor import 
pandas.core.common as com from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index -import pandas.core.indexes.base as ibase from pandas.core.internals import ( BlockManager, create_block_manager_from_arrays) from pandas.core.internals.construction import extract_index, prep_ndarray @@ -191,7 +190,7 @@ def _init_matrix(self, data, index, columns, dtype=None): Init self from ndarray or list of lists. """ data = prep_ndarray(data, copy=False) - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) @@ -200,7 +199,7 @@ def _init_spmatrix(self, data, index, columns, dtype=None, """ Init self from scipy.sparse matrix. """ - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = data.tocoo() N = len(index) @@ -227,64 +226,9 @@ def _init_spmatrix(self, data, index, columns, dtype=None, return self._init_dict(sdict, index, columns, dtype) - def _prep_index(self, data, index, columns): - N, K = data.shape - if index is None: - index = ibase.default_index(N) - if columns is None: - columns = ibase.default_index(K) - - if len(columns) != K: - raise ValueError('Column length mismatch: {columns} vs. {K}' - .format(columns=len(columns), K=K)) - if len(index) != N: - raise ValueError('Index length mismatch: {index} vs. {N}' - .format(index=len(index), N=N)) - return index, columns - + @Appender(SparseFrameAccessor.to_coo.__doc__) def to_coo(self): - """ - Return the contents of the frame as a sparse SciPy COO matrix. - - .. versionadded:: 0.20.0 - - Returns - ------- - coo_matrix : scipy.sparse.spmatrix - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. 
- - Notes - ----- - The dtype will be the lowest-common-denominator type (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. By numpy.find_common_type convention, mixing int64 and - and uint64 will result in a float64 dtype. - """ - try: - from scipy.sparse import coo_matrix - except ImportError: - raise ImportError('Scipy is not installed') - - dtype = find_common_type(self.dtypes) - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - - cols, rows, datas = [], [], [] - for col, name in enumerate(self): - s = self[name] - row = s.sp_index.to_int_index().indices - cols.append(np.repeat(col, len(row))) - rows.append(row) - datas.append(s.sp_values.astype(dtype, copy=False)) - - cols = np.concatenate(cols) - rows = np.concatenate(rows) - datas = np.concatenate(datas) - return coo_matrix((datas, (rows, cols)), shape=self.shape) + return SparseFrameAccessor(self).to_coo() def __array_wrap__(self, result): return self._constructor( @@ -325,16 +269,9 @@ def _unpickle_sparse_frame_compat(self, state): self._default_fill_value = fv self._default_kind = kind + @Appender(SparseFrameAccessor.to_dense.__doc__) def to_dense(self): - """ - Convert to dense DataFrame - - Returns - ------- - df : DataFrame - """ - data = {k: v.to_dense() for k, v in self.items()} - return DataFrame(data, index=self.index, columns=self.columns) + return SparseFrameAccessor(self).to_dense() def _apply_columns(self, func): """ diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py new file mode 100644 index 0000000000000..676f578dd2acc --- /dev/null +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -0,0 +1,103 @@ +import string + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas.util.testing as tm + + +class 
TestSeriesAccessor: + # TODO: collect other Series accessor tests + def test_to_dense(self): + s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') + result = s.sparse.to_dense() + expected = pd.Series([0, 1, 0, 10]) + tm.assert_series_equal(result, expected) + + +class TestFrameAccessor: + + def test_accessor_raises(self): + df = pd.DataFrame({"A": [0, 1]}) + with pytest.raises(AttributeError, match='sparse'): + df.sparse + + @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) + @pytest.mark.parametrize("labels", [ + None, + list(string.ascii_letters[:10]), + ]) + @pytest.mark.parametrize('dtype', ['float64', 'int64']) + @td.skip_if_no_scipy + def test_from_spmatrix(self, format, labels, dtype): + import scipy.sparse + + sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + + mat = scipy.sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix( + mat, index=labels, columns=labels + ) + expected = pd.DataFrame( + np.eye(10, dtype=dtype), + index=labels, + columns=labels, + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("columns", [ + ['a', 'b'], + pd.MultiIndex.from_product([['A'], ['a', 'b']]), + ['a', 'a'], + ]) + @td.skip_if_no_scipy + def test_from_spmatrix_columns(self, columns): + import scipy.sparse + + dtype = pd.SparseDtype('float64', 0.0) + + mat = scipy.sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) + expected = pd.DataFrame( + mat.toarray(), columns=columns + ).astype(dtype) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + def test_to_coo(self): + import scipy.sparse + + df = pd.DataFrame({ + "A": [0, 1, 0], + "B": [1, 0, 0], + }, dtype='Sparse[int64, 0]') + result = df.sparse.to_coo() + expected = scipy.sparse.coo_matrix(np.asarray(df)) + assert (result != expected).nnz == 0 + + def test_to_dense(self): + df = pd.DataFrame({ + "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)), 
+ "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)), + "C": pd.SparseArray([1., 0.], + dtype=pd.SparseDtype('float64', 0.0)), + }, index=['b', 'a']) + result = df.sparse.to_dense() + expected = pd.DataFrame({ + 'A': [1, 0], + 'B': [1, 0], + 'C': [1.0, 0.0], + }, index=['b', 'a']) + tm.assert_frame_equal(result, expected) + + def test_density(self): + df = pd.DataFrame({ + 'A': pd.SparseArray([1, 0, 2, 1], fill_value=0), + 'B': pd.SparseArray([0, 1, 1, 1], fill_value=0), + }) + res = df.sparse.density + expected = 0.75 + assert res == expected diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 2fb675ea74fa8..e09f4e2ccc59c 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -6,6 +6,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import _np_version_under1p16 import pandas.util._test_decorators as td import pandas as pd @@ -171,6 +172,33 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value + @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) + @pytest.mark.parametrize('size', [ + pytest.param(0, + marks=pytest.mark.skipif(_np_version_under1p16, + reason='NumPy-11383')), + 10 + ]) + @td.skip_if_no_scipy + def test_from_spmatrix(self, size, format): + import scipy.sparse + + mat = scipy.sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + @td.skip_if_no_scipy + def test_from_spmatrix_raises(self): + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format='csc') + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + @pytest.mark.parametrize('scalar,dtype', [ (False, SparseDtype(bool, False)), (0.0, SparseDtype('float64', 0)), @@ -1084,27 +1112,29 @@ def 
test_get_attributes(self, attr): expected = getattr(arr, attr) assert result == expected + @td.skip_if_no_scipy def test_from_coo(self): - sparse = pytest.importorskip("scipy.sparse") + import scipy.sparse row = [0, 3, 1, 0] col = [0, 3, 1, 2] data = [4, 5, 7, 9] - sp_array = sparse.coo_matrix((data, (row, col))) + sp_array = scipy.sparse.coo_matrix((data, (row, col))) result = pd.Series.sparse.from_coo(sp_array) index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]') tm.assert_series_equal(result, expected) + @td.skip_if_no_scipy def test_to_coo(self): - sparse = pytest.importorskip("scipy.sparse") + import scipy.sparse ser = pd.Series([1, 2, 3], index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=['a', 'b']), dtype='Sparse[int]') A, _, _ = ser.sparse.to_coo() - assert isinstance(A, sparse.coo.coo_matrix) + assert isinstance(A, scipy.sparse.coo.coo_matrix) def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3])