API: DataFrame.sparse accessor

TomAugspurger · TomAugspurger · commit 22d316c7bf78 · 2019-03-11T22:05:46.000-05:00
Closes pandas-dev#25681
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
    DataFrame.boxplot
    DataFrame.hist
 
+
+.. _api.frame.sparse:
+
+Sparse Accessor
+~~~~~~~~~~~~~~~
+
+Sparse-dtype specific methods and attributes are provided under the
+``DataFrame.sparse`` accessor.
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/accessor_attribute.rst
+
+   DataFrame.sparse.density
+
+.. autosummary::
+   :toctree: api/
+
+   DataFrame.sparse.from_spmatrix
+   DataFrame.sparse.to_coo
+   DataFrame.sparse.to_dense
+
+
 Serialization / IO / Conversion
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autosummary::
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -26,6 +26,7 @@ Other Enhancements
 - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
 - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
 - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
+- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
         new._dtype = dtype
         return new
 
+    @classmethod
+    def from_spmatrix(cls, data):
+        """
+        Create a SparseArray from a scipy.sparse matrix.
+
+        Parameters
+        ----------
+        data : scipy.sparse.spmatrix
+            This should be a 2-D SciPy sparse matrix where the size
+            of the second dimension is 1. In other words, a
+            sparse matrix with a single column.
+
+        Returns
+        -------
+        SparseArray
+        """
+        assert data.ndim == 2
+
+        length, ncol = data.shape
+
+        assert ncol == 1
+
+        arr = data.data
+        idx, _ = data.nonzero()
+        zero = np.array(0, dtype=arr.dtype).item()
+        dtype = SparseDtype(arr.dtype, zero)
+        index = IntIndex(length, idx)
+
+        return cls._simple_new(arr, index, dtype)
+
     def __array__(self, dtype=None, copy=True):
         fill_value = self.fill_value
 
@@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind):
 # ----------------------------------------------------------------------------
 # Accessor
 
+_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
+
+
 @delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
                               'sp_values'],
                 typ='property')
@@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate):
     """
 
     def __init__(self, data=None):
-        self._validate(data)
         # Store the Series since we need that for to_coo
         self._parent = data
+        self._validate(data)
 
-    @staticmethod
-    def _validate(data):
+    def _validate(self, data):
         if not isinstance(data.dtype, SparseDtype):
-            msg = "Can only use the '.sparse' accessor with Sparse data."
-            raise AttributeError(msg)
+            raise AttributeError(_validation_msg)
 
     def _delegate_property_get(self, name, *args, **kwargs):
         return getattr(self._parent.values, name)
@@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
                                                  column_levels,
                                                  sort_labels=sort_labels)
         return A, rows, columns
+
+    def to_dense(self):
+        from pandas import Series
+        return Series(self._parent.array.to_dense(),
+                      index=self._parent.index,
+                      name=self._parent.name)
+
+
+class SparseFrameAccessor(PandasDelegate):
+
+    def __init__(self, data=None):
+        # Store the DataFrame since the accessor methods operate on it
+        self._parent = data
+        self._validate(data)
+
+    def _validate(self, data):
+        dtypes = data.dtypes
+        if not all(isinstance(t, SparseDtype) for t in dtypes):
+            raise AttributeError(_validation_msg)
+
+    @classmethod
+    def from_spmatrix(cls, data, index=None, columns=None):
+        """
+        Create a new DataFrame from a scipy sparse matrix.
+
+        Parameters
+        ----------
+        data : scipy.sparse.spmatrix
+            Must be convertible to csc format.
+        index, columns : Index, optional
+            Row and column labels to use for the resulting DataFrame.
+            Defaults to a RangeIndex.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import scipy.sparse
+        >>> mat = scipy.sparse.eye(3)
+        >>> pd.DataFrame.sparse.from_spmatrix(mat)
+             0    1    2
+        0  1.0  0.0  0.0
+        1  0.0  1.0  0.0
+        2  0.0  0.0  1.0
+        """
+        from pandas import DataFrame
+
+        data = data.tocsc()
+        index, columns = cls._prep_index(data, index, columns)
+        sparrays = [
+            SparseArray.from_spmatrix(data[:, i])
+            for i in range(data.shape[1])
+        ]
+        data = dict(zip(columns, sparrays))
+        return DataFrame(data, index=index)
+
+    def to_dense(self):
+        """
+        Convert a DataFrame with sparse values to dense.
+
+        Returns
+        -------
+        df : DataFrame
+        """
+        from pandas import DataFrame
+
+        data = {k: v.array.to_dense()
+                for k, v in compat.iteritems(self._parent)}
+        return DataFrame(data,
+                         index=self._parent.index,
+                         columns=self._parent.columns)
+
+    def to_coo(self):
+        try:
+            from scipy.sparse import coo_matrix
+        except ImportError:
+            raise ImportError('Scipy is not installed')
+
+        dtype = find_common_type(self._parent.dtypes)
+        if isinstance(dtype, SparseDtype):
+            dtype = dtype.subtype
+
+        cols, rows, datas = [], [], []
+        for col, name in enumerate(self._parent):
+            s = self._parent[name]
+            row = s.array.sp_index.to_int_index().indices
+            cols.append(np.repeat(col, len(row)))
+            rows.append(row)
+            datas.append(s.array.sp_values.astype(dtype, copy=False))
+
+        cols = np.concatenate(cols)
+        rows = np.concatenate(rows)
+        datas = np.concatenate(datas)
+        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)
+
+    @property
+    def density(self):
+        """
+        Ratio of non-sparse points to total (dense) data points
+        represented in the DataFrame.
+        """
+        return np.mean([column.array.density
+                        for _, column in self._parent.iteritems()])
+
+    @staticmethod
+    def _prep_index(data, index, columns):
+        import pandas.core.indexes.base as ibase
+
+        N, K = data.shape
+        if index is None:
+            index = ibase.default_index(N)
+        if columns is None:
+            columns = ibase.default_index(K)
+
+        if len(columns) != K:
+            raise ValueError('Column length mismatch: {columns} vs. {K}'
+                             .format(columns=len(columns), K=K))
+        if len(index) != N:
+            raise ValueError('Index length mismatch: {index} vs. {N}'
+                             .format(index=len(index), N=N))
+        return index, columns
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -36,6 +36,7 @@
                            PY36, raise_with_traceback, Iterator,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
+from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
@@ -8009,6 +8010,7 @@ def isin(self, values):
     plot = CachedAccessor("plot", gfx.FramePlotMethods)
     hist = gfx.hist_frame
     boxplot = gfx.boxplot_frame
+    sparse = CachedAccessor("sparse", SparseFrameAccessor)
 
 
 DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -14,12 +14,12 @@
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import Appender
 
-from pandas.core.dtypes.cast import find_common_type, maybe_upcast
+from pandas.core.dtypes.cast import maybe_upcast
 from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas.core.algorithms as algos
-from pandas.core.arrays.sparse import SparseArray, SparseDtype
+from pandas.core.arrays.sparse import SparseArray
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
 import pandas.core.generic as generic
@@ -271,27 +271,8 @@ def to_coo(self):
         float32. By numpy.find_common_type convention, mixing int64 and
         and uint64 will result in a float64 dtype.
         """
-        try:
-            from scipy.sparse import coo_matrix
-        except ImportError:
-            raise ImportError('Scipy is not installed')
-
-        dtype = find_common_type(self.dtypes)
-        if isinstance(dtype, SparseDtype):
-            dtype = dtype.subtype
-
-        cols, rows, datas = [], [], []
-        for col, name in enumerate(self):
-            s = self[name]
-            row = s.sp_index.to_int_index().indices
-            cols.append(np.repeat(col, len(row)))
-            rows.append(row)
-            datas.append(s.sp_values.astype(dtype, copy=False))
-
-        cols = np.concatenate(cols)
-        rows = np.concatenate(rows)
-        datas = np.concatenate(datas)
-        return coo_matrix((datas, (rows, cols)), shape=self.shape)
+        from pandas.core.arrays.sparse import SparseFrameAccessor
+        return SparseFrameAccessor(self).to_coo()
 
     def __array_wrap__(self, result):
         return self._constructor(
diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py
@@ -0,0 +1,76 @@
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestSeriesAccessor(object):
+    # TODO: collect other accessor tests
+    def test_to_dense(self):
+        s = pd.Series([0, 1, 0, 10], dtype='Sparse[int]')
+        result = s.sparse.to_dense()
+        expected = pd.Series([0, 1, 0, 10])
+        tm.assert_series_equal(result, expected)
+
+
+class TestFrameAccessor(object):
+    @pytest.mark.parametrize('format', ['csc', 'csr', 'coo'])
+    @pytest.mark.parametrize("labels", [
+        None,
+        list(string.ascii_letters[:10]),
+    ])
+    @pytest.mark.parametrize('dtype', ['float64', 'int64'])
+    def test_from_spmatrix(self, format, labels, dtype):
+        pytest.importorskip("scipy")
+        import scipy.sparse
+        sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item())
+
+        mat = scipy.sparse.eye(10, format=format, dtype=dtype)
+        result = pd.DataFrame.sparse.from_spmatrix(
+            mat, index=labels, columns=labels
+        )
+        expected = pd.DataFrame(
+            np.eye(10, dtype=dtype),
+            index=labels,
+            columns=labels,
+        ).astype(sp_dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_to_coo(self):
+        pytest.importorskip("scipy")
+        import scipy.sparse
+
+        df = pd.DataFrame({
+            "A": [0, 1, 0],
+            "B": [1, 0, 0],
+        }, dtype='Sparse[int64, 0]')
+        result = df.sparse.to_coo()
+        expected = scipy.sparse.coo_matrix(np.asarray(df))
+        assert (result != expected).nnz == 0
+
+    def test_to_dense(self):
+        df = pd.DataFrame({
+            "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)),
+            "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)),
+            "C": pd.SparseArray([1., 0.],
+                                dtype=pd.SparseDtype('float64', 0.0)),
+        }, index=['b', 'a'])
+        result = df.sparse.to_dense()
+        expected = pd.DataFrame({
+            'A': [1, 0],
+            'B': [1, 0],
+            'C': [1.0, 0.0],
+        }, index=['b', 'a'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_density(self):
+        df = pd.DataFrame({
+            'A': pd.SparseArray([1, 0, 2, 1], fill_value=0),
+            'B': pd.SparseArray([0, 1, 1, 1], fill_value=0),
+        })
+        res = df.sparse.density
+        expected = 0.75
+        assert res == expected