pandas-dev · jorisvandenbossche · Sep 27, 2019 · Sep 19, 2019 · Sep 19, 2019 · Sep 26, 2019
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -96,9 +96,10 @@ Deprecations
 Removed SparseSeries and SparseDataFrame
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-``SparseSeries`` and ``SparseDataFrame`` have been removed (:issue:`28425`).
-We recommend using a ``Series`` or ``DataFrame`` with sparse values instead.
-See :ref:`sparse.migration` for help with migrating existing code.
+``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method
+have been removed (:issue:`28425`). We recommend using a ``Series`` or
+``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help
+with migrating existing code.
 
 Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -114,7 +114,7 @@
     DataFrame,
 )
 
-from pandas.core.sparse.api import SparseArray, SparseDtype
+from pandas.core.arrays.sparse import SparseArray, SparseDtype
 
 from pandas.tseries.api import infer_freq
 from pandas.tseries import offsets

diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py
@@ -0,0 +1,5 @@
+# flake8: noqa: F401
+
+from .accessor import SparseAccessor, SparseFrameAccessor
+from .array import BlockIndex, IntIndex, SparseArray, _make_index
+from .dtype import SparseDtype
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -0,0 +1,336 @@
+"""Sparse accessor"""
+
+import numpy as np
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.dtypes.cast import find_common_type
+
+from pandas.core.accessor import PandasDelegate, delegate_names
+
+from .array import SparseArray
+from .dtype import SparseDtype
+
+
+class BaseAccessor:
+    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."
+
+    def __init__(self, data=None):
+        self._parent = data
+        self._validate(data)
+
+    def _validate(self, data):
+        raise NotImplementedError
+
+
+@delegate_names(
+    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
+)
+class SparseAccessor(BaseAccessor, PandasDelegate):
+    """
+    Accessor for sparse data and conversion to/from other sparse matrix types.
+    """
+
+    def _validate(self, data):
+        if not isinstance(data.dtype, SparseDtype):
+            raise AttributeError(self._validation_msg)
+
+    def _delegate_property_get(self, name, *args, **kwargs):
+        return getattr(self._parent.array, name)
+
+    def _delegate_method(self, name, *args, **kwargs):
+        if name == "from_coo":
+            return self.from_coo(*args, **kwargs)
+        elif name == "to_coo":
+            return self.to_coo(*args, **kwargs)
+        else:
+            raise ValueError
+
+    @classmethod
+    def from_coo(cls, A, dense_index=False):
+        """
+        Create a Series with sparse values from a scipy.sparse.coo_matrix.
+
+        Parameters
+        ----------
+        A : scipy.sparse.coo_matrix
+        dense_index : bool, default False
+            If False (default), the index of the resulting Series consists
+            of only the coords of the non-null entries of the original
+            coo_matrix. If True, the index consists of the full sorted
+            (row, col) coordinates of the coo_matrix.
+
+        Returns
+        -------
+        s : Series
+            A Series with sparse values.
+
+        Examples
+        --------
+        >>> from scipy import sparse
+        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
+        ...                       shape=(3, 4))
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  2.],
+                [ 3.,  0.,  0.,  0.],
+                [ 0.,  0.,  0.,  0.]])
+        >>> ss = pd.Series.sparse.from_coo(A)
+        >>> ss
+        0  2    1
+           3    2
+        1  0    3
+        dtype: float64
+        BlockIndex
+        Block locations: array([0], dtype=int32)
+        Block lengths: array([3], dtype=int32)
+        """
+        from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series
+        from pandas import Series
+
+        result = _coo_to_sparse_series(A, dense_index=dense_index)
+        result = Series(result.array, index=result.index, copy=False)
+
+        return result
+
+    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
+        """
+        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
+
+        Use row_levels and column_levels to determine the row and column
+        coordinates respectively. row_levels and column_levels are the names
+        (labels) or numbers of the levels. {row_levels, column_levels} must be
+        a partition of the MultiIndex level names (or numbers).
+
+        Parameters
+        ----------
+        row_levels : tuple/list
+        column_levels : tuple/list
+        sort_labels : bool, default False
+            Sort the row and column labels before forming the sparse matrix.
+
+        Returns
+        -------
+        y : scipy.sparse.coo_matrix
+        rows : list (row labels)
+        columns : list (column labels)
+
+        Examples
+        --------
+        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
+        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
+        ...                                      (1, 2, 'a', 1),
+        ...                                      (1, 1, 'b', 0),
+        ...                                      (1, 1, 'b', 1),
+        ...                                      (2, 1, 'b', 0),
+        ...                                      (2, 1, 'b', 1)],
+        ...                                     names=['A', 'B', 'C', 'D'])
+        >>> ss = s.astype("Sparse")
+        >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
+        ...                                     column_levels=['C', 'D'],
+        ...                                     sort_labels=True)
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  3.],
+                [ 3.,  0.,  0.,  0.],
+                [ 0.,  0.,  0.,  0.]])
+        >>> rows
+        [(1, 1), (1, 2), (2, 1)]
+        >>> columns
+        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
+        """
+        from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo
+
+        A, rows, columns = _sparse_series_to_coo(
+            self._parent, row_levels, column_levels, sort_labels=sort_labels
+        )
+        return A, rows, columns
+
+    def to_dense(self):
+        """
+        Convert a Series from sparse values to dense.
+
+        .. versionadded:: 0.25.0
+
+        Returns
+        -------
+        Series
+            A Series with the same values, stored as a dense array.
+
+        Examples
+        --------
+        >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
+        >>> series
+        0    0
+        1    1
+        2    0
+        dtype: Sparse[int64, 0]
+
+        >>> series.sparse.to_dense()
+        0    0
+        1    1
+        2    0
+        dtype: int64
+        """
+        from pandas import Series
+
+        return Series(
+            self._parent.array.to_dense(),
+            index=self._parent.index,
+            name=self._parent.name,
+        )
+
+
+class SparseFrameAccessor(BaseAccessor, PandasDelegate):
+    """
+    DataFrame accessor for sparse data.
+
+    .. versionadded:: 0.25.0
+    """
+
+    def _validate(self, data):
+        dtypes = data.dtypes
+        if not all(isinstance(t, SparseDtype) for t in dtypes):
+            raise AttributeError(self._validation_msg)
+
+    @classmethod
+    def from_spmatrix(cls, data, index=None, columns=None):
+        """
+        Create a new DataFrame from a scipy sparse matrix.
+
+        .. versionadded:: 0.25.0
+
+        Parameters
+        ----------
+        data : scipy.sparse.spmatrix
+            Must be convertible to csc format.
+        index, columns : Index, optional
+            Row and column labels to use for the resulting DataFrame.
+            Defaults to a RangeIndex.
+
+        Returns
+        -------
+        DataFrame
+            Each column of the DataFrame is stored as a
+            :class:`SparseArray`.
+
+        Examples
+        --------
+        >>> import scipy.sparse
+        >>> mat = scipy.sparse.eye(3)
+        >>> pd.DataFrame.sparse.from_spmatrix(mat)
+             0    1    2
+        0  1.0  0.0  0.0
+        1  0.0  1.0  0.0
+        2  0.0  0.0  1.0
+        """
+        from pandas import DataFrame
+
+        data = data.tocsc()
+        index, columns = cls._prep_index(data, index, columns)
+        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
+        data = dict(enumerate(sparrays))
+        result = DataFrame(data, index=index)
+        result.columns = columns
+        return result
+
+    def to_dense(self):
+        """
+        Convert a DataFrame with sparse values to dense.
+
+        .. versionadded:: 0.25.0
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with the same values stored as dense arrays.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
+        >>> df.sparse.to_dense()
+           A
+        0  0
+        1  1
+        2  0
+        """
+        from pandas import DataFrame
+
+        data = {k: v.array.to_dense() for k, v in self._parent.items()}
+        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
+
+    def to_coo(self):
+        """
+        Return the contents of the frame as a sparse SciPy COO matrix.
+
+        .. versionadded:: 0.25.0
+
+        Returns
+        -------
+        coo_matrix : scipy.sparse.spmatrix
+            If the caller is heterogeneous and contains booleans or objects,
+            the result will be of dtype=object. See Notes.
+
+        Notes
+        -----
+        The dtype will be the lowest-common-denominator type (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32. By numpy.find_common_type convention, mixing int64 and
+        uint64 will result in a float64 dtype.
+        """
+        import_optional_dependency("scipy")
+        from scipy.sparse import coo_matrix
+
+        dtype = find_common_type(self._parent.dtypes)
+        if isinstance(dtype, SparseDtype):
+            dtype = dtype.subtype
+
+        cols, rows, datas = [], [], []
+        for col, name in enumerate(self._parent):
+            s = self._parent[name]
+            row = s.array.sp_index.to_int_index().indices
+            cols.append(np.repeat(col, len(row)))
+            rows.append(row)
+            datas.append(s.array.sp_values.astype(dtype, copy=False))
+
+        cols = np.concatenate(cols)
+        rows = np.concatenate(rows)
+        datas = np.concatenate(datas)
+        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)
+
+    @property
+    def density(self) -> float:
+        """
+        Ratio of non-sparse points to total (dense) data points
+        represented in the DataFrame.
+        """
+        return np.mean([column.array.density for _, column in self._parent.items()])
+
+    @staticmethod
+    def _prep_index(data, index, columns):
+        import pandas.core.indexes.base as ibase
+
+        N, K = data.shape
+        if index is None:
+            index = ibase.default_index(N)
+        if columns is None:
+            columns = ibase.default_index(K)
+
+        if len(columns) != K:
+            raise ValueError(
+                "Column length mismatch: {columns} vs. {K}".format(
+                    columns=len(columns), K=K
+                )
+            )
+        if len(index) != N:
+            raise ValueError(
+                "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N)
+            )
+        return index, columns