ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame

kernc · kernc · commit 3b654c82d7f5 · 2017-02-24T11:26:29.000+01:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -153,6 +153,28 @@ New Behavior:
 
   df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
 
+.. _whatsnew_0200.enhancements.scipy_sparse:
+
+SciPy sparse matrix from/to SparseDataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Pandas now supports creating SparseDataFrames directly from ``scipy.sparse.spmatrix`` instances. For example:
+
+.. ipython:: python
+
+   from scipy.sparse import csr_matrix
+   arr = np.random.random(size=(1000, 5))
+   arr[arr < .9] = 0
+   sp_arr = csr_matrix(arr)
+   sp_arr
+   sdf = pd.SparseDataFrame(sp_arr)
+   sdf
+
+To convert a SparseDataFrame back to a SciPy sparse matrix in COO format, you can use:
+
+.. ipython:: python
+
+   sdf.to_coo()
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -11,7 +11,7 @@
 import numpy as np
 
 from pandas.types.missing import isnull, notnull
-from pandas.types.cast import _maybe_upcast
+from pandas.types.cast import _maybe_upcast, _find_common_type
 from pandas.types.common import _ensure_platform_int
 
 from pandas.core.common import _try_sort
@@ -25,9 +25,14 @@
                                    create_block_manager_from_arrays)
 import pandas.core.generic as generic
 from pandas.sparse.series import SparseSeries, SparseArray
+from pandas._sparse import BlockIndex, get_blocks
 from pandas.util.decorators import Appender
 import pandas.core.ops as ops
 
+try:
+    from scipy.sparse import spmatrix  # noqa
+except ImportError:
+    spmatrix = type('mock spmatrix', (), {})
 
 _shared_doc_kwargs = dict(klass='SparseDataFrame')
 
@@ -39,7 +44,7 @@ class SparseDataFrame(DataFrame):
 
     Parameters
     ----------
-    data : same types as can be passed to DataFrame
+    data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
     index : array-like, optional
     column : array-like, optional
     default_kind : {'block', 'integer'}, default 'block'
@@ -85,24 +90,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
         self._default_fill_value = default_fill_value
 
         if isinstance(data, dict):
-            mgr = self._init_dict(data, index, columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+            mgr = self._init_dict(data, index, columns, dtype=dtype)
         elif isinstance(data, (np.ndarray, list)):
-            mgr = self._init_matrix(data, index, columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+            mgr = self._init_matrix(data, index, columns, dtype=dtype)
         elif isinstance(data, SparseDataFrame):
             mgr = self._init_mgr(data._data,
                                  dict(index=index, columns=columns),
                                  dtype=dtype, copy=copy)
         elif isinstance(data, DataFrame):
-            mgr = self._init_dict(data, data.index, data.columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
         elif isinstance(data, BlockManager):
             mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                  dtype=dtype, copy=copy)
+        elif isinstance(data, spmatrix):
+            mgr = self._init_spmatrix(data, index, columns, dtype=dtype)
         elif data is None:
             data = DataFrame()
 
@@ -175,6 +176,33 @@ def _init_dict(self, data, index, columns, dtype=None):
 
     def _init_matrix(self, data, index, columns, dtype=None):
         data = _prep_ndarray(data, copy=False)
+        index, columns = self._prep_index(data, index, columns)
+        data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
+        return self._init_dict(data, index, columns, dtype)
+
+    def _init_spmatrix(self, data, index, columns, dtype=None):
+        index, columns = self._prep_index(data, index, columns)
+        data = data.tocoo(copy=False)
+        N = len(index)
+        bindex = np.arange(N, dtype=np.int32)
+
+        sdict = {}
+        values = Series(data.data, index=data.row)
+        for col, rowvals in values.groupby(data.col):
+            blocs, blens = get_blocks(bindex[rowvals.index])
+            sdict[columns[col]] = SparseSeries(
+                rowvals.values, index=index,
+                sparse_index=BlockIndex(N, blocs, blens))
+
+        # Add any columns that were empty
+        sdict.update({column: SparseSeries(index=index,
+                                           sparse_index=BlockIndex(N, [], []))
+                      for column in columns
+                      if column not in sdict})
+
+        return self._init_dict(sdict, index, columns, dtype)
+
+    def _prep_index(self, data, index, columns):
         N, K = data.shape
         if index is None:
             index = _default_index(N)
@@ -187,9 +215,84 @@ def _init_matrix(self, data, index, columns, dtype=None):
         if len(index) != N:
             raise ValueError('Index length mismatch: %d vs. %d' %
                              (len(index), N))
+        return index, columns
 
-        data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
-        return self._init_dict(data, index, columns, dtype)
+    def as_matrix(self, columns=None, sparse=False):
+        """
+        Convert the frame to its NumPy array or SciPy sparse COO matrix
+        representation.
+
+        Parameters
+        ----------
+        columns : list, optional, default=None
+            If None, return all columns. Otherwise, returns specified columns.
+        sparse : bool, optional, default=False
+            If True, return an instance of scipy.sparse.coo_matrix instead
+            of ndarray. If False, the result values array will be DENSE.
+
+        Returns
+        -------
+        values : ndarray or scipy.sparse.spmatrix
+            If the caller is heterogeneous and contains booleans or objects,
+            the result will be of dtype=object. See Notes.
+
+        Notes
+        -----
+        The dtype will be the lowest-common-denominator type (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32. By numpy.find_common_type convention, mixing int64 and
+        uint64 will result in a float64 dtype.
+
+        See Also
+        --------
+        pandas.SparseDataFrame.to_coo
+        """
+        if sparse:
+            subdf = self if columns is None else self[columns]
+            return subdf.to_coo()
+
+        return super(SparseDataFrame, self).as_matrix(columns=columns)
+
+    def to_coo(self):
+        """
+        Convert the frame to its SciPy sparse COO matrix representation.
+
+        Returns
+        -------
+        coo_matrix : scipy.sparse.spmatrix
+            If the caller is heterogeneous and contains booleans or objects,
+            the result will be of dtype=object. See Notes.
+
+        Notes
+        -----
+        The dtype will be the lowest-common-denominator type (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32. By numpy.find_common_type convention, mixing int64 and
+        uint64 will result in a float64 dtype.
+        """
+        try:
+            from scipy.sparse import coo_matrix
+        except ImportError:
+            raise ImportError('Scipy is not installed')
+
+        cols, rows, datas = [], [], []
+        for col, name in enumerate(self):
+            s = self[name]
+            row = s.sp_index.to_int_index().indices
+            cols.append(np.repeat(col, len(row)))
+            rows.append(row)
+            datas.append(s.sp_values)
+
+        cols = np.hstack(cols)
+        rows = np.hstack(rows)
+        datas = np.hstack(datas).astype(_find_common_type(self.dtypes))
+        return coo_matrix((datas, (rows, cols)), shape=self.shape)
 
     def __array_wrap__(self, result):
         return self._constructor(
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
@@ -1118,6 +1118,31 @@ def test_isnotnull(self):
                             'B': [True, False, True, True, False]})
         tm.assert_frame_equal(res.to_dense(), exp)
 
+    def test_from_to_scipy(self):
+        # GH 4343
+        try:
+            from scipy.sparse import csr_matrix
+        except ImportError:
+            return  # scipy not available
+
+        arr = np.array([[0, 1, 0],
+                        [0, 0, 1],
+                        [1, 1, 1.]])
+        spm = csr_matrix(arr)
+
+        for index, columns in ((list('abc'), list('def')),
+                               (None, None)):
+            sdf = pd.SparseDataFrame(spm, index=index, columns=columns)
+
+            if index is not None:
+                tm.assert_index_equal(sdf.index, pd.Index(index))
+            if columns is not None:
+                tm.assert_index_equal(sdf.columns, pd.Index(columns))
+
+            tm.assert_numpy_array_equal(sdf.fillna(0).values, arr)
+            tm.assert_equal((sdf.to_coo() != spm).data.size, 0)
+            tm.assert_equal((sdf.as_matrix(sparse=True) != spm).data.size, 0)
+
 
 class TestSparseDataFrameArithmetic(tm.TestCase):