PERF: optimize DataFrame.sparse.from_spmatrix performance (pandas-dev#32825)

rth · SeeminSyed · commit a88d24cc4b8e · 2020-03-22T19:55:07.000-04:00
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
@@ -45,7 +45,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
 class SparseDataFrameConstructor:
     def setup(self):
         N = 1000
-        self.arr = np.arange(N)
         self.sparse = scipy.sparse.rand(N, N, 0.005)
 
     def time_from_scipy(self):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -224,6 +224,10 @@ Performance improvements
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
+- Significant performance improvement when creating a :class:`DataFrame` with
+  sparse values from ``scipy.sparse`` matrices using the
+  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
+  :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
@@ -34,18 +34,21 @@ cdef class IntIndex(SparseIndex):
     length : integer
     indices : array-like
         Contains integers corresponding to the indices.
+    check_integrity : bool, default=True
+        Check integrity of the input.
     """
 
     cdef readonly:
         Py_ssize_t length, npoints
         ndarray indices
 
-    def __init__(self, Py_ssize_t length, indices):
+    def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
         self.length = length
         self.indices = np.ascontiguousarray(indices, dtype=np.int32)
         self.npoints = len(self.indices)
 
-        self.check_integrity()
+        if check_integrity:
+            self.check_integrity()
 
     def __reduce__(self):
         args = (self.length, self.indices)
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -228,14 +228,29 @@ def from_spmatrix(cls, data, index=None, columns=None):
         2  0.0  0.0  1.0
         """
         from pandas import DataFrame
+        from pandas._libs.sparse import IntIndex
 
         data = data.tocsc()
         index, columns = cls._prep_index(data, index, columns)
-        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
-        data = dict(enumerate(sparrays))
-        result = DataFrame(data, index=index)
-        result.columns = columns
-        return result
+        n_rows, n_columns = data.shape
+        # We need to make sure indices are sorted, as we create
+        # IntIndex with no input validation (i.e. check_integrity=False ).
+        # Indices may already be sorted in scipy in which case this adds
+        # a small overhead.
+        data.sort_indices()
+        indices = data.indices
+        indptr = data.indptr
+        array_data = data.data
+        dtype = SparseDtype(array_data.dtype, 0)
+        arrays = []
+        for i in range(n_columns):
+            sl = slice(indptr[i], indptr[i + 1])
+            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
+            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
+            arrays.append(arr)
+        return DataFrame._from_arrays(
+            arrays, columns=columns, index=index, verify_integrity=False
+        )
 
     def to_dense(self):
         """
@@ -314,12 +329,17 @@ def density(self) -> float:
     @staticmethod
     def _prep_index(data, index, columns):
         import pandas.core.indexes.base as ibase
+        from pandas.core.indexes.api import ensure_index
 
         N, K = data.shape
         if index is None:
             index = ibase.default_index(N)
+        else:
+            index = ensure_index(index)
         if columns is None:
             columns = ibase.default_index(K)
+        else:
+            columns = ensure_index(columns)
 
         if len(columns) != K:
             raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")