diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index ac78ca53679fd..7a09b03648fa7 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -45,7 +45,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
 class SparseDataFrameConstructor:
     def setup(self):
         N = 1000
-        self.arr = np.arange(N)
         self.sparse = scipy.sparse.rand(N, N, 0.005)
 
     def time_from_scipy(self):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 4044fb2d3fa09..48b0779a1753a 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -224,6 +224,10 @@ Performance improvements
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
+- Significant performance improvement when creating a :class:`DataFrame` with
+  sparse values from ``scipy.sparse`` matrices using the
+  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
+  :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index 091ca42cb71dd..d853ddf3de7d4 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -34,18 +34,21 @@ cdef class IntIndex(SparseIndex):
     length : integer
     indices : array-like
         Contains integers corresponding to the indices.
+    check_integrity : bool, default=True
+        Check integrity of the input.
     """
 
     cdef readonly:
         Py_ssize_t length, npoints
         ndarray indices
 
-    def __init__(self, Py_ssize_t length, indices):
+    def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
         self.length = length
         self.indices = np.ascontiguousarray(indices, dtype=np.int32)
         self.npoints = len(self.indices)
 
-        self.check_integrity()
+        if check_integrity:
+            self.check_integrity()
 
     def __reduce__(self):
         args = (self.length, self.indices)
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 92c05f44d677c..787407060c7f1 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -228,14 +228,29 @@ def from_spmatrix(cls, data, index=None, columns=None):
         2  0.0  0.0  1.0
         """
         from pandas import DataFrame
+        from pandas._libs.sparse import IntIndex
 
         data = data.tocsc()
         index, columns = cls._prep_index(data, index, columns)
-        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
-        data = dict(enumerate(sparrays))
-        result = DataFrame(data, index=index)
-        result.columns = columns
-        return result
+        n_rows, n_columns = data.shape
+        # We need to make sure indices are sorted, as we create
+        # IntIndex with no input validation (i.e. check_integrity=False).
+        # Indices may already be sorted in scipy in which case this adds
+        # a small overhead.
+        data.sort_indices()
+        indices = data.indices
+        indptr = data.indptr
+        array_data = data.data
+        dtype = SparseDtype(array_data.dtype, 0)
+        arrays = []
+        for i in range(n_columns):
+            sl = slice(indptr[i], indptr[i + 1])
+            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
+            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
+            arrays.append(arr)
+        return DataFrame._from_arrays(
+            arrays, columns=columns, index=index, verify_integrity=False
+        )
 
     def to_dense(self):
         """
@@ -314,12 +329,17 @@ def density(self) -> float:
     @staticmethod
     def _prep_index(data, index, columns):
         import pandas.core.indexes.base as ibase
+        from pandas.core.indexes.api import ensure_index
 
         N, K = data.shape
         if index is None:
             index = ibase.default_index(N)
+        else:
+            index = ensure_index(index)
         if columns is None:
             columns = ibase.default_index(K)
+        else:
+            columns = ensure_index(columns)
 
         if len(columns) != K:
             raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")