Skip to content

Commit a88d24c

Browse files
rthSeeminSyed
authored and committed
PERF: optimize DataFrame.sparse.from_spmatrix performance (pandas-dev#32825)
1 parent b257914 commit a88d24c

File tree

4 files changed

+34
-8
lines changed

4 files changed

+34
-8
lines changed

asv_bench/benchmarks/sparse.py

-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
4545
class SparseDataFrameConstructor:
4646
def setup(self):
4747
N = 1000
48-
self.arr = np.arange(N)
4948
self.sparse = scipy.sparse.rand(N, N, 0.005)
5049

5150
def time_from_scipy(self):

doc/source/whatsnew/v1.1.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,10 @@ Performance improvements
224224
- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
225225
avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
226226
existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
227+
- Significant performance improvement when creating a :class:`DataFrame` with
228+
sparse values from ``scipy.sparse`` matrices using the
229+
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
230+
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
227231

228232
.. ---------------------------------------------------------------------------
229233

pandas/_libs/sparse.pyx

+5-2
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,21 @@ cdef class IntIndex(SparseIndex):
3434
length : integer
3535
indices : array-like
3636
Contains integers corresponding to the indices.
37+
check_integrity : bool, default True
38+
Check integrity of the input.
3739
"""
3840

3941
cdef readonly:
4042
Py_ssize_t length, npoints
4143
ndarray indices
4244

43-
def __init__(self, Py_ssize_t length, indices):
45+
def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
4446
self.length = length
4547
self.indices = np.ascontiguousarray(indices, dtype=np.int32)
4648
self.npoints = len(self.indices)
4749

48-
self.check_integrity()
50+
if check_integrity:
51+
self.check_integrity()
4952

5053
def __reduce__(self):
5154
args = (self.length, self.indices)

pandas/core/arrays/sparse/accessor.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -228,14 +228,29 @@ def from_spmatrix(cls, data, index=None, columns=None):
228228
2 0.0 0.0 1.0
229229
"""
230230
from pandas import DataFrame
231+
from pandas._libs.sparse import IntIndex
231232

232233
data = data.tocsc()
233234
index, columns = cls._prep_index(data, index, columns)
234-
sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
235-
data = dict(enumerate(sparrays))
236-
result = DataFrame(data, index=index)
237-
result.columns = columns
238-
return result
235+
n_rows, n_columns = data.shape
236+
# We need to make sure indices are sorted, as we create
237+
# IntIndex with no input validation (i.e. check_integrity=False).
238+
# Indices may already be sorted in scipy in which case this adds
239+
# a small overhead.
240+
data.sort_indices()
241+
indices = data.indices
242+
indptr = data.indptr
243+
array_data = data.data
244+
dtype = SparseDtype(array_data.dtype, 0)
245+
arrays = []
246+
for i in range(n_columns):
247+
sl = slice(indptr[i], indptr[i + 1])
248+
idx = IntIndex(n_rows, indices[sl], check_integrity=False)
249+
arr = SparseArray._simple_new(array_data[sl], idx, dtype)
250+
arrays.append(arr)
251+
return DataFrame._from_arrays(
252+
arrays, columns=columns, index=index, verify_integrity=False
253+
)
239254

240255
def to_dense(self):
241256
"""
@@ -314,12 +329,17 @@ def density(self) -> float:
314329
@staticmethod
315330
def _prep_index(data, index, columns):
316331
import pandas.core.indexes.base as ibase
332+
from pandas.core.indexes.api import ensure_index
317333

318334
N, K = data.shape
319335
if index is None:
320336
index = ibase.default_index(N)
337+
else:
338+
index = ensure_index(index)
321339
if columns is None:
322340
columns = ibase.default_index(K)
341+
else:
342+
columns = ensure_index(columns)
323343

324344
if len(columns) != K:
325345
raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")

0 commit comments

Comments
 (0)