From 3ffe7f2d84f4bcc369216188eb73639277d61011 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Mar 2020 08:21:57 +0100 Subject: [PATCH 01/15] PERF: fix SparseArray._simple_new object initialization --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 93091555201e8..963c2f3d53138 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -399,7 +399,7 @@ def __init__( def _simple_new( cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> "SparseArray": - new = cls([]) + new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array new._dtype = dtype From 93bf8255b913edc2d5b02f0c05d8a2b21c592a25 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 12:05:46 +0100 Subject: [PATCH 02/15] Initial implementation --- pandas/_libs/sparse.pyx | 5 +++-- pandas/core/arrays/sparse/accessor.py | 29 +++++++++++++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 091ca42cb71dd..a44413eecce36 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -40,12 +40,13 @@ cdef class IntIndex(SparseIndex): Py_ssize_t length, npoints ndarray indices - def __init__(self, Py_ssize_t length, indices): + def __init__(self, Py_ssize_t length, indices, check_integrity=True): self.length = length self.indices = np.ascontiguousarray(indices, dtype=np.int32) self.npoints = len(self.indices) - self.check_integrity() + if check_integrity: + self.check_integrity() def __reduce__(self): args = (self.length, self.indices) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 92c05f44d677c..7286520c06f5f 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -219,23 +219,36 @@ def from_spmatrix(cls, data, index=None, columns=None): Examples -------- - >>> import scipy.sparse + >>> impoVrt scipy.sparse >>> mat = scipy.sparse.eye(3) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 1 0.0 1.0 0.0 - 2 0.0 0.0 1.0 + 2 0.1 0.0 1.1 """ - from pandas import DataFrame + from pandas import DataFrame, SparseDtype + from . import IntIndex, SparseArray data = data.tocsc() index, columns = cls._prep_index(data, index, columns) - sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] - data = dict(enumerate(sparrays)) - result = DataFrame(data, index=index) - result.columns = columns - return result + n_rows, n_columns = data.shape + data.sort_indices() + indices = data.indices + indptr = data.indptr + data = data.data + dtype = SparseDtype(data.dtype, 0) + arrays = [] + for i in range(n_columns): + sl = slice(indptr[i], indptr[i + 1]) + idx = IntIndex(n_rows, indices[sl], check_integrity=False) + arr = SparseArray._simple_new(data[sl], idx, dtype) + arrays.append(arr) + return DataFrame._from_arrays( + arrays, + columns=columns, + index=index + ) def to_dense(self): """ From e095e7fa51d129cbd8baa0df55999c7ebb93f440 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 12:41:40 +0100 Subject: [PATCH 03/15] Improve docstring --- pandas/_libs/sparse.pyx | 2 ++ pandas/core/arrays/sparse/accessor.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index a44413eecce36..9208163ee29c3 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -34,6 +34,8 @@ cdef class IntIndex(SparseIndex): length : integer indices : array-like Contains integers corresponding to the indices. + check_integrity : bool, default=True + Check integrity of the input. """ cdef readonly: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 7286520c06f5f..7f5621fcd2e30 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -219,13 +219,13 @@ def from_spmatrix(cls, data, index=None, columns=None): Examples -------- - >>> impoVrt scipy.sparse + >>> import scipy.sparse >>> mat = scipy.sparse.eye(3) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 1 0.0 1.0 0.0 - 2 0.1 0.0 1.1 + 2 0.0 0.0 1.0 """ from pandas import DataFrame, SparseDtype from . import IntIndex, SparseArray From 11afe400902d3e7697b95d4e87ff8ece4adc89f8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 12:50:01 +0100 Subject: [PATCH 04/15] Add what's new --- doc/source/whatsnew/v1.1.0.rst | 3 +++ pandas/core/arrays/sparse/accessor.py | 6 +----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2db61a17858de..58c8042be4e6e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -224,6 +224,9 @@ Performance improvements - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) +- Performance improvement when creating sparse :class:`DataFrame` from + ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` + constructor (:issue:`32196`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 7f5621fcd2e30..1cc955950003f 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -244,11 +244,7 @@ def from_spmatrix(cls, data, index=None, columns=None): idx = IntIndex(n_rows, indices[sl], check_integrity=False) arr = SparseArray._simple_new(data[sl], idx, dtype) arrays.append(arr) - return DataFrame._from_arrays( - arrays, - columns=columns, - index=index - ) + return DataFrame._from_arrays(arrays, columns=columns, index=index) def to_dense(self): """ From eda0732ed947de721a32de6dec91040659290646 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 14:14:37 +0100 Subject: [PATCH 05/15] Update doc/source/whatsnew/v1.1.0.rst Co-Authored-By: Tom Augspurger --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 58c8042be4e6e..556d270cf224e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -224,7 +224,7 @@ Performance improvements - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) -- Performance improvement when creating sparse :class:`DataFrame` from +- Performance improvement when creating a :class:`DataFrame` with sparse values ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32196`). From 40f4cd6e4285c376d8ddcd0890fb3346f344d4a9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 14:21:19 +0100 Subject: [PATCH 06/15] Improve what's new --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 556d270cf224e..1cb5367f371b8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -225,8 +225,8 @@ Performance improvements avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) - Performance improvement when creating a :class:`DataFrame` with sparse values - ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` - constructor (:issue:`32196`). + from ``scipy.sparse`` matrices using the + :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32196`). .. --------------------------------------------------------------------------- From 508fda51f97b372eb157bc42983200fb49e5fda7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 14:30:34 +0100 Subject: [PATCH 07/15] Add random state --- asv_bench/benchmarks/sparse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ac78ca53679fd..418614a2d2b98 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -45,8 +45,7 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: def setup(self): N = 1000 - self.arr = np.arange(N) - self.sparse = scipy.sparse.rand(N, N, 0.005) + self.sparse = scipy.sparse.rand(N, N, 0.005, random_state=0) def time_from_scipy(self): pd.DataFrame.sparse.from_spmatrix(self.sparse) From 00538170efac0259770914d1939024922d7e249c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 15:10:26 +0100 Subject: [PATCH 08/15] Add inline comment about sort_indices --- pandas/core/arrays/sparse/accessor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 1cc955950003f..4f4d7ef5a4112 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -233,6 +233,10 @@ def from_spmatrix(cls, data, index=None, columns=None): data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape + # We need to make sure indices are sorted, as we create + # IntIndex with no input validation (i.e. check_integrity=False ). + # Indices may already be sorted in scipy in which case this adds + # a small overhead. data.sort_indices() indices = data.indices indptr = data.indptr From 42792a396dc51b5cfd1a75c30207ccd150e20341 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 16:31:09 +0100 Subject: [PATCH 09/15] Use absolute import --- pandas/core/arrays/sparse/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 4f4d7ef5a4112..3ae5badfb6bf4 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -227,8 +227,8 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame, SparseDtype - from . import IntIndex, SparseArray + from pandas import DataFrame + from pandas._libs.sparse import IntIndex data = data.tocsc() index, columns = cls._prep_index(data, index, columns) From c8f7abc42a2ed56e91473e46fbaf4cb24ca7448c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 20:34:15 +0100 Subject: [PATCH 10/15] Update pandas/_libs/sparse.pyx Co-Authored-By: William Ayd --- pandas/_libs/sparse.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 9208163ee29c3..d853ddf3de7d4 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -42,7 +42,7 @@ cdef class IntIndex(SparseIndex): Py_ssize_t length, npoints ndarray indices - def __init__(self, Py_ssize_t length, indices, check_integrity=True): + def __init__(self, Py_ssize_t length, indices, bint check_integrity=True): self.length = length self.indices = np.ascontiguousarray(indices, dtype=np.int32) self.npoints = len(self.indices) From e541b0dbcfa57d66dc10466d853338eab8ad4ccb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 19 Mar 2020 20:37:21 +0100 Subject: [PATCH 11/15] Rename variable --- pandas/core/arrays/sparse/accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 3ae5badfb6bf4..fdfde9206f34e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -240,13 +240,13 @@ def from_spmatrix(cls, data, index=None, columns=None): data.sort_indices() indices = data.indices indptr = data.indptr - data = data.data - dtype = SparseDtype(data.dtype, 0) + array_data = data.data + dtype = SparseDtype(array_data.dtype, 0) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) idx = IntIndex(n_rows, indices[sl], check_integrity=False) - arr = SparseArray._simple_new(data[sl], idx, dtype) + arr = SparseArray._simple_new(array_data[sl], idx, dtype) arrays.append(arr) return DataFrame._from_arrays(arrays, columns=columns, index=index) From ce6619efd3f38f716810e2c55efa3ba284f641ea Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 20 Mar 2020 21:09:34 +0100 Subject: [PATCH 12/15] Use DataFrame._from_arrays(..., verify_integrity=False) --- pandas/core/arrays/sparse/accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index fdfde9206f34e..a284d7d481746 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -248,7 +248,9 @@ def from_spmatrix(cls, data, index=None, columns=None): idx = IntIndex(n_rows, indices[sl], check_integrity=False) arr = SparseArray._simple_new(array_data[sl], idx, dtype) arrays.append(arr) - return DataFrame._from_arrays(arrays, columns=columns, index=index) + return DataFrame._from_arrays( + arrays, columns=columns, index=index, verify_integrity=False + ) def to_dense(self): """ From e063c8a59048486a62d0b9234e4ccd40bd971835 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 20 Mar 2020 21:10:42 +0100 Subject: [PATCH 13/15] checkout upstream/master -- asv_bench/benchmarks/sparse.py --- asv_bench/benchmarks/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 418614a2d2b98..7a09b03648fa7 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -45,7 +45,7 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: def setup(self): N = 1000 - self.sparse = scipy.sparse.rand(N, N, 0.005, random_state=0) + self.sparse = scipy.sparse.rand(N, N, 0.005) def time_from_scipy(self): pd.DataFrame.sparse.from_spmatrix(self.sparse) From 20c36856d63d1366a57e37214ec030736652f406 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 22 Mar 2020 10:38:05 +0100 Subject: [PATCH 14/15] Link to PR's by Joris in what's new --- doc/source/whatsnew/v1.1.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7cb45ed553b9b..48b0779a1753a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -224,9 +224,10 @@ Performance improvements - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) -- Performance improvement when creating a :class:`DataFrame` with sparse values - from ``scipy.sparse`` matrices using the - :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32196`). +- Significant performance improvement when creating a :class:`DataFrame` with + sparse values from ``scipy.sparse`` matrices using the + :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, + :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). .. --------------------------------------------------------------------------- From d750eb45f80902b87a8bfb0ebb3a8fe025146c60 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 22 Mar 2020 15:11:49 +0100 Subject: [PATCH 15/15] Use ensure_index when the columns/index is provided by the user --- pandas/core/arrays/sparse/accessor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index a284d7d481746..787407060c7f1 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -329,12 +329,17 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): import pandas.core.indexes.base as ibase + from pandas.core.indexes.api import ensure_index N, K = data.shape if index is None: index = ibase.default_index(N) + else: + index = ensure_index(index) if columns is None: columns = ibase.default_index(K) + else: + columns = ensure_index(columns) if len(columns) != K: raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")