From 3ffe7f2d84f4bcc369216188eb73639277d61011 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 19 Mar 2020 08:21:57 +0100
Subject: [PATCH 01/15] PERF: fix SparseArray._simple_new object initialization

---
 pandas/core/arrays/sparse/array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 93091555201e8..963c2f3d53138 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -399,7 +399,7 @@ def __init__(
     def _simple_new(
         cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype
     ) -> "SparseArray":
-        new = cls([])
+        new = object.__new__(cls)
         new._sparse_index = sparse_index
         new._sparse_values = sparse_array
         new._dtype = dtype

From 93bf8255b913edc2d5b02f0c05d8a2b21c592a25 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 12:05:46 +0100
Subject: [PATCH 02/15] Initial implementation

---
 pandas/_libs/sparse.pyx               |  5 +++--
 pandas/core/arrays/sparse/accessor.py | 29 +++++++++++++++++++--------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index 091ca42cb71dd..a44413eecce36 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -40,12 +40,13 @@ cdef class IntIndex(SparseIndex):
         Py_ssize_t length, npoints
         ndarray indices
 
-    def __init__(self, Py_ssize_t length, indices):
+    def __init__(self, Py_ssize_t length, indices, check_integrity=True):
         self.length = length
         self.indices = np.ascontiguousarray(indices, dtype=np.int32)
         self.npoints = len(self.indices)
 
-        self.check_integrity()
+        if check_integrity:
+            self.check_integrity()
 
     def __reduce__(self):
         args = (self.length, self.indices)
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 92c05f44d677c..7286520c06f5f 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -219,23 +219,36 @@ def from_spmatrix(cls, data, index=None, columns=None):
 
         Examples
         --------
-        >>> import scipy.sparse
+        >>> impoVrt scipy.sparse
         >>> mat = scipy.sparse.eye(3)
         >>> pd.DataFrame.sparse.from_spmatrix(mat)
              0    1    2
         0  1.0  0.0  0.0
         1  0.0  1.0  0.0
-        2  0.0  0.0  1.0
+        2  0.1  0.0  1.1
         """
-        from pandas import DataFrame
+        from pandas import DataFrame, SparseDtype
+        from . import IntIndex, SparseArray
 
         data = data.tocsc()
         index, columns = cls._prep_index(data, index, columns)
-        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
-        data = dict(enumerate(sparrays))
-        result = DataFrame(data, index=index)
-        result.columns = columns
-        return result
+        n_rows, n_columns = data.shape
+        data.sort_indices()
+        indices = data.indices
+        indptr = data.indptr
+        data = data.data
+        dtype = SparseDtype(data.dtype, 0)
+        arrays = []
+        for i in range(n_columns):
+            sl = slice(indptr[i], indptr[i + 1])
+            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
+            arr = SparseArray._simple_new(data[sl], idx, dtype)
+            arrays.append(arr)
+        return DataFrame._from_arrays(
+            arrays,
+            columns=columns,
+            index=index
+        )
 
     def to_dense(self):
         """

From e095e7fa51d129cbd8baa0df55999c7ebb93f440 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 12:41:40 +0100
Subject: [PATCH 03/15] Improve docstring

---
 pandas/_libs/sparse.pyx               | 2 ++
 pandas/core/arrays/sparse/accessor.py | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index a44413eecce36..9208163ee29c3 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -34,6 +34,8 @@ cdef class IntIndex(SparseIndex):
     length : integer
     indices : array-like
         Contains integers corresponding to the indices.
+    check_integrity : bool, default=True
+        If True, validate the input by calling ``check_integrity`` on construction.
     """
 
     cdef readonly:
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 7286520c06f5f..7f5621fcd2e30 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -219,13 +219,13 @@ def from_spmatrix(cls, data, index=None, columns=None):
 
         Examples
         --------
-        >>> impoVrt scipy.sparse
+        >>> import scipy.sparse
         >>> mat = scipy.sparse.eye(3)
         >>> pd.DataFrame.sparse.from_spmatrix(mat)
              0    1    2
         0  1.0  0.0  0.0
         1  0.0  1.0  0.0
-        2  0.1  0.0  1.1
+        2  0.0  0.0  1.0
         """
         from pandas import DataFrame, SparseDtype
         from . import IntIndex, SparseArray

From 11afe400902d3e7697b95d4e87ff8ece4adc89f8 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 12:50:01 +0100
Subject: [PATCH 04/15] Add what's new

---
 doc/source/whatsnew/v1.1.0.rst        | 3 +++
 pandas/core/arrays/sparse/accessor.py | 6 +-----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 2db61a17858de..58c8042be4e6e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -224,6 +224,9 @@ Performance improvements
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
+- Performance improvement when creating sparse :class:`DataFrame` from
+  ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix`
+  constructor (:issue:`32196`).
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 7f5621fcd2e30..1cc955950003f 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -244,11 +244,7 @@ def from_spmatrix(cls, data, index=None, columns=None):
             idx = IntIndex(n_rows, indices[sl], check_integrity=False)
             arr = SparseArray._simple_new(data[sl], idx, dtype)
             arrays.append(arr)
-        return DataFrame._from_arrays(
-            arrays,
-            columns=columns,
-            index=index
-        )
+        return DataFrame._from_arrays(arrays, columns=columns, index=index)
 
     def to_dense(self):
         """

From eda0732ed947de721a32de6dec91040659290646 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 14:14:37 +0100
Subject: [PATCH 05/15] Update doc/source/whatsnew/v1.1.0.rst

Co-Authored-By: Tom Augspurger <TomAugspurger@users.noreply.github.com>
---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 58c8042be4e6e..556d270cf224e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -224,7 +224,7 @@ Performance improvements
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
-- Performance improvement when creating sparse :class:`DataFrame` from
+- Performance improvement when creating a :class:`DataFrame` with sparse values
   ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix`
   constructor (:issue:`32196`).
 

From 40f4cd6e4285c376d8ddcd0890fb3346f344d4a9 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 14:21:19 +0100
Subject: [PATCH 06/15] Improve what's new

---
 doc/source/whatsnew/v1.1.0.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 556d270cf224e..1cb5367f371b8 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -225,8 +225,8 @@ Performance improvements
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
 - Performance improvement when creating a :class:`DataFrame` with sparse values
-  ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix`
-  constructor (:issue:`32196`).
+  from ``scipy.sparse`` matrices using the
+  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32196`).
 
 .. ---------------------------------------------------------------------------
 

From 508fda51f97b372eb157bc42983200fb49e5fda7 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 14:30:34 +0100
Subject: [PATCH 07/15] Add random state

---
 asv_bench/benchmarks/sparse.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index ac78ca53679fd..418614a2d2b98 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -45,8 +45,7 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
 class SparseDataFrameConstructor:
     def setup(self):
         N = 1000
-        self.arr = np.arange(N)
-        self.sparse = scipy.sparse.rand(N, N, 0.005)
+        self.sparse = scipy.sparse.rand(N, N, 0.005, random_state=0)
 
     def time_from_scipy(self):
         pd.DataFrame.sparse.from_spmatrix(self.sparse)

From 00538170efac0259770914d1939024922d7e249c Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 15:10:26 +0100
Subject: [PATCH 08/15] Add inline comment about sort_indices

---
 pandas/core/arrays/sparse/accessor.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 1cc955950003f..4f4d7ef5a4112 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -233,6 +233,10 @@ def from_spmatrix(cls, data, index=None, columns=None):
         data = data.tocsc()
         index, columns = cls._prep_index(data, index, columns)
         n_rows, n_columns = data.shape
+        # We need to make sure indices are sorted, as we create
+        # IntIndex with no input validation (i.e. check_integrity=False).
+        # Indices may already be sorted in scipy in which case this adds
+        # a small overhead.
         data.sort_indices()
         indices = data.indices
         indptr = data.indptr

From 42792a396dc51b5cfd1a75c30207ccd150e20341 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 16:31:09 +0100
Subject: [PATCH 09/15] Use absolute import

---
 pandas/core/arrays/sparse/accessor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 4f4d7ef5a4112..3ae5badfb6bf4 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -227,8 +227,8 @@ def from_spmatrix(cls, data, index=None, columns=None):
         1  0.0  1.0  0.0
         2  0.0  0.0  1.0
         """
-        from pandas import DataFrame, SparseDtype
-        from . import IntIndex, SparseArray
+        from pandas import DataFrame
+        from pandas._libs.sparse import IntIndex
 
         data = data.tocsc()
         index, columns = cls._prep_index(data, index, columns)

From c8f7abc42a2ed56e91473e46fbaf4cb24ca7448c Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 20:34:15 +0100
Subject: [PATCH 10/15] Update pandas/_libs/sparse.pyx

Co-Authored-By: William Ayd <william.ayd@icloud.com>
---
 pandas/_libs/sparse.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index 9208163ee29c3..d853ddf3de7d4 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -42,7 +42,7 @@ cdef class IntIndex(SparseIndex):
         Py_ssize_t length, npoints
         ndarray indices
 
-    def __init__(self, Py_ssize_t length, indices, check_integrity=True):
+    def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
         self.length = length
         self.indices = np.ascontiguousarray(indices, dtype=np.int32)
         self.npoints = len(self.indices)

From e541b0dbcfa57d66dc10466d853338eab8ad4ccb Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 19 Mar 2020 20:37:21 +0100
Subject: [PATCH 11/15] Rename variable

---
 pandas/core/arrays/sparse/accessor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 3ae5badfb6bf4..fdfde9206f34e 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -240,13 +240,13 @@ def from_spmatrix(cls, data, index=None, columns=None):
         data.sort_indices()
         indices = data.indices
         indptr = data.indptr
-        data = data.data
-        dtype = SparseDtype(data.dtype, 0)
+        array_data = data.data
+        dtype = SparseDtype(array_data.dtype, 0)
         arrays = []
         for i in range(n_columns):
             sl = slice(indptr[i], indptr[i + 1])
             idx = IntIndex(n_rows, indices[sl], check_integrity=False)
-            arr = SparseArray._simple_new(data[sl], idx, dtype)
+            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
             arrays.append(arr)
         return DataFrame._from_arrays(arrays, columns=columns, index=index)
 

From ce6619efd3f38f716810e2c55efa3ba284f641ea Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Fri, 20 Mar 2020 21:09:34 +0100
Subject: [PATCH 12/15] Use DataFrame._from_arrays(..., verify_integrity=False)

---
 pandas/core/arrays/sparse/accessor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index fdfde9206f34e..a284d7d481746 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -248,7 +248,9 @@ def from_spmatrix(cls, data, index=None, columns=None):
             idx = IntIndex(n_rows, indices[sl], check_integrity=False)
             arr = SparseArray._simple_new(array_data[sl], idx, dtype)
             arrays.append(arr)
-        return DataFrame._from_arrays(arrays, columns=columns, index=index)
+        return DataFrame._from_arrays(
+            arrays, columns=columns, index=index, verify_integrity=False
+        )
 
     def to_dense(self):
         """

From e063c8a59048486a62d0b9234e4ccd40bd971835 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Fri, 20 Mar 2020 21:10:42 +0100
Subject: [PATCH 13/15] checkout upstream/master --
 asv_bench/benchmarks/sparse.py

---
 asv_bench/benchmarks/sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index 418614a2d2b98..7a09b03648fa7 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -45,7 +45,7 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
 class SparseDataFrameConstructor:
     def setup(self):
         N = 1000
-        self.sparse = scipy.sparse.rand(N, N, 0.005, random_state=0)
+        self.sparse = scipy.sparse.rand(N, N, 0.005)
 
     def time_from_scipy(self):
         pd.DataFrame.sparse.from_spmatrix(self.sparse)

From 20c36856d63d1366a57e37214ec030736652f406 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Sun, 22 Mar 2020 10:38:05 +0100
Subject: [PATCH 14/15] Link to PRs by Joris in what's new

---
 doc/source/whatsnew/v1.1.0.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 7cb45ed553b9b..48b0779a1753a 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -224,9 +224,10 @@ Performance improvements
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
-- Performance improvement when creating a :class:`DataFrame` with sparse values
-  from ``scipy.sparse`` matrices using the
-  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32196`).
+- Significant performance improvement when creating a :class:`DataFrame` with
+  sparse values from ``scipy.sparse`` matrices using the
+  :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
+  :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
 
 .. ---------------------------------------------------------------------------
 

From d750eb45f80902b87a8bfb0ebb3a8fe025146c60 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Sun, 22 Mar 2020 15:11:49 +0100
Subject: [PATCH 15/15] Use ensure_index when the columns/index is provided by
 the user

---
 pandas/core/arrays/sparse/accessor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index a284d7d481746..787407060c7f1 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -329,12 +329,17 @@ def density(self) -> float:
     @staticmethod
     def _prep_index(data, index, columns):
         import pandas.core.indexes.base as ibase
+        from pandas.core.indexes.api import ensure_index
 
         N, K = data.shape
         if index is None:
             index = ibase.default_index(N)
+        else:
+            index = ensure_index(index)
         if columns is None:
             columns = ibase.default_index(K)
+        else:
+            columns = ensure_index(columns)
 
         if len(columns) != K:
             raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")