PERF: Sparse Series to scipy COO sparse matrix (#42925)

TLouf · web-flow · commit 37bd4dc7b957 · 2021-09-04T21:45:00.000-04:00
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
@@ -67,16 +67,28 @@ def time_sparse_series_from_coo(self):
 
 
 class ToCoo:
-    def setup(self):
+    params = [True, False]
+    param_names = ["sort_labels"]
+
+    def setup(self, sort_labels):
         s = Series([np.nan] * 10000)
         s[0] = 3.0
         s[100] = -1.0
         s[999] = 12.1
-        s.index = MultiIndex.from_product([range(10)] * 4)
-        self.ss = s.astype("Sparse")
 
-    def time_sparse_series_to_coo(self):
-        self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
+        s_mult_lvl = s.set_axis(MultiIndex.from_product([range(10)] * 4))
+        self.ss_mult_lvl = s_mult_lvl.astype("Sparse")
+
+        s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2))
+        self.ss_two_lvl = s_two_lvl.astype("Sparse")
+
+    def time_sparse_series_to_coo(self, sort_labels):
+        self.ss_mult_lvl.sparse.to_coo(
+            row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels
+        )
+
+    def time_sparse_series_to_coo_single_level(self, sort_labels):
+        self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels)
 
 
 class Arithmetic:
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -290,6 +290,8 @@ Performance improvements
 - Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
 - Performance improvement in :func:`read_stata` (:issue:`43059`)
 - Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)
+- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
+-
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -113,6 +113,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
         column_levels : tuple/list
         sort_labels : bool, default False
             Sort the row and column labels before forming the sparse matrix.
+            When `row_levels` and/or `column_levels` refer to a single level,
+            set this to `True` for faster execution.
 
         Returns
         -------
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
@@ -3,14 +3,32 @@
 
 Currently only includes to_coo helpers.
 """
-from pandas.core.indexes.api import (
-    Index,
-    MultiIndex,
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Iterable,
+)
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._typing import (
+    IndexLabel,
+    npt,
 )
+
+from pandas.core.dtypes.missing import notna
+
+from pandas.core.algorithms import factorize
+from pandas.core.indexes.api import MultiIndex
 from pandas.core.series import Series
 
+if TYPE_CHECKING:
+    import scipy.sparse
+
 
-def _check_is_partition(parts, whole):
+def _check_is_partition(parts: Iterable, whole: Iterable):
     whole = set(whole)
     parts = [set(x) for x in parts]
     if set.intersection(*parts) != set():
@@ -19,76 +37,115 @@ def _check_is_partition(parts, whole):
         raise ValueError("Is not a partition because union is not the whole.")
 
 
-def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
-    """
-    For arbitrary (MultiIndexed) sparse Series return
-    (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
-    passing to scipy.sparse.coo constructor.
+def _levels_to_axis(
+    ss,
+    levels: tuple[int] | list[int],
+    valid_ilocs: npt.NDArray[np.intp],
+    sort_labels: bool = False,
+) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
     """
-    # index and column levels must be a partition of the index
-    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
+    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
+    where `ax_coords` are the coordinates along one of the two axes of the
+    destination sparse matrix, and `ax_labels` are the labels from the index
+    of `ss` that correspond to these coordinates.
+
+    Parameters
+    ----------
+    ss : Series
+    levels : tuple/list
+    valid_ilocs : numpy.ndarray
+        Array of integer positions of valid values for the sparse matrix in ss.
+    sort_labels : bool, default False
+        Sort the axis labels before forming the sparse matrix. When `levels`
+        refers to a single level, set this to True for faster execution.
 
-    # from the sparse Series: get the labels and data for non-null entries
-    values = ss.array._valid_sp_values
-
-    nonnull_labels = ss.dropna()
-
-    def get_indexers(levels):
-        """Return sparse coords and dense labels for subset levels"""
-        # TODO: how to do this better? cleanly slice nonnull_labels given the
-        # coord
-        values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index]
-        if len(levels) == 1:
-            values_ilabels = [x[0] for x in values_ilabels]
-
-        # # performance issues with groupby ###################################
-        # TODO: these two lines can replace the code below but
-        # groupby is too slow (in some cases at least)
-        # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
-        # labels_to_i[:] = np.arange(labels_to_i.shape[0])
-
-        def _get_label_to_i_dict(labels, sort_labels=False):
-            """
-            Return dict of unique labels to number.
-            Optionally sort by label.
-            """
-            labels = Index(map(tuple, labels)).unique().tolist()  # squish
-            if sort_labels:
-                labels = sorted(labels)
-            return {k: i for i, k in enumerate(labels)}
-
-        def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
-            ilabels = list(zip(*(index._get_level_values(i) for i in subset)))
-            labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels)
-            labels_to_i = Series(labels_to_i)
-            if len(subset) > 1:
-                labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
-                labels_to_i.index.names = [index.names[i] for i in subset]
-            else:
-                labels_to_i.index = Index(x[0] for x in labels_to_i.index)
-                labels_to_i.index.name = index.names[subset[0]]
-
-            labels_to_i.name = "value"
-            return labels_to_i
-
-        labels_to_i = _get_index_subset_to_coord_dict(
-            ss.index, levels, sort_labels=sort_labels
+    Returns
+    -------
+    ax_coords : numpy.ndarray (axis coordinates)
+    ax_labels : list (axis labels)
+    """
+    # Since the labels are sorted in `MultiIndex.levels`, when we wish to sort
+    # and there is only one level of the MultiIndex for this axis, the desired
+    # output can be obtained in the following simpler, more efficient way.
+    if sort_labels and len(levels) == 1:
+        ax_coords = ss.index.codes[levels[0]][valid_ilocs]
+        ax_labels = ss.index.levels[levels[0]]
+
+    else:
+        levels_values = lib.fast_zip(
+            [ss.index.get_level_values(lvl).values for lvl in levels]
         )
-        # #####################################################################
-        # #####################################################################
+        codes, ax_labels = factorize(levels_values, sort=sort_labels)
+        ax_coords = codes[valid_ilocs]
+
+    ax_labels = ax_labels.tolist()
+    return ax_coords, ax_labels
+
+
+def _to_ijv(
+    ss,
+    row_levels: tuple[int] | list[int] = (0,),
+    column_levels: tuple[int] | list[int] = (1,),
+    sort_labels: bool = False,
+) -> tuple[
+    np.ndarray,
+    npt.NDArray[np.intp],
+    npt.NDArray[np.intp],
+    list[IndexLabel],
+    list[IndexLabel],
+]:
+    """
+    For an arbitrary MultiIndexed sparse Series, return (v, i, j, ilabels,
+    jlabels), where (v, (i, j)) is suitable for passing to the
+    scipy.sparse.coo_matrix constructor, and ilabels and jlabels are the row
+    and column labels respectively.
 
-        i_coord = labels_to_i[values_ilabels].tolist()
-        i_labels = labels_to_i.index.tolist()
+    Parameters
+    ----------
+    ss : Series
+    row_levels : tuple/list
+    column_levels : tuple/list
+    sort_labels : bool, default False
+        Sort the row and column labels before forming the sparse matrix.
+        When `row_levels` and/or `column_levels` refer to a single level,
+        set this to `True` for faster execution.
 
-        return i_coord, i_labels
+    Returns
+    -------
+    values : numpy.ndarray
+        Non-null sparse values of `ss`, used to populate the sparse
+        matrix.
+    i_coords : numpy.ndarray (row coordinates of the values)
+    j_coords : numpy.ndarray (column coordinates of the values)
+    i_labels : list (row labels)
+    j_labels : list (column labels)
+    """
+    # row and column levels must be a partition of the index levels
+    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
+    # From the sparse Series, get the integer indices and data for valid sparse
+    # entries.
+    sp_vals = ss.array.sp_values
+    na_mask = notna(sp_vals)
+    values = sp_vals[na_mask]
+    valid_ilocs = ss.array.sp_index.indices[na_mask]
+
+    i_coords, i_labels = _levels_to_axis(
+        ss, row_levels, valid_ilocs, sort_labels=sort_labels
+    )
 
-    i_coord, i_labels = get_indexers(row_levels)
-    j_coord, j_labels = get_indexers(column_levels)
+    j_coords, j_labels = _levels_to_axis(
+        ss, column_levels, valid_ilocs, sort_labels=sort_labels
+    )
 
-    return values, i_coord, j_coord, i_labels, j_labels
+    return values, i_coords, j_coords, i_labels, j_labels
 
 
-def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
+def sparse_series_to_coo(
+    ss: Series,
+    row_levels: Iterable[int] = (0,),
+    column_levels: Iterable[int] = (1,),
+    sort_labels: bool = False,
+) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
     """
     Convert a sparse Series to a scipy.sparse.coo_matrix using index
     levels row_levels, column_levels as the row and column
@@ -97,7 +154,7 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa
     import scipy.sparse
 
     if ss.index.nlevels < 2:
-        raise ValueError("to_coo requires MultiIndex with nlevels > 2")
+        raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
     if not ss.index.is_unique:
         raise ValueError(
             "Duplicate index entries are not allowed in to_coo transformation."
@@ -116,7 +173,9 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa
     return sparse_matrix, rows, columns
 
 
-def coo_to_sparse_series(A, dense_index: bool = False):
+def coo_to_sparse_series(
+    A: scipy.sparse.coo_matrix, dense_index: bool = False
+) -> Series:
     """
     Convert a scipy.sparse.coo_matrix to a SparseSeries.
 
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -1196,16 +1196,52 @@ def test_from_coo(self):
         tm.assert_series_equal(result, expected)
 
     @td.skip_if_no_scipy
-    def test_to_coo(self):
+    @pytest.mark.parametrize(
+        "sort_labels, expected_rows, expected_cols, expected_values_pos",
+        [
+            (
+                False,
+                [("b", 2), ("a", 2), ("b", 1), ("a", 1)],
+                [("z", 1), ("z", 2), ("x", 2), ("z", 0)],
+                {1: (1, 0), 3: (3, 3)},
+            ),
+            (
+                True,
+                [("a", 1), ("a", 2), ("b", 1), ("b", 2)],
+                [("x", 2), ("z", 0), ("z", 1), ("z", 2)],
+                {1: (1, 2), 3: (0, 1)},
+            ),
+        ],
+    )
+    def test_to_coo(
+        self, sort_labels, expected_rows, expected_cols, expected_values_pos
+    ):
         import scipy.sparse
 
-        ser = pd.Series(
-            [1, 2, 3],
-            index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=["a", "b"]),
-            dtype="Sparse[int]",
+        values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0)
+        index = pd.MultiIndex.from_tuples(
+            [
+                ("b", 2, "z", 1),
+                ("a", 2, "z", 2),
+                ("a", 2, "z", 1),
+                ("a", 2, "x", 2),
+                ("b", 1, "z", 1),
+                ("a", 1, "z", 0),
+            ]
+        )
+        ss = pd.Series(values, index=index)
+
+        expected_A = np.zeros((4, 4))
+        for value, (row, col) in expected_values_pos.items():
+            expected_A[row, col] = value
+
+        A, rows, cols = ss.sparse.to_coo(
+            row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels
         )
-        A, _, _ = ser.sparse.to_coo()
         assert isinstance(A, scipy.sparse.coo.coo_matrix)
+        np.testing.assert_array_equal(A.toarray(), expected_A)
+        assert rows == expected_rows
+        assert cols == expected_cols
 
     def test_non_sparse_raises(self):
         ser = pd.Series([1, 2, 3])