From cb8263000292935d84b4aa88f80b8a5f0732dc21 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sat, 7 Aug 2021 19:08:28 +0200 Subject: [PATCH 01/16] PERF: sparse_series_to_coo improvement The performance is improved overall, and most dramatically for a two-level MultiIndex --- pandas/core/arrays/sparse/scipy_sparse.py | 104 ++++++++++------------ 1 file changed, 46 insertions(+), 58 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index f399d3230d897..7ede910144d39 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,10 +3,9 @@ Currently only includes to_coo helpers. """ -from pandas.core.indexes.api import ( - Index, - MultiIndex, -) +import numpy as np + +from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series @@ -19,6 +18,34 @@ def _check_is_partition(parts, whole): raise ValueError("Is not a partition because union is not the whole.") +def _levels_to_axis(levels_codes, levels_labels, valid_ilocs, sort_labels=False): + if sort_labels and levels_codes.shape[0] == 1: + ax_coords = levels_codes[0][valid_ilocs] + ax_labels = levels_labels[0].tolist() + + else: + # Why return_index anyway : https://github.com/numpy/numpy/issues/16923 + ucodes, ucodes_idx, ucodes_inv = np.unique( + levels_codes.T, axis=0, return_index=True, return_inverse=True + ) + + if sort_labels: + ax_coords = ucodes_inv[valid_ilocs] + + else: + og_order = np.argsort(ucodes_idx) + ucodes = ucodes[og_order, :] + ax_coords = og_order.argsort()[ucodes_inv[valid_ilocs]] + + ax_labels = list( + zip( + *(tuple(lbls[ucodes[:, lvl]]) for lvl, lbls in enumerate(levels_labels)) + ) + ) + + return ax_coords, ax_labels + + def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ For arbitrary (MultiIndexed) sparse Series return @@ -27,65 +54,26 @@ def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) - # from the sparse Series: get the labels and data for non-null entries values = ss.array._valid_sp_values - nonnull_labels = ss.dropna() - - def get_indexers(levels): - """Return sparse coords and dense labels for subset levels""" - # TODO: how to do this better? cleanly slice nonnull_labels given the - # coord - values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] - if len(levels) == 1: - values_ilabels = [x[0] for x in values_ilabels] - - # # performance issues with groupby ################################### - # TODO: these two lines can replace the code below but - # groupby is too slow (in some cases at least) - # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first() - # labels_to_i[:] = np.arange(labels_to_i.shape[0]) - - def _get_label_to_i_dict(labels, sort_labels=False): - """ - Return dict of unique labels to number. - Optionally sort by label. - """ - labels = Index(map(tuple, labels)).unique().tolist() # squish - if sort_labels: - labels = sorted(labels) - return {k: i for i, k in enumerate(labels)} - - def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): - ilabels = list(zip(*(index._get_level_values(i) for i in subset))) - labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) - labels_to_i = Series(labels_to_i) - if len(subset) > 1: - labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) - labels_to_i.index.names = [index.names[i] for i in subset] - else: - labels_to_i.index = Index(x[0] for x in labels_to_i.index) - labels_to_i.index.name = index.names[subset[0]] - - labels_to_i.name = "value" - return labels_to_i - - labels_to_i = _get_index_subset_to_coord_dict( - ss.index, levels, sort_labels=sort_labels - ) - # ##################################################################### - # ##################################################################### + codes = ss.index.codes + labels = ss.index.levels + valid_ilocs = np.where(ss.notnull())[0] - i_coord = labels_to_i[values_ilabels].tolist() - i_labels = labels_to_i.index.tolist() - - return i_coord, i_labels + row_labels = [labels[lvl] for lvl in row_levels] + row_codes = np.asarray([codes[lvl] for lvl in row_levels]) + i_coords, i_labels = _levels_to_axis( + row_codes, row_labels, valid_ilocs, sort_labels=sort_labels + ) - i_coord, i_labels = get_indexers(row_levels) - j_coord, j_labels = get_indexers(column_levels) + col_labels = [labels[lvl] for lvl in column_levels] + col_codes = np.asarray([codes[lvl] for lvl in column_levels]) + j_coords, j_labels = _levels_to_axis( + col_codes, col_labels, valid_ilocs, sort_labels=sort_labels + ) - return values, i_coord, j_coord, i_labels, j_labels + return values, i_coords, j_coords, i_labels, j_labels def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): @@ -97,7 +85,7 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa import scipy.sparse if ss.index.nlevels < 2: - raise ValueError("to_coo requires MultiIndex with nlevels > 2") + raise ValueError("to_coo requires MultiIndex with nlevels >= 2.") if not ss.index.is_unique: raise ValueError( "Duplicate index entries are not allowed in to_coo transformation." From 91560ea39af6c1b67cfed52ec8cbe71c9e8a0aa6 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sat, 7 Aug 2021 19:10:01 +0200 Subject: [PATCH 02/16] Extend benchmark to two-level MultiIndex case --- asv_bench/benchmarks/sparse.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 35e5818cd3b2b..7d4cd7fddabf4 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -67,16 +67,28 @@ def time_sparse_series_from_coo(self): class ToCoo: + params = [True, False] + param_names = ["sort_labels"] + def setup(self): s = Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 s[999] = 12.1 - s.index = MultiIndex.from_product([range(10)] * 4) - self.ss = s.astype("Sparse") - def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) + s_mult_lvl = s.set_axis(MultiIndex.from_product([range(10)] * 4)) + self.ss_mult_lvl = s_mult_lvl.astype("Sparse") + + s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2)) + self.ss_two_lvl = s_two_lvl.astype("Sparse") + + def time_sparse_series_to_coo(self, sort_labels): + self.ss_mult_lvl.sparse.to_coo( + row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels + ) + + def time_sparse_series_to_coo_single_level(self, sort_labels): + self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels) class Arithmetic: From 042dbbc3b0b99e20b53553a1e669d569c0b0e0c6 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sat, 7 Aug 2021 19:11:42 +0200 Subject: [PATCH 03/16] Test more properly the output --- pandas/tests/arrays/sparse/test_array.py | 39 +++++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 1cc8a2df44812..082704834e9fd 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1198,13 +1198,42 @@ def test_from_coo(self): def test_to_coo(self): import scipy.sparse - ser = pd.Series( - [1, 2, 3], - index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=["a", "b"]), - dtype="Sparse[int]", + s = pd.Series([np.nan] * 6) + s[2] = 1 + s[5] = 3 + s.index = pd.MultiIndex.from_tuples( + [ + ("b", 2, "z", 1), + ("a", 2, "z", 2), + ("a", 2, "z", 1), + ("a", 2, "x", 2), + ("b", 1, "z", 1), + ("a", 1, "z", 0), + ] + ) + ss = s.astype("Sparse") + + expected_A = np.zeros((4, 4)) + expected_A[1, 0] = 1 + expected_A[3, 3] = 3 + A, rows, cols = ss.sparse.to_coo( + row_levels=(0, 1), column_levels=(2, 3), sort_labels=False + ) + assert isinstance(A, scipy.sparse.coo.coo_matrix) + assert np.all(A.toarray() == expected_A) + assert rows == [("b", 2), ("a", 2), ("b", 1), ("a", 1)] + assert cols == [("z", 1), ("z", 2), ("x", 2), ("z", 0)] + + expected_A = np.zeros((4, 4)) + expected_A[1, 2] = 1 + expected_A[0, 1] = 3 + A, rows, cols = ss.sparse.to_coo( + row_levels=(0, 1), column_levels=(2, 3), sort_labels=True ) - A, _, _ = ser.sparse.to_coo() assert isinstance(A, scipy.sparse.coo.coo_matrix) + assert np.all(A.toarray() == expected_A) + assert rows == [("a", 1), ("a", 2), ("b", 1), ("b", 2)] + assert cols == [("x", 2), ("z", 0), ("z", 1), ("z", 2)] def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3]) From db7ebf915b75f66a1fa8afaf2ce5409a5fffd75a Mon Sep 17 00:00:00 2001 From: TLouf Date: Sat, 7 Aug 2021 19:24:37 +0200 Subject: [PATCH 04/16] Fix benchmark --- asv_bench/benchmarks/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 7d4cd7fddabf4..c8c1a962e6861 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -70,7 +70,7 @@ class ToCoo: params = [True, False] param_names = ["sort_labels"] - def setup(self): + def setup(self, sort_labels): s = Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 From e09d760abf852b723ab0c1d18ef07581dd385918 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sun, 8 Aug 2021 12:17:34 +0200 Subject: [PATCH 05/16] use pd.factorize --- pandas/core/arrays/sparse/scipy_sparse.py | 41 ++++++----------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 7ede910144d39..6f6fe06d925c2 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -5,6 +5,7 @@ """ import numpy as np +from pandas.core.algorithms import factorize from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series @@ -18,31 +19,17 @@ def _check_is_partition(parts, whole): raise ValueError("Is not a partition because union is not the whole.") -def _levels_to_axis(levels_codes, levels_labels, valid_ilocs, sort_labels=False): - if sort_labels and levels_codes.shape[0] == 1: - ax_coords = levels_codes[0][valid_ilocs] - ax_labels = levels_labels[0].tolist() +def _levels_to_axis(ss, levels, valid_ilocs, sort_labels=False): + if sort_labels and len(levels) == 1: + ax_coords = ss.index.codes[levels[0]][valid_ilocs] + ax_labels = ss.index.levels[levels[0]] else: - # Why return_index anyway : https://github.com/numpy/numpy/issues/16923 - ucodes, ucodes_idx, ucodes_inv = np.unique( - levels_codes.T, axis=0, return_index=True, return_inverse=True - ) - - if sort_labels: - ax_coords = ucodes_inv[valid_ilocs] - - else: - og_order = np.argsort(ucodes_idx) - ucodes = ucodes[og_order, :] - ax_coords = og_order.argsort()[ucodes_inv[valid_ilocs]] - - ax_labels = list( - zip( - *(tuple(lbls[ucodes[:, lvl]]) for lvl, lbls in enumerate(levels_labels)) - ) - ) + levels_values = list(zip(*(ss.index.get_level_values(lvl) for lvl in levels))) + codes, ax_labels = factorize(levels_values, sort=sort_labels) + ax_coords = codes[valid_ilocs] + ax_labels = ax_labels.tolist() return ax_coords, ax_labels @@ -57,20 +44,14 @@ def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): # from the sparse Series: get the labels and data for non-null entries values = ss.array._valid_sp_values - codes = ss.index.codes - labels = ss.index.levels valid_ilocs = np.where(ss.notnull())[0] - row_labels = [labels[lvl] for lvl in row_levels] - row_codes = np.asarray([codes[lvl] for lvl in row_levels]) i_coords, i_labels = _levels_to_axis( - row_codes, row_labels, valid_ilocs, sort_labels=sort_labels + ss, row_levels, valid_ilocs, sort_labels=sort_labels ) - col_labels = [labels[lvl] for lvl in column_levels] - col_codes = np.asarray([codes[lvl] for lvl in column_levels]) j_coords, j_labels = _levels_to_axis( - col_codes, col_labels, valid_ilocs, sort_labels=sort_labels + ss, column_levels, valid_ilocs, sort_labels=sort_labels ) return values, i_coords, j_coords, i_labels, j_labels From 55a7d32bec8b9c47984d1e4c27d7eae31b3982c9 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sun, 8 Aug 2021 12:55:11 +0200 Subject: [PATCH 06/16] parameterie test over sort_labels --- pandas/tests/arrays/sparse/test_array.py | 41 +++++++++++++++--------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 082704834e9fd..5eb6ae67b7089 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1195,7 +1195,26 @@ def test_from_coo(self): tm.assert_series_equal(result, expected) @td.skip_if_no_scipy - def test_to_coo(self): + @pytest.mark.parametrize( + "sort_labels, expected_rows, expected_cols, expected_values_pos", + [ + ( + False, + [("b", 2), ("a", 2), ("b", 1), ("a", 1)], + [("z", 1), ("z", 2), ("x", 2), ("z", 0)], + {1: (1, 0), 3: (3, 3)}, + ), + ( + True, + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], + [("x", 2), ("z", 0), ("z", 1), ("z", 2)], + {1: (1, 2), 3: (0, 1)}, + ), + ], + ) + def test_to_coo( + self, sort_labels, expected_rows, expected_cols, expected_values_pos + ): import scipy.sparse s = pd.Series([np.nan] * 6) @@ -1214,26 +1233,16 @@ def test_to_coo(self): ss = s.astype("Sparse") expected_A = np.zeros((4, 4)) - expected_A[1, 0] = 1 - expected_A[3, 3] = 3 - A, rows, cols = ss.sparse.to_coo( - row_levels=(0, 1), column_levels=(2, 3), sort_labels=False - ) - assert isinstance(A, scipy.sparse.coo.coo_matrix) - assert np.all(A.toarray() == expected_A) - assert rows == [("b", 2), ("a", 2), ("b", 1), ("a", 1)] - assert cols == [("z", 1), ("z", 2), ("x", 2), ("z", 0)] + for value, (row, col) in expected_values_pos.items(): + expected_A[row, col] = value - expected_A = np.zeros((4, 4)) - expected_A[1, 2] = 1 - expected_A[0, 1] = 3 A, rows, cols = ss.sparse.to_coo( - row_levels=(0, 1), column_levels=(2, 3), sort_labels=True + row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels ) assert isinstance(A, scipy.sparse.coo.coo_matrix) assert np.all(A.toarray() == expected_A) - assert rows == [("a", 1), ("a", 2), ("b", 1), ("b", 2)] - assert cols == [("x", 2), ("z", 0), ("z", 1), ("z", 2)] + assert rows == expected_rows + assert cols == expected_cols def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3]) From e013e82cf7bc3280412029a2eefa6b17866b09ce Mon Sep 17 00:00:00 2001 From: TLouf Date: Sun, 8 Aug 2021 13:17:22 +0200 Subject: [PATCH 07/16] add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7395f9d2dcb9e..af7e1e4914685 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -174,6 +174,7 @@ Performance improvements - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) - Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) - Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) +- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) - .. --------------------------------------------------------------------------- From 621b1028e639a564431630472173e8421165c05f Mon Sep 17 00:00:00 2001 From: TLouf Date: Thu, 12 Aug 2021 16:21:13 +0200 Subject: [PATCH 08/16] speedup of factorize with fast_zip --- pandas/core/arrays/sparse/scipy_sparse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 6f6fe06d925c2..3ce62c1608d72 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -5,6 +5,8 @@ """ import numpy as np +from pandas._libs import lib + from pandas.core.algorithms import factorize from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series @@ -25,7 +27,9 @@ def _levels_to_axis(ss, levels, valid_ilocs, sort_labels=False): ax_labels = ss.index.levels[levels[0]] else: - levels_values = list(zip(*(ss.index.get_level_values(lvl) for lvl in levels))) + levels_values = lib.fast_zip( + [ss.index.get_level_values(lvl).values for lvl in levels] + ) codes, ax_labels = factorize(levels_values, sort=sort_labels) ax_coords = codes[valid_ilocs] From 1fa16cc698008315a6a4edf0f3b8e08493340109 Mon Sep 17 00:00:00 2001 From: TLouf Date: Thu, 12 Aug 2021 19:45:11 +0200 Subject: [PATCH 09/16] add type hints --- pandas/core/arrays/sparse/scipy_sparse.py | 49 ++++++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 3ce62c1608d72..839dfcfebd4e0 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,16 +3,30 @@ Currently only includes to_coo helpers. """ +from typing import ( + TYPE_CHECKING, + Iterable, + List, + Tuple, +) + import numpy as np from pandas._libs import lib +from pandas._typing import ( + IndexLabel, + npt, +) from pandas.core.algorithms import factorize from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series +if TYPE_CHECKING: + import scipy.sparse + -def _check_is_partition(parts, whole): +def _check_is_partition(parts: Iterable, whole: Iterable): whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): @@ -21,7 +35,12 @@ def _check_is_partition(parts, whole): raise ValueError("Is not a partition because union is not the whole.") -def _levels_to_axis(ss, levels, valid_ilocs, sort_labels=False): +def _levels_to_axis( + ss, + levels: Tuple[int] | List[int], + valid_ilocs: np.ndarray, + sort_labels: bool = False, +) -> Tuple[npt.NDArray[np.intp], List[IndexLabel]]: if sort_labels and len(levels) == 1: ax_coords = ss.index.codes[levels[0]][valid_ilocs] ax_labels = ss.index.levels[levels[0]] @@ -37,7 +56,18 @@ def _levels_to_axis(ss, levels, valid_ilocs, sort_labels=False): return ax_coords, ax_labels -def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def _to_ijv( + ss, + row_levels: Tuple[int] | List[int] = (0,), + column_levels: Tuple[int] | List[int] = (1,), + sort_labels: bool = False, +) -> Tuple[ + np.ndarray, + npt.NDArray[np.intp], + npt.NDArray[np.intp], + List[IndexLabel], + List[IndexLabel], +]: """ For arbitrary (MultiIndexed) sparse Series return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for @@ -61,7 +91,12 @@ def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): return values, i_coords, j_coords, i_labels, j_labels -def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def sparse_series_to_coo( + ss: Series, + row_levels: Iterable[int] = (0,), + column_levels: Iterable[int] = (1,), + sort_labels: bool = False, +) -> Tuple[scipy.sparse.coo_matrix, List[IndexLabel], List[IndexLabel]]: """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -89,14 +124,16 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa return sparse_matrix, rows, columns -def coo_to_sparse_series(A, dense_index: bool = False): +def coo_to_sparse_series( + A: scipy.sparse.coo_matrix, dense_index: bool = False +) -> Series: """ Convert a scipy.sparse.coo_matrix to a SparseSeries. Parameters ---------- A : scipy.sparse.coo.coo_matrix - dense_index : bool, default False + dense_index : bool, default FalseSparseDtype Returns ------- From a5f093d164029e1786b76adb91a0ae929ed7b4d0 Mon Sep 17 00:00:00 2001 From: TLouf Date: Thu, 12 Aug 2021 20:06:21 +0200 Subject: [PATCH 10/16] add docstring and comment on simple case --- pandas/core/arrays/sparse/scipy_sparse.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 839dfcfebd4e0..199f72f385d79 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -41,6 +41,15 @@ def _levels_to_axis( valid_ilocs: np.ndarray, sort_labels: bool = False, ) -> Tuple[npt.NDArray[np.intp], List[IndexLabel]]: + """ + For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`, + where `ax_coords` are the coordinates along one of the two axes of the + destination sparse matrix, and `ax_labels` are the labels from `ss`' Index + which correspond to these coordinates. + """ + # Since the labels are sorted in `Index.levels`, when we wish to sort and + # there is only one level of the MultiIndex for this axis, the desired + # output can be obtained in the following simpler, more efficient way. if sort_labels and len(levels) == 1: ax_coords = ss.index.codes[levels[0]][valid_ilocs] ax_labels = ss.index.levels[levels[0]] @@ -69,9 +78,10 @@ def _to_ijv( List[IndexLabel], ]: """ - For arbitrary (MultiIndexed) sparse Series return - (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for - passing to scipy.sparse.coo constructor. + For arbitrary (MultiIndexed) sparse Series return (v, i, j, ilabels, + jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo + constructor, and ilabels and jlabels are the row and column labels + respectively. """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) From 69f8651526446b4770708c427a2c2afd602a3cd2 Mon Sep 17 00:00:00 2001 From: TLouf Date: Mon, 23 Aug 2021 12:47:37 +0200 Subject: [PATCH 11/16] add params and returns in docstrings --- pandas/core/arrays/sparse/scipy_sparse.py | 34 ++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 199f72f385d79..ea4281204068f 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -46,6 +46,20 @@ def _levels_to_axis( where `ax_coords` are the coordinates along one of the two axes of the destination sparse matrix, and `ax_labels` are the labels from `ss`' Index which correspond to these coordinates. + + Parameters + ---------- + ss : Series + levels : tuple/list + valid_ilocs : numpy.ndarray + Array of integer positions of valid values for the sparse matrix in ss. + sort_labels : bool, default False + Sort the axis labels before forming the sparse matrix. + + Returns + ------- + ax_coords : numpy.ndarray (axis coordinates) + ax_labels : list (axis labels) """ # Since the labels are sorted in `Index.levels`, when we wish to sort and # there is only one level of the MultiIndex for this axis, the desired @@ -78,10 +92,28 @@ def _to_ijv( List[IndexLabel], ]: """ - For arbitrary (MultiIndexed) sparse Series return (v, i, j, ilabels, + For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor, and ilabels and jlabels are the row and column labels respectively. + + Parameters + ---------- + ss : Series + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + values : numpy.ndarray + Valid values to populate a sparse matrix, extracted from + ss. + i_coords : numpy.ndarray (row coordinates of the values) + j_coords : numpy.ndarray (column coordinates of the values) + i_labels : list (row labels) + j_labels : list (column labels) """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) From 14cb2ccfd1ee3ba7923cb8d21bccef00b146a288 Mon Sep 17 00:00:00 2001 From: TLouf Date: Tue, 24 Aug 2021 13:54:56 +0200 Subject: [PATCH 12/16] add sort_labels performance tip in docstrings --- pandas/core/arrays/sparse/accessor.py | 2 ++ pandas/core/arrays/sparse/scipy_sparse.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8efdfb719bbfa..f3eccd6aad444 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -113,6 +113,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): column_levels : tuple/list sort_labels : bool, default False Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. Returns ------- diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index ea4281204068f..8e1dcfdd10926 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -54,7 +54,8 @@ def _levels_to_axis( valid_ilocs : numpy.ndarray Array of integer positions of valid values for the sparse matrix in ss. sort_labels : bool, default False - Sort the axis labels before forming the sparse matrix. + Sort the axis labels before forming the sparse matrix. When `levels` + refers to a single level, set to True for a faster execution. Returns ------- @@ -104,6 +105,8 @@ def _to_ijv( column_levels : tuple/list sort_labels : bool, default False Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. Returns ------- From 7b2d13b10ade7fc7932f85a81f27e98fcd47ec19 Mon Sep 17 00:00:00 2001 From: TLouf Date: Sun, 29 Aug 2021 11:59:54 +0200 Subject: [PATCH 13/16] fix failing tests with future annotations import --- pandas/core/arrays/sparse/scipy_sparse.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 8e1dcfdd10926..162f704a3c1f0 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,11 +3,11 @@ Currently only includes to_coo helpers. """ +from __future__ import annotations + from typing import ( TYPE_CHECKING, Iterable, - List, - Tuple, ) import numpy as np @@ -37,10 +37,10 @@ def _check_is_partition(parts: Iterable, whole: Iterable): def _levels_to_axis( ss, - levels: Tuple[int] | List[int], - valid_ilocs: np.ndarray, + levels: tuple[int] | list[int], + valid_ilocs: npt.NDArray[np.intp], sort_labels: bool = False, -) -> Tuple[npt.NDArray[np.intp], List[IndexLabel]]: +) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]: """ For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`, where `ax_coords` are the coordinates along one of the two axes of the @@ -82,15 +82,15 @@ def _levels_to_axis( def _to_ijv( ss, - row_levels: Tuple[int] | List[int] = (0,), - column_levels: Tuple[int] | List[int] = (1,), + row_levels: tuple[int] | list[int] = (0,), + column_levels: tuple[int] | list[int] = (1,), sort_labels: bool = False, -) -> Tuple[ +) -> tuple[ np.ndarray, npt.NDArray[np.intp], npt.NDArray[np.intp], - List[IndexLabel], - List[IndexLabel], + list[IndexLabel], + list[IndexLabel], ]: """ For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels, @@ -141,7 +141,7 @@ def sparse_series_to_coo( row_levels: Iterable[int] = (0,), column_levels: Iterable[int] = (1,), sort_labels: bool = False, -) -> Tuple[scipy.sparse.coo_matrix, List[IndexLabel], List[IndexLabel]]: +) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]: """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column From 2d4a49619e6bc629e7475568f56a2f8965250c64 Mon Sep 17 00:00:00 2001 From: TLouf Date: Tue, 31 Aug 2021 10:26:05 +0200 Subject: [PATCH 14/16] trim trailing whitespace --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b08f130dd90b2..283437dac0aa5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -247,7 +247,7 @@ Performance improvements - Performance improvement in :func:`read_stata` (:issue:`43059`) - Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`) - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) -- +- .. --------------------------------------------------------------------------- From 6c0ab67003701470a905a8e6fcc5cd9f7c572880 Mon Sep 17 00:00:00 2001 From: TLouf Date: Wed, 1 Sep 2021 11:04:28 +0200 Subject: [PATCH 15/16] fix for sparse series with notna fill_value --- pandas/core/arrays/sparse/scipy_sparse.py | 12 ++++++++---- pandas/tests/arrays/sparse/test_array.py | 10 ++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 162f704a3c1f0..eaa33d35be446 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -18,6 +18,8 @@ npt, ) +from pandas.core.dtypes.missing import notna + from pandas.core.algorithms import factorize from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series @@ -120,10 +122,12 @@ def _to_ijv( """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) - # from the sparse Series: get the labels and data for non-null entries - values = ss.array._valid_sp_values - - valid_ilocs = np.where(ss.notnull())[0] + # From the sparse Series, get the integer indices and data for valid sparse + # entries. + sp_vals = ss.array.sp_values + na_mask = notna(sp_vals) + values = sp_vals[na_mask] + valid_ilocs = ss.array.sp_index.indices[na_mask] i_coords, i_labels = _levels_to_axis( ss, row_levels, valid_ilocs, sort_labels=sort_labels diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f814ee49d6c46..04c80354036f6 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1217,10 +1217,8 @@ def test_to_coo( ): import scipy.sparse - s = pd.Series([np.nan] * 6) - s[2] = 1 - s[5] = 3 - s.index = pd.MultiIndex.from_tuples( + values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0) + index = pd.MultiIndex.from_tuples( [ ("b", 2, "z", 1), ("a", 2, "z", 2), @@ -1230,7 +1228,7 @@ def test_to_coo( ("a", 1, "z", 0), ] ) - ss = s.astype("Sparse") + ss = pd.Series(values, index=index) expected_A = np.zeros((4, 4)) for value, (row, col) in expected_values_pos.items(): @@ -1240,7 +1238,7 @@ def test_to_coo( row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels ) assert isinstance(A, scipy.sparse.coo.coo_matrix) - assert np.all(A.toarray() == expected_A) + np.testing.assert_array_equal(A.toarray(), expected_A) assert rows == expected_rows assert cols == expected_cols From 2acfe466db7615c8f5e2425defa17fcd34e2f628 Mon Sep 17 00:00:00 2001 From: TLouf Date: Wed, 1 Sep 2021 17:05:03 +0200 Subject: [PATCH 16/16] fix typo --- pandas/core/arrays/sparse/scipy_sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index eaa33d35be446..3f69321ae98a6 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -182,7 +182,7 @@ def coo_to_sparse_series( Parameters ---------- A : scipy.sparse.coo.coo_matrix - dense_index : bool, default FalseSparseDtype + dense_index : bool, default False Returns -------