3
3
4
4
Currently only includes to_coo helpers.
5
5
"""
6
- from pandas .core .indexes .api import (
7
- Index ,
8
- MultiIndex ,
6
+ from __future__ import annotations
7
+
8
+ from typing import (
9
+ TYPE_CHECKING ,
10
+ Iterable ,
11
+ )
12
+
13
+ import numpy as np
14
+
15
+ from pandas ._libs import lib
16
+ from pandas ._typing import (
17
+ IndexLabel ,
18
+ npt ,
9
19
)
20
+
21
+ from pandas .core .dtypes .missing import notna
22
+
23
+ from pandas .core .algorithms import factorize
24
+ from pandas .core .indexes .api import MultiIndex
10
25
from pandas .core .series import Series
11
26
27
+ if TYPE_CHECKING :
28
+ import scipy .sparse
29
+
12
30
13
- def _check_is_partition (parts , whole ):
31
+ def _check_is_partition (parts : Iterable , whole : Iterable ):
14
32
whole = set (whole )
15
33
parts = [set (x ) for x in parts ]
16
34
if set .intersection (* parts ) != set ():
@@ -19,76 +37,115 @@ def _check_is_partition(parts, whole):
19
37
raise ValueError ("Is not a partition because union is not the whole." )
20
38
21
39
22
- def _to_ijv (ss , row_levels = (0 ,), column_levels = (1 ,), sort_labels = False ):
23
- """
24
- For arbitrary (MultiIndexed) sparse Series return
25
- (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
26
- passing to scipy.sparse.coo constructor.
40
+ def _levels_to_axis (
41
+ ss ,
42
+ levels : tuple [int ] | list [int ],
43
+ valid_ilocs : npt .NDArray [np .intp ],
44
+ sort_labels : bool = False ,
45
+ ) -> tuple [npt .NDArray [np .intp ], list [IndexLabel ]]:
27
46
"""
28
- # index and column levels must be a partition of the index
29
- _check_is_partition ([row_levels , column_levels ], range (ss .index .nlevels ))
47
+ For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
48
+ where `ax_coords` are the coordinates along one of the two axes of the
49
+ destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
50
+ which correspond to these coordinates.
51
+
52
+ Parameters
53
+ ----------
54
+ ss : Series
55
+ levels : tuple/list
56
+ valid_ilocs : numpy.ndarray
57
+ Array of integer positions of valid values for the sparse matrix in ss.
58
+ sort_labels : bool, default False
59
+ Sort the axis labels before forming the sparse matrix. When `levels`
60
+ refers to a single level, set to True for a faster execution.
30
61
31
- # from the sparse Series: get the labels and data for non-null entries
32
- values = ss .array ._valid_sp_values
33
-
34
- nonnull_labels = ss .dropna ()
35
-
36
- def get_indexers (levels ):
37
- """Return sparse coords and dense labels for subset levels"""
38
- # TODO: how to do this better? cleanly slice nonnull_labels given the
39
- # coord
40
- values_ilabels = [tuple (x [i ] for i in levels ) for x in nonnull_labels .index ]
41
- if len (levels ) == 1 :
42
- values_ilabels = [x [0 ] for x in values_ilabels ]
43
-
44
- # # performance issues with groupby ###################################
45
- # TODO: these two lines can replace the code below but
46
- # groupby is too slow (in some cases at least)
47
- # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
48
- # labels_to_i[:] = np.arange(labels_to_i.shape[0])
49
-
50
- def _get_label_to_i_dict (labels , sort_labels = False ):
51
- """
52
- Return dict of unique labels to number.
53
- Optionally sort by label.
54
- """
55
- labels = Index (map (tuple , labels )).unique ().tolist () # squish
56
- if sort_labels :
57
- labels = sorted (labels )
58
- return {k : i for i , k in enumerate (labels )}
59
-
60
- def _get_index_subset_to_coord_dict (index , subset , sort_labels = False ):
61
- ilabels = list (zip (* (index ._get_level_values (i ) for i in subset )))
62
- labels_to_i = _get_label_to_i_dict (ilabels , sort_labels = sort_labels )
63
- labels_to_i = Series (labels_to_i )
64
- if len (subset ) > 1 :
65
- labels_to_i .index = MultiIndex .from_tuples (labels_to_i .index )
66
- labels_to_i .index .names = [index .names [i ] for i in subset ]
67
- else :
68
- labels_to_i .index = Index (x [0 ] for x in labels_to_i .index )
69
- labels_to_i .index .name = index .names [subset [0 ]]
70
-
71
- labels_to_i .name = "value"
72
- return labels_to_i
73
-
74
- labels_to_i = _get_index_subset_to_coord_dict (
75
- ss .index , levels , sort_labels = sort_labels
62
+ Returns
63
+ -------
64
+ ax_coords : numpy.ndarray (axis coordinates)
65
+ ax_labels : list (axis labels)
66
+ """
67
+ # Since the labels are sorted in `Index.levels`, when we wish to sort and
68
+ # there is only one level of the MultiIndex for this axis, the desired
69
+ # output can be obtained in the following simpler, more efficient way.
70
+ if sort_labels and len (levels ) == 1 :
71
+ ax_coords = ss .index .codes [levels [0 ]][valid_ilocs ]
72
+ ax_labels = ss .index .levels [levels [0 ]]
73
+
74
+ else :
75
+ levels_values = lib .fast_zip (
76
+ [ss .index .get_level_values (lvl ).values for lvl in levels ]
76
77
)
77
- # #####################################################################
78
- # #####################################################################
78
+ codes , ax_labels = factorize (levels_values , sort = sort_labels )
79
+ ax_coords = codes [valid_ilocs ]
80
+
81
+ ax_labels = ax_labels .tolist ()
82
+ return ax_coords , ax_labels
83
+
84
+
85
def _to_ijv(
    ss,
    row_levels: tuple[int, ...] | list[int] = (0,),
    column_levels: tuple[int, ...] | list[int] = (1,),
    sort_labels: bool = False,
) -> tuple[
    np.ndarray,
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
    list[IndexLabel],
    list[IndexLabel],
]:
    """
    For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
    jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
    constructor, and ilabels and jlabels are the row and column labels
    respectively.

    Parameters
    ----------
    ss : Series
    row_levels : tuple/list
        Integer positions of the index levels that form the matrix rows.
    column_levels : tuple/list
        Integer positions of the index levels that form the matrix columns.
    sort_labels : bool, default False
        Sort the row and column labels before forming the sparse matrix.
        When `row_levels` and/or `column_levels` refer to a single level,
        set to `True` for a faster execution.

    Returns
    -------
    values : numpy.ndarray
        Valid values to populate a sparse matrix, extracted from
        ss.
    i_coords : numpy.ndarray (row coordinates of the values)
    j_coords : numpy.ndarray (column coordinates of the values)
    i_labels : list (row labels)
    j_labels : list (column labels)
    """
    # Row and column levels together must be a partition of the index levels.
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))

    # From the sparse Series, keep only the stored entries that are not NA,
    # together with their integer positions in `ss`.
    sp_vals = ss.array.sp_values
    valid_mask = notna(sp_vals)
    values = sp_vals[valid_mask]
    valid_ilocs = ss.array.sp_index.indices[valid_mask]

    i_coords, i_labels = _levels_to_axis(
        ss, row_levels, valid_ilocs, sort_labels=sort_labels
    )
    j_coords, j_labels = _levels_to_axis(
        ss, column_levels, valid_ilocs, sort_labels=sort_labels
    )

    return values, i_coords, j_coords, i_labels, j_labels
89
141
90
142
91
- def sparse_series_to_coo (ss , row_levels = (0 ,), column_levels = (1 ,), sort_labels = False ):
143
+ def sparse_series_to_coo (
144
+ ss : Series ,
145
+ row_levels : Iterable [int ] = (0 ,),
146
+ column_levels : Iterable [int ] = (1 ,),
147
+ sort_labels : bool = False ,
148
+ ) -> tuple [scipy .sparse .coo_matrix , list [IndexLabel ], list [IndexLabel ]]:
92
149
"""
93
150
Convert a sparse Series to a scipy.sparse.coo_matrix using index
94
151
levels row_levels, column_levels as the row and column
@@ -97,7 +154,7 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa
97
154
import scipy .sparse
98
155
99
156
if ss .index .nlevels < 2 :
100
- raise ValueError ("to_coo requires MultiIndex with nlevels > 2 " )
157
+ raise ValueError ("to_coo requires MultiIndex with nlevels >= 2. " )
101
158
if not ss .index .is_unique :
102
159
raise ValueError (
103
160
"Duplicate index entries are not allowed in to_coo transformation."
@@ -116,7 +173,9 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa
116
173
return sparse_matrix , rows , columns
117
174
118
175
119
- def coo_to_sparse_series (A , dense_index : bool = False ):
176
+ def coo_to_sparse_series (
177
+ A : scipy .sparse .coo_matrix , dense_index : bool = False
178
+ ) -> Series :
120
179
"""
121
180
Convert a scipy.sparse.coo_matrix to a SparseSeries.
122
181
0 commit comments