forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscipy_sparse.py
152 lines (124 loc) · 5.26 KB
/
scipy_sparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Interaction with scipy.sparse matrices.
Currently only includes to_coo helpers.
"""
from pandas.core.indexes.api import (
Index,
MultiIndex,
)
from pandas.core.series import Series
def _check_is_partition(parts, whole):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
raise ValueError("Is not a partition because union is not the whole.")
def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
For arbitrary (MultiIndexed) sparse Series return
(v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
passing to scipy.sparse.coo constructor.
"""
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# from the sparse Series: get the labels and data for non-null entries
values = ss.array._valid_sp_values
nonnull_labels = ss.dropna()
def get_indexers(levels):
"""Return sparse coords and dense labels for subset levels"""
# TODO: how to do this better? cleanly slice nonnull_labels given the
# coord
values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index]
if len(levels) == 1:
values_ilabels = [x[0] for x in values_ilabels]
# # performance issues with groupby ###################################
# TODO: these two lines can replace the code below but
# groupby is too slow (in some cases at least)
# labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
# labels_to_i[:] = np.arange(labels_to_i.shape[0])
def _get_label_to_i_dict(labels, sort_labels=False):
"""
Return dict of unique labels to number.
Optionally sort by label.
"""
labels = Index(map(tuple, labels)).unique().tolist() # squish
if sort_labels:
labels = sorted(labels)
return {k: i for i, k in enumerate(labels)}
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels)
labels_to_i = Series(labels_to_i)
if len(subset) > 1:
labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
labels_to_i.index.names = [index.names[i] for i in subset]
else:
labels_to_i.index = Index(x[0] for x in labels_to_i.index)
labels_to_i.index.name = index.names[subset[0]]
labels_to_i.name = "value"
return labels_to_i
labels_to_i = _get_index_subset_to_coord_dict(
ss.index, levels, sort_labels=sort_labels
)
# #####################################################################
# #####################################################################
i_coord = labels_to_i[values_ilabels].tolist()
i_labels = labels_to_i.index.tolist()
return i_coord, i_labels
i_coord, i_labels = get_indexers(row_levels)
j_coord, j_labels = get_indexers(column_levels)
return values, i_coord, j_coord, i_labels, j_labels
def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError("to_coo requires MultiIndex with nlevels > 2")
if not ss.index.is_unique:
raise ValueError(
"Duplicate index entries are not allowed in to_coo transformation."
)
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(
ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns))
)
return sparse_matrix, rows, columns
def coo_to_sparse_series(A, dense_index: bool = False):
"""
Convert a scipy.sparse.coo_matrix to a SparseSeries.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
Returns
-------
Series
Raises
------
TypeError if A is not a coo_matrix
"""
from pandas import SparseDtype
try:
s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
) from err
s = s.sort_index()
s = s.astype(SparseDtype(s.dtype))
if dense_index:
# is there a better constructor method to use here?
i = range(A.shape[0])
j = range(A.shape[1])
ind = MultiIndex.from_product([i, j])
s = s.reindex(ind)
return s