forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscipy_sparse.py
151 lines (121 loc) · 5.72 KB
/
scipy_sparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Interaction with scipy.sparse matrices.
Currently only includes SparseSeries.to_coo helpers.
"""
from collections import OrderedDict
from pandas.core.index import Index, MultiIndex
from pandas.core.series import Series
def _check_is_partition(parts, whole):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError(
'Is not a partition because intersection is not null.')
if set.union(*parts) != whole:
raise ValueError('Is not a partition because union is not the whole.')
def _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
""" For arbitrary (MultiIndexed) SparseSeries return
(v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
passing to scipy.sparse.coo constructor. """
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# from the SparseSeries: get the labels and data for non-null entries
values = ss._data.internal_values()._valid_sp_values
nonnull_labels = ss.dropna()
def get_indexers(levels):
""" Return sparse coords and dense labels for subset levels """
# TODO: how to do this better? cleanly slice nonnull_labels given the
# coord
values_ilabels = [tuple(x[i] for i in levels)
for x in nonnull_labels.index]
if len(levels) == 1:
values_ilabels = [x[0] for x in values_ilabels]
# # performance issues with groupby ###################################
# TODO: these two lines can rejplace the code below but
# groupby is too slow (in some cases at least)
# labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
# labels_to_i[:] = np.arange(labels_to_i.shape[0])
def _get_label_to_i_dict(labels, sort_labels=False):
""" Return OrderedDict of unique labels to number.
Optionally sort by label.
"""
labels = Index(map(tuple, labels)).unique().tolist() # squish
if sort_labels:
labels = sorted(list(labels))
d = OrderedDict((k, i) for i, k in enumerate(labels))
return (d)
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
labels_to_i = _get_label_to_i_dict(ilabels,
sort_labels=sort_labels)
labels_to_i = Series(labels_to_i)
if len(subset) > 1:
labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
labels_to_i.index.names = [index.names[i] for i in subset]
else:
labels_to_i.index = Index(x[0] for x in labels_to_i.index)
labels_to_i.index.name = index.names[subset[0]]
labels_to_i.name = 'value'
return (labels_to_i)
labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels,
sort_labels=sort_labels)
# #####################################################################
# #####################################################################
i_coord = labels_to_i[values_ilabels].tolist()
i_labels = labels_to_i.index.tolist()
return i_coord, i_labels
i_coord, i_labels = get_indexers(row_levels)
j_coord, j_labels = get_indexers(column_levels)
return values, i_coord, j_coord, i_labels, j_labels
def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
sort_labels=False):
"""
Convert a SparseSeries to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError('to_coo requires MultiIndex with nlevels > 2')
if not ss.index.is_unique:
raise ValueError('Duplicate index entries are not allowed in to_coo '
'transformation.')
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,
column_levels=column_levels,
sort_labels=sort_labels)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns)))
return sparse_matrix, rows, columns
def _coo_to_sparse_series(A, dense_index: bool = False,
sparse_series: bool = True):
"""
Convert a scipy.sparse.coo_matrix to a SparseSeries.
Parameters
----------
A : scipy.sparse.coo.coo_matrix
dense_index : bool, default False
sparse_series : bool, default True
Returns
-------
Series or SparseSeries
"""
from pandas import SparseDtype
s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
s = s.sort_index()
if sparse_series:
# TODO(SparseSeries): remove this and the sparse_series keyword.
# This is just here to avoid a DeprecationWarning when
# _coo_to_sparse_series is called via Series.sparse.from_coo
s = s.to_sparse() # TODO: specify kind?
else:
s = s.astype(SparseDtype(s.dtype))
if dense_index:
# is there a better constructor method to use here?
i = range(A.shape[0])
j = range(A.shape[1])
ind = MultiIndex.from_product([i, j])
s = s.reindex(ind)
return s