"""
Interaction with scipy.sparse matrices.

Currently only includes SparseSeries.to_coo helpers.
"""
import itertools

import numpy as np

from pandas.compat import OrderedDict
from pandas.core.frame import DataFrame
from pandas.core.index import MultiIndex, Index
from pandas.core.series import Series
from pandas.tools.util import cartesian_product


def _check_is_partition(parts, whole):
    whole = set(whole)
    parts = [set(x) for x in parts]
    if set.intersection(*parts) != set():
        raise ValueError(
            'Is not a partition because intersection is not empty.')
    if set.union(*parts) != whole:
        raise ValueError('Is not a partition because union is not the whole.')
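
# Illustrative sketch (level sets below are made up): the row and column
# level sets must cover every index level exactly once.
#
# >>> _check_is_partition([(0,), (1, 2)], range(3))   # OK, no exception
# >>> _check_is_partition([(0,), (0, 1)], range(2))   # level 0 appears twice
# ValueError: Is not a partition because intersection is not empty.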


def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
    """ For an arbitrary MultiIndexed SparseSeries, return
    (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
    passing to the scipy.sparse.coo_matrix constructor. """
    # row and column levels must be a partition of the index levels
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))

    # from the SparseSeries: get the labels and data for non-null entries
    values = ss._data.values._valid_sp_values

    nonnull_labels = ss.dropna()

    def get_indexers(levels):
        """ Return sparse coords and dense labels for the subset of levels """

        # TODO: how to do this better? cleanly slice nonnull_labels given the
        # coord
        values_ilabels = [tuple(x[i] for i in levels)
                          for x in nonnull_labels.index]
        if len(levels) == 1:
            values_ilabels = [x[0] for x in values_ilabels]
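        # e.g. (labels are illustrative): for an index of [('a', 0), ('b', 1)]
        # and levels=(0,), this leaves values_ilabels == ['a', 'b']; with
        # levels=(0, 1) it would keep the full tuples [('a', 0), ('b', 1)].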

        #######################################################################
        # performance issues with groupby #####################################
        # TODO: these two lines can replace the code below, but
        # groupby is too slow (in some cases at least)
        # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
        # labels_to_i[:] = np.arange(labels_to_i.shape[0])

        def _get_label_to_i_dict(labels, sort_labels=False):
            """ Return an OrderedDict mapping unique labels to numbers.
            Optionally sort by label. """
            labels = Index(map(tuple, labels)).unique().tolist()  # squish
            if sort_labels:
                labels = sorted(labels)
            d = OrderedDict((k, i) for i, k in enumerate(labels))
            return d
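
        # For instance (illustrative), _get_label_to_i_dict([('a',), ('b',),
        # ('a',)]) maps ('a',) -> 0 and ('b',) -> 1; with sort_labels=True the
        # numbering follows the sorted labels instead of first appearance.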

        def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
            def robust_get_level_values(i):
                # if the index has level names (that are not None) use those,
                # else fall back to the level location
                try:
                    return index.get_level_values(index.names[i])
                except KeyError:
                    return index.get_level_values(i)
            ilabels = list(
                zip(*[robust_get_level_values(i) for i in subset]))
            labels_to_i = _get_label_to_i_dict(
                ilabels, sort_labels=sort_labels)
            labels_to_i = Series(labels_to_i)
            labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
            labels_to_i.index.names = [index.names[i] for i in subset]
            labels_to_i.name = 'value'
            return labels_to_i

        labels_to_i = _get_index_subset_to_coord_dict(
            ss.index, levels, sort_labels=sort_labels)
        #######################################################################
        #######################################################################

        i_coord = labels_to_i[values_ilabels].tolist()
        i_labels = labels_to_i.index.tolist()

        return i_coord, i_labels

    i_coord, i_labels = get_indexers(row_levels)
    j_coord, j_labels = get_indexers(column_levels)

    return values, i_coord, j_coord, i_labels, j_labels
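
# A minimal sketch (values and labels below are made up) of what _to_ijv
# hands back, assuming the standard pandas MultiIndex/to_sparse APIs:
#
# >>> import pandas as pd
# >>> s = pd.Series([3.0, 1.0, 2.0],
# ...               index=pd.MultiIndex.from_tuples([('a', 0), ('a', 1),
# ...                                                ('b', 0)]))
# >>> v, i, j, rows, cols = _to_ijv(s.to_sparse())
#
# v holds the non-null values, i/j the integer row/column coordinates
# ([0, 0, 1] and [0, 1, 0] here), and rows/cols the distinct level-0 and
# level-1 labels those coordinates refer to.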


def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,),
                          sort_labels=False):
    """ Convert a SparseSeries to a scipy.sparse.coo_matrix using the index
    levels row_levels and column_levels as the row and column labels,
    respectively. Returns the sparse matrix and the row and column labels. """

    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError('to_coo requires a MultiIndex with nlevels >= 2')
    if not ss.index.is_unique:
        raise ValueError('Duplicate index entries are not allowed in to_coo '
                         'transformation.')

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,
                                     column_levels=column_levels,
                                     sort_labels=sort_labels)
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns)))
    return sparse_matrix, rows, columns
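
# A minimal usage sketch for this helper (data and level names are made up);
# per the module docstring, this is the machinery behind SparseSeries.to_coo:
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> s = pd.Series([3.0, np.nan, 1.0, 3.0],
# ...               index=pd.MultiIndex.from_tuples(
# ...                   [(1, 2, 'a'), (1, 2, 'b'), (2, 1, 'a'), (2, 1, 'b')],
# ...                   names=['A', 'B', 'C']))
# >>> A, rows, columns = _sparse_series_to_coo(s.to_sparse(),
# ...                                          row_levels=('A', 'B'),
# ...                                          column_levels=('C',),
# ...                                          sort_labels=True)
#
# A is a 2x2 scipy.sparse.coo_matrix holding the three non-NaN values;
# rows lists the distinct (A, B) pairs and columns the distinct C labels.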


def _coo_to_sparse_series(A, dense_index=False):
    """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
    Use the defaults given in the SparseSeries constructor. """
    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    s = s.sort_index()
    s = s.to_sparse()  # TODO: specify kind?
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        s = s.reindex_axis(ind)
    return s
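
# A minimal sketch of the reverse direction (the coo_matrix below is made up):
#
# >>> import scipy.sparse
# >>> A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
# ...                             shape=(3, 4))
# >>> ss = _coo_to_sparse_series(A)
#
# ss is a SparseSeries indexed by the (row, col) pairs (0, 2), (0, 3), (1, 0);
# with dense_index=True the index instead covers all 3 * 4 coordinates.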