|
| 1 | +"""Sparse accessor""" |
| 2 | + |
| 3 | +import numpy as np |
| 4 | + |
| 5 | +from pandas.compat._optional import import_optional_dependency |
| 6 | + |
| 7 | +from pandas.core.dtypes.cast import find_common_type |
| 8 | + |
| 9 | +from pandas.core.accessor import PandasDelegate, delegate_names |
| 10 | + |
| 11 | +from .array import SparseArray |
| 12 | +from .dtype import SparseDtype |
| 13 | + |
| 14 | + |
class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    Stores the object the accessor is attached to and asks the subclass to
    validate it. Subclasses must override :meth:`_validate`.
    """

    # Message subclasses use when the wrapped data is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # Keep the reference first so ``_validate`` overrides may rely on
        # ``self._parent`` already being set.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that ``data`` may be used with this accessor.

        Raises
        ------
        NotImplementedError
            Always, in the base class; subclasses provide the real check.
        """
        raise NotImplementedError
| 24 | + |
| 25 | + |
@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for Series with sparse values.

    Exposes selected properties of the underlying sparse array and provides
    conversions to dense data and to/from ``scipy.sparse.coo_matrix``.
    """

    def _validate(self, data):
        # The accessor is only meaningful when the data has a SparseDtype.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        # Delegated properties are read straight off the backing array.
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Same exception type as before, but say what went wrong.
            raise ValueError(
                "Cannot delegate method {name!r}; only 'from_coo' and "
                "'to_coo' are supported".format(name=name)
            )

    @classmethod
    def from_coo(cls, A, dense_index=False):
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index of the result consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index of the result consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse
        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
        ...                       shape=(3, 4))
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  2.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1
           3    2
        1  0    3
        dtype: float64
        BlockIndex
        Block locations: array([0], dtype=int32)
        Block lengths: array([3], dtype=int32)
        """
        from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series
        from pandas import Series

        result = _coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the sparse array and index in a plain Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
        ...                                      (1, 2, 'a', 1),
        ...                                      (1, 1, 'b', 0),
        ...                                      (1, 1, 'b', 1),
        ...                                      (2, 1, 'b', 0),
        ...                                      (2, 1, 'b', 1)],
        ...                                     names=['A', 'B', 'C', 'D'])
        >>> ss = s.astype("Sparse")
        >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
        ...                                     column_levels=['C', 'D'],
        ...                                     sort_labels=True)
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  3.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo

        A, rows, columns = _sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self):
        """
        Convert a Series from sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        Series
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Preserve the index and name of the original Series.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
        )
| 186 | + |
| 187 | + |
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must be sparse for the accessor to be usable.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
        # Build with positional keys first, then attach the requested labels.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0. Entries that are
            not stored in a COO matrix are implicitly zero, so a different
            fill value cannot be represented.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        # Pass a plain list: ``dtypes`` is a Series indexed by column label,
        # so positional access on it can fail for integer labels lacking 0.
        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate columns via ``items()`` instead of label lookup so that
        # duplicate column labels still yield one Series per column.
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")
            row = sp_arr.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check them against ``data``.

        Missing ``index`` or ``columns`` are replaced by default indexes
        sized from ``data.shape``.

        Raises
        ------
        ValueError
            If ``columns`` or ``index`` has a length that differs from the
            corresponding dimension of ``data``.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError(
                "Column length mismatch: {columns} vs. {K}".format(
                    columns=len(columns), K=K
                )
            )
        if len(index) != N:
            raise ValueError(
                "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N)
            )
        return index, columns
0 commit comments