11
11
import numpy as np
12
12
13
13
from pandas .types .missing import isnull , notnull
14
- from pandas .types .cast import _maybe_upcast
14
+ from pandas .types .cast import _maybe_upcast , _find_common_type
15
15
from pandas .types .common import _ensure_platform_int
16
16
17
17
from pandas .core .common import _try_sort
25
25
create_block_manager_from_arrays )
26
26
import pandas .core .generic as generic
27
27
from pandas .sparse .series import SparseSeries , SparseArray
28
+ from pandas ._sparse import BlockIndex , get_blocks
28
29
from pandas .util .decorators import Appender
29
30
import pandas .core .ops as ops
30
31
32
+ try :
33
+ from scipy .sparse import spmatrix # noqa
34
+ except ImportError :
35
+ spmatrix = type ('mock spmatrix' , (), {})
31
36
32
37
_shared_doc_kwargs = dict (klass = 'SparseDataFrame' )
33
38
@@ -39,7 +44,7 @@ class SparseDataFrame(DataFrame):
39
44
40
45
Parameters
41
46
----------
42
- data : same types as can be passed to DataFrame
47
+ data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
43
48
index : array-like, optional
44
49
column : array-like, optional
45
50
default_kind : {'block', 'integer'}, default 'block'
@@ -85,24 +90,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
85
90
self ._default_fill_value = default_fill_value
86
91
87
92
if isinstance (data , dict ):
88
- mgr = self ._init_dict (data , index , columns )
89
- if dtype is not None :
90
- mgr = mgr .astype (dtype )
93
+ mgr = self ._init_dict (data , index , columns , dtype = dtype )
91
94
elif isinstance (data , (np .ndarray , list )):
92
- mgr = self ._init_matrix (data , index , columns )
93
- if dtype is not None :
94
- mgr = mgr .astype (dtype )
95
+ mgr = self ._init_matrix (data , index , columns , dtype = dtype )
95
96
elif isinstance (data , SparseDataFrame ):
96
97
mgr = self ._init_mgr (data ._data ,
97
98
dict (index = index , columns = columns ),
98
99
dtype = dtype , copy = copy )
99
100
elif isinstance (data , DataFrame ):
100
- mgr = self ._init_dict (data , data .index , data .columns )
101
- if dtype is not None :
102
- mgr = mgr .astype (dtype )
101
+ mgr = self ._init_dict (data , data .index , data .columns , dtype = dtype )
103
102
elif isinstance (data , BlockManager ):
104
103
mgr = self ._init_mgr (data , axes = dict (index = index , columns = columns ),
105
104
dtype = dtype , copy = copy )
105
+ elif isinstance (data , spmatrix ):
106
+ mgr = self ._init_spmatrix (data , index , columns , dtype = dtype )
106
107
elif data is None :
107
108
data = DataFrame ()
108
109
@@ -175,6 +176,33 @@ def _init_dict(self, data, index, columns, dtype=None):
175
176
176
177
def _init_matrix(self, data, index, columns, dtype=None):
    """
    Initialise from a 2-D array-like by splitting it into columns.

    ``data`` is first passed through ``_prep_ndarray``; the axis labels
    are then resolved with ``_prep_index`` and every column slice is
    forwarded to ``_init_dict`` keyed by its column label.

    Parameters
    ----------
    data : np.ndarray or list
    index : array-like or None
    columns : array-like or None
    dtype : optional, forwarded to ``_init_dict``

    Returns
    -------
    The result of ``_init_dict`` for the per-column mapping.
    """
    matrix = _prep_ndarray(data, copy=False)
    index, columns = self._prep_index(matrix, index, columns)

    # One entry per column label; a repeated label keeps its last slice.
    by_label = {}
    for position, label in enumerate(columns):
        by_label[label] = matrix[:, position]

    return self._init_dict(by_label, index, columns, dtype)
182
+
183
def _init_spmatrix(self, data, index, columns, dtype=None):
    """
    Initialise from a scipy.sparse matrix, one SparseSeries per column.

    The matrix is converted to COO form and its stored entries are
    grouped by column. Each group becomes a SparseSeries whose sparse
    index is a BlockIndex built from the rows holding stored entries.
    Columns without any stored entry get an empty SparseSeries.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
    index : array-like or None
    columns : array-like or None
    dtype : optional, forwarded to ``_init_dict``

    Returns
    -------
    The result of ``_init_dict`` for the per-column SparseSeries mapping.
    """
    index, columns = self._prep_index(data, index, columns)
    data = data.tocoo(copy=False)
    N = len(index)

    sdict = {}
    values = Series(data.data, index=data.row)
    for col, rowvals in values.groupby(data.col):
        # COO storage carries no ordering guarantee (e.g. hand-built or
        # DOK-derived matrices), while the block finder is expected to
        # receive ascending int32 row positions. Sort each column's
        # entries by row so values and block layout stay aligned.
        # NOTE(review): duplicate (row, col) entries of a non-canonical
        # COO matrix are not summed here -- confirm whether callers need it.
        rowvals = rowvals.sort_index()
        rows = rowvals.index.values.astype(np.int32)
        blocs, blens = get_blocks(rows)
        sdict[columns[col]] = SparseSeries(
            rowvals.values, index=index,
            sparse_index=BlockIndex(N, blocs, blens))

    # Add any columns that were empty
    sdict.update({column: SparseSeries(index=index,
                                       sparse_index=BlockIndex(N, [], []))
                  for column in columns
                  if column not in sdict})

    return self._init_dict(sdict, index, columns, dtype)
204
+
205
+ def _prep_index (self , data , index , columns ):
178
206
N , K = data .shape
179
207
if index is None :
180
208
index = _default_index (N )
@@ -187,9 +215,84 @@ def _init_matrix(self, data, index, columns, dtype=None):
187
215
if len (index ) != N :
188
216
raise ValueError ('Index length mismatch: %d vs. %d' %
189
217
(len (index ), N ))
218
+ return index , columns
190
219
191
- data = dict ([(idx , data [:, i ]) for i , idx in enumerate (columns )])
192
- return self ._init_dict (data , index , columns , dtype )
220
def as_matrix(self, columns=None, sparse=False):
    """
    Convert the frame to its Numpy-array or SciPy sparse COO matrix
    representation.

    Parameters
    ----------
    columns : list, optional, default=None
        If None, return all columns. Otherwise, returns specified columns.
    sparse : bool, optional, default=False
        If True, return an instance of scipy.sparse.coo_matrix instead
        of ndarray. If False, the result values array will be DENSE.

    Returns
    -------
    values : ndarray or scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.

    See Also
    --------
    pandas.SparseDataFrame.to_coo
    """
    if sparse:
        # Restrict to the requested columns first, then defer to to_coo.
        subdf = self if columns is None else self[columns]
        return subdf.to_coo()

    return super(SparseDataFrame, self).as_matrix(columns=columns)
258
+
259
def to_coo(self):
    """
    Return the frame as a SciPy sparse matrix in COO format.

    Only the entries stored in each column's sparse values are emitted;
    their row positions come from the column's sparse index.

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Raises
    ------
    ImportError
        If scipy cannot be imported.

    Notes
    -----
    The stacked values are cast to the common type of the frame's column
    dtypes (implicit upcasting): when dtypes are mixed, even among
    numeric types, the one that accommodates all of them is chosen.

    e.g. float16 together with float32 is upcast to float32. By
    numpy.find_common_type convention, mixing int64 and uint64 results
    in a float64 dtype.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    col_parts = []
    row_parts = []
    value_parts = []
    for position, label in enumerate(self):
        series = self[label]
        # Integer row positions of the entries this column stores.
        stored_rows = series.sp_index.to_int_index().indices
        col_parts.append(np.repeat(position, len(stored_rows)))
        row_parts.append(stored_rows)
        value_parts.append(series.sp_values)

    cols = np.hstack(col_parts)
    rows = np.hstack(row_parts)
    stacked = np.hstack(value_parts)
    datas = stacked.astype(_find_common_type(self.dtypes))
    return coo_matrix((datas, (rows, cols)), shape=self.shape)
193
296
194
297
def __array_wrap__ (self , result ):
195
298
return self ._constructor (
0 commit comments