Skip to content

Commit 3b654c8

Browse files
committed
ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame
1 parent b94186d commit 3b654c8

File tree

3 files changed

+163
-13
lines changed

3 files changed

+163
-13
lines changed

doc/source/whatsnew/v0.20.0.txt

+22
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,28 @@ New Behavior:
153153

154154
df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
155155

156+
.. _whatsnew_0200.enhancements.scipy_sparse:
157+
158+
SciPy sparse matrix from/to SparseDataFrame
159+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
160+
Pandas now supports creating a ``SparseDataFrame`` directly from a ``scipy.sparse.spmatrix`` instance. For example:
161+
162+
.. ipython:: python
163+
164+
from scipy.sparse import csr_matrix
165+
arr = np.random.random(size=(1000, 5))
166+
arr[arr < .9] = 0
167+
sp_arr = csr_matrix(arr)
168+
sp_arr
169+
sdf = pd.SparseDataFrame(sp_arr)
170+
sdf
171+
172+
To convert a ``SparseDataFrame`` back to a SciPy sparse matrix in COO format, you can use:
173+
174+
.. ipython:: python
175+
176+
sdf.to_coo()
177+
156178
.. _whatsnew_0200.enhancements.other:
157179

158180
Other enhancements

pandas/sparse/frame.py

+116-13
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import numpy as np
1212

1313
from pandas.types.missing import isnull, notnull
14-
from pandas.types.cast import _maybe_upcast
14+
from pandas.types.cast import _maybe_upcast, _find_common_type
1515
from pandas.types.common import _ensure_platform_int
1616

1717
from pandas.core.common import _try_sort
@@ -25,9 +25,14 @@
2525
create_block_manager_from_arrays)
2626
import pandas.core.generic as generic
2727
from pandas.sparse.series import SparseSeries, SparseArray
28+
from pandas._sparse import BlockIndex, get_blocks
2829
from pandas.util.decorators import Appender
2930
import pandas.core.ops as ops
3031

32+
try:
33+
from scipy.sparse import spmatrix # noqa
34+
except ImportError:
35+
spmatrix = type('mock spmatrix', (), {})
3136

3237
_shared_doc_kwargs = dict(klass='SparseDataFrame')
3338

@@ -39,7 +44,7 @@ class SparseDataFrame(DataFrame):
3944
4045
Parameters
4146
----------
42-
data : same types as can be passed to DataFrame
47+
data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
4348
index : array-like, optional
4449
column : array-like, optional
4550
default_kind : {'block', 'integer'}, default 'block'
@@ -85,24 +90,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
8590
self._default_fill_value = default_fill_value
8691

8792
if isinstance(data, dict):
88-
mgr = self._init_dict(data, index, columns)
89-
if dtype is not None:
90-
mgr = mgr.astype(dtype)
93+
mgr = self._init_dict(data, index, columns, dtype=dtype)
9194
elif isinstance(data, (np.ndarray, list)):
92-
mgr = self._init_matrix(data, index, columns)
93-
if dtype is not None:
94-
mgr = mgr.astype(dtype)
95+
mgr = self._init_matrix(data, index, columns, dtype=dtype)
9596
elif isinstance(data, SparseDataFrame):
9697
mgr = self._init_mgr(data._data,
9798
dict(index=index, columns=columns),
9899
dtype=dtype, copy=copy)
99100
elif isinstance(data, DataFrame):
100-
mgr = self._init_dict(data, data.index, data.columns)
101-
if dtype is not None:
102-
mgr = mgr.astype(dtype)
101+
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
103102
elif isinstance(data, BlockManager):
104103
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
105104
dtype=dtype, copy=copy)
105+
elif isinstance(data, spmatrix):
106+
mgr = self._init_spmatrix(data, index, columns, dtype=dtype)
106107
elif data is None:
107108
data = DataFrame()
108109

@@ -175,6 +176,33 @@ def _init_dict(self, data, index, columns, dtype=None):
175176

176177
def _init_matrix(self, data, index, columns, dtype=None):
    """
    Build the frame's internal data from a 2-D array-like input.

    The input is normalised with ``_prep_ndarray``, axis labels are
    resolved by ``self._prep_index``, and every column slice is handed to
    ``self._init_dict`` keyed by its column label.
    """
    arr = _prep_ndarray(data, copy=False)
    index, columns = self._prep_index(arr, index, columns)

    # One entry per column label; a repeated label keeps the last slice,
    # exactly as building a dict from (label, slice) pairs would.
    by_label = {}
    for pos, label in enumerate(columns):
        by_label[label] = arr[:, pos]

    return self._init_dict(by_label, index, columns, dtype)
182+
183+
def _init_spmatrix(self, data, index, columns, dtype=None):
    """
    Build the frame's internal data from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        Converted to COO format; the entries it stores become the sparse
        values of the resulting columns.
    index, columns : array-like or None
        Axis labels, resolved by ``self._prep_index``.
    dtype : optional
        Forwarded unchanged to ``self._init_dict``.

    Returns
    -------
    The result of ``self._init_dict`` applied to one SparseSeries per
    column label.
    """
    index, columns = self._prep_index(data, index, columns)
    data = data.tocoo(copy=False)
    N = len(index)

    # Construct a dict of SparseSeries, one per column that stores values
    sdict = {}
    values = Series(data.data, index=data.row)
    for col, rowvals in values.groupby(data.col):
        # A COO matrix may list its entries in any order, so the row
        # positions of one column are not necessarily ascending. Sort
        # them before deriving the block layout.
        # NOTE(review): assumes get_blocks needs ascending int32
        # positions -- confirm against pandas._sparse.
        rowvals = rowvals.sort_index()
        rows = rowvals.index.values.astype(np.int32)
        blocs, blens = get_blocks(rows)

        sdict[columns[col]] = SparseSeries(
            rowvals.values, index=index,
            sparse_index=BlockIndex(N, blocs, blens))

    # Add any columns that were empty
    sdict.update({column: SparseSeries(index=index,
                                       sparse_index=BlockIndex(N, [], []))
                  for column in columns
                  if column not in sdict})

    return self._init_dict(sdict, index, columns, dtype)
204+
205+
def _prep_index(self, data, index, columns):
178206
N, K = data.shape
179207
if index is None:
180208
index = _default_index(N)
@@ -187,9 +215,84 @@ def _init_matrix(self, data, index, columns, dtype=None):
187215
if len(index) != N:
188216
raise ValueError('Index length mismatch: %d vs. %d' %
189217
(len(index), N))
218+
return index, columns
190219

191-
data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
192-
return self._init_dict(data, index, columns, dtype)
220+
def as_matrix(self, columns=None, sparse=False):
    """
    Convert the frame to its Numpy-array or SciPy sparse COO matrix
    representation.

    Parameters
    ----------
    columns : list, optional, default=None
        If None, return all columns. Otherwise, returns specified columns.
    sparse : bool, optional, default=False
        If True, return an instance of scipy.sparse.coo_matrix instead
        of ndarray. If False, the result values array will be DENSE.

    Returns
    -------
    values : ndarray or scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.

    See Also
    --------
    pandas.SparseDataFrame.to_coo
    """
    if sparse:
        # Restrict to the requested columns first, then let to_coo
        # build the sparse matrix from that sub-frame.
        subdf = self if columns is None else self[columns]
        return subdf.to_coo()

    # Dense path: defer to the parent class implementation.
    return super(SparseDataFrame, self).as_matrix(columns=columns)
258+
259+
def to_coo(self):
    """
    Return the frame as a SciPy sparse matrix in COO format.

    Only the entries each column actually stores (its ``sp_values`` at
    the positions given by its ``sp_index``) are emitted.

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        Has the same shape as the frame. Its values are cast to the
        dtype that ``_find_common_type`` picks for the frame's column
        dtypes.

    Raises
    ------
    ImportError
        If SciPy cannot be imported.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    col_parts = []
    row_parts = []
    value_parts = []
    for position, label in enumerate(self):
        series = self[label]
        stored_rows = series.sp_index.to_int_index().indices
        # Every stored row of this column shares the same column position
        col_parts.append(np.repeat(position, len(stored_rows)))
        row_parts.append(stored_rows)
        value_parts.append(series.sp_values)

    col_idx = np.hstack(col_parts)
    row_idx = np.hstack(row_parts)
    stacked = np.hstack(value_parts)
    values = stacked.astype(_find_common_type(self.dtypes))
    return coo_matrix((values, (row_idx, col_idx)), shape=self.shape)
193296

194297
def __array_wrap__(self, result):
195298
return self._constructor(

pandas/tests/sparse/test_frame.py

+25
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,31 @@ def test_isnotnull(self):
11181118
'B': [True, False, True, True, False]})
11191119
tm.assert_frame_equal(res.to_dense(), exp)
11201120

1121+
def test_from_to_scipy(self):
    # GH 4343
    # Round-trip a small CSR matrix through SparseDataFrame, with and
    # without explicit axis labels.
    try:
        from scipy.sparse import csr_matrix
    except ImportError:
        return  # scipy not available

    dense = np.array([[0, 1, 0],
                      [0, 0, 1],
                      [1, 1, 1.]])
    spm = csr_matrix(dense)

    label_cases = ((list('abc'), list('def')),
                   (None, None))
    for index, columns in label_cases:
        sdf = pd.SparseDataFrame(spm, index=index, columns=columns)

        # Explicit labels must be carried over unchanged
        if index is not None:
            tm.assert_index_equal(sdf.index, pd.Index(index))
        if columns is not None:
            tm.assert_index_equal(sdf.columns, pd.Index(columns))

        # Dense view matches the source array once gaps are filled
        tm.assert_numpy_array_equal(sdf.fillna(0).values, dense)

        # Both sparse exports must equal the source matrix elementwise
        via_to_coo = sdf.to_coo()
        tm.assert_equal((via_to_coo != spm).data.size, 0)
        via_as_matrix = sdf.as_matrix(sparse=True)
        tm.assert_equal((via_as_matrix != spm).data.size, 0)
1145+
11211146

11221147
class TestSparseDataFrameArithmetic(tm.TestCase):
11231148

0 commit comments

Comments
 (0)