Skip to content

Commit 22d316c

Browse files
committed
API: DataFrame.sparse accessor
Closes pandas-dev#25681
1 parent 21769e9 commit 22d316c

File tree

6 files changed

+265
-28
lines changed

6 files changed

+265
-28
lines changed

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
312312
DataFrame.boxplot
313313
DataFrame.hist
314314

315+
316+
.. _api.frame.sparse:
317+
318+
Sparse Accessor
319+
~~~~~~~~~~~~~~~
320+
321+
Sparse-dtype specific methods and attributes are provided under the
322+
``DataFrame.sparse`` accessor.
323+
324+
.. autosummary::
325+
:toctree: api/
326+
:template: autosummary/accessor_attribute.rst
327+
328+
DataFrame.sparse.density
329+
330+
.. autosummary::
331+
:toctree: api/
332+
333+
DataFrame.sparse.from_spmatrix
334+
DataFrame.sparse.to_coo
335+
DataFrame.sparse.to_dense
336+
337+
315338
Serialization / IO / Conversion
316339
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317340
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Other Enhancements
2626
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
2727
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
2828
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
29+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
2930
3031
.. _whatsnew_0250.api_breaking:
3132

pandas/core/arrays/sparse.py

+159-5
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
678678
new._dtype = dtype
679679
return new
680680

681+
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a 2-D SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray
        Array whose dtype is ``SparseDtype(data.dtype, 0)``, i.e. the
        fill value is the zero of the matrix's dtype.

    Raises
    ------
    ValueError
        If `data` is not 2-dimensional or does not have exactly
        one column.
    """
    # Raise explicitly instead of using ``assert``: assertions are
    # stripped when Python runs with ``-O``.
    if data.ndim != 2:
        raise ValueError(
            "'data' must be 2-dimensional, not {ndim}-dimensional"
            .format(ndim=data.ndim))

    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{ncol}'"
            .format(ncol=ncol))

    # Take values and row positions from the same CSC buffers so they
    # always line up one-to-one (``nonzero()`` silently drops explicitly
    # stored zeros, which ``.data`` still contains), and make sure the
    # positions are increasing before handing them to IntIndex.
    csc = data.tocsc()
    if not csc.has_sorted_indices:
        # sorted_indices() returns a copy; the caller's matrix is untouched.
        csc = csc.sorted_indices()
    arr = csc.data
    idx = csc.indices

    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
710+
681711
def __array__(self, dtype=None, copy=True):
682712
fill_value = self.fill_value
683713

@@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind):
18911921
# ----------------------------------------------------------------------------
18921922
# Accessor
18931923

1924+
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
1925+
1926+
18941927
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
18951928
'sp_values'],
18961929
typ='property')
@@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate):
19001933
"""
19011934

19021935
def __init__(self, data=None):
1903-
self._validate(data)
19041936
# Store the Series since we need that for to_coo
19051937
self._parent = data
1938+
self._validate(data)
19061939

1907-
@staticmethod
1908-
def _validate(data):
1940+
def _validate(self, data):
19091941
if not isinstance(data.dtype, SparseDtype):
1910-
msg = "Can only use the '.sparse' accessor with Sparse data."
1911-
raise AttributeError(msg)
1942+
raise AttributeError(_validation_msg)
19121943

19131944
def _delegate_property_get(self, name, *args, **kwargs):
19141945
return getattr(self._parent.values, name)
@@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20252056
column_levels,
20262057
sort_labels=sort_labels)
20272058
return A, rows, columns
2059+
2060+
def to_dense(self):
    """
    Convert the sparse Series to a dense Series.

    Returns
    -------
    Series
        A new Series built from the densified values of the parent,
        carrying the parent's index and name.
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2065+
2066+
2067+
class SparseFrameAccessor(PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Provides sparse-specific methods and attributes for a DataFrame
    whose columns all have a ``SparseDtype``.
    """

    def __init__(self, data=None):
        # Keep a reference to the DataFrame; every method below reads
        # its columns, index or dtypes.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise AttributeError unless every column has a SparseDtype."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(_validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a SparseArray.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and assign the labels afterwards.
        # Keying the dict by label would collapse duplicate column labels
        # and would not preserve label order on Python < 3.6.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert to dense DataFrame

        Returns
        -------
        df : DataFrame
            Same index and columns as the parent, with every column
            converted to its dense representation.
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense()
                for k, v in self._parent.iteritems()}
        return DataFrame(data,
                         index=self._parent.index,
                         columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a scipy.sparse COO matrix.

        Returns
        -------
        coo_matrix : scipy.sparse.coo_matrix
            Matrix with the same shape as the DataFrame. The values are
            cast to the common dtype of all columns.

        Raises
        ------
        ImportError
            If scipy is not installed.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        dtype = find_common_type(self._parent.dtypes)
        if isinstance(dtype, SparseDtype):
            # coo_matrix needs the underlying dtype, not the sparse wrapper
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate positionally: looking a column up by label would return
        # a DataFrame (not a Series) when column labels are duplicated.
        for col, (_, ser) in enumerate(self._parent.iteritems()):
            row = ser.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(ser.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.iteritems()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Missing labels default to ``default_index`` of the matching
        dimension of ``data.shape``; a ValueError is raised when a
        provided label sequence has the wrong length.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
PY36, raise_with_traceback, Iterator,
3737
string_and_binary_types)
3838
from pandas.compat.numpy import function as nv
39+
from pandas.core.arrays.sparse import SparseFrameAccessor
3940
from pandas.core.dtypes.cast import (
4041
maybe_upcast,
4142
cast_scalar_to_array,
@@ -8009,6 +8010,7 @@ def isin(self, values):
80098010
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80108011
hist = gfx.hist_frame
80118012
boxplot = gfx.boxplot_frame
8013+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80128014

80138015

80148016
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

pandas/core/sparse/frame.py

+4-23
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
from pandas.compat.numpy import function as nv
1515
from pandas.util._decorators import Appender
1616

17-
from pandas.core.dtypes.cast import find_common_type, maybe_upcast
17+
from pandas.core.dtypes.cast import maybe_upcast
1818
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
1919
from pandas.core.dtypes.missing import isna, notna
2020

2121
import pandas.core.algorithms as algos
22-
from pandas.core.arrays.sparse import SparseArray, SparseDtype
22+
from pandas.core.arrays.sparse import SparseArray
2323
import pandas.core.common as com
2424
from pandas.core.frame import DataFrame
2525
import pandas.core.generic as generic
@@ -271,27 +271,8 @@ def to_coo(self):
271271
float32. By numpy.find_common_type convention, mixing int64 and
272272
and uint64 will result in a float64 dtype.
273273
"""
274-
try:
275-
from scipy.sparse import coo_matrix
276-
except ImportError:
277-
raise ImportError('Scipy is not installed')
278-
279-
dtype = find_common_type(self.dtypes)
280-
if isinstance(dtype, SparseDtype):
281-
dtype = dtype.subtype
282-
283-
cols, rows, datas = [], [], []
284-
for col, name in enumerate(self):
285-
s = self[name]
286-
row = s.sp_index.to_int_index().indices
287-
cols.append(np.repeat(col, len(row)))
288-
rows.append(row)
289-
datas.append(s.sp_values.astype(dtype, copy=False))
290-
291-
cols = np.concatenate(cols)
292-
rows = np.concatenate(rows)
293-
datas = np.concatenate(datas)
294-
return coo_matrix((datas, (rows, cols)), shape=self.shape)
274+
from pandas.core.arrays.sparse import SparseFrameAccessor
275+
return SparseFrameAccessor(self).to_coo()
295276

296277
def __array_wrap__(self, result):
297278
return self._constructor(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import string
2+
3+
import numpy as np
4+
import pytest
5+
6+
import pandas as pd
7+
import pandas.util.testing as tm
8+
9+
10+
class TestSeriesAccessor(object):
    """Tests for the ``Series.sparse`` accessor."""
    # TODO: collect other accessor tests

    def test_to_dense(self):
        # The dense result must equal a plain Series built from the
        # same values (assert_series_equal also compares the dtype).
        sparse_ser = pd.Series([0, 1, 0, 10], dtype='Sparse[int]')
        dense_ser = pd.Series([0, 1, 0, 10])
        tm.assert_series_equal(sparse_ser.sparse.to_dense(), dense_ser)
17+
18+
19+
class TestFrameAccessor(object):
    """Tests for the ``DataFrame.sparse`` accessor."""

    @pytest.mark.parametrize('format', ['csc', 'csr', 'coo'])
    @pytest.mark.parametrize("labels", [
        None,
        list(string.ascii_letters[:10]),
    ])
    @pytest.mark.parametrize('dtype', ['float64', 'int64'])
    def test_from_spmatrix(self, format, labels, dtype):
        # A sparse identity matrix must convert to the same frame as a
        # dense identity cast to the matching SparseDtype.
        pytest.importorskip("scipy")
        import scipy.sparse

        fill_value = np.array(0, dtype=dtype).item()
        sparse_dtype = pd.SparseDtype(dtype, fill_value)

        identity = scipy.sparse.eye(10, format=format, dtype=dtype)
        actual = pd.DataFrame.sparse.from_spmatrix(
            identity, index=labels, columns=labels
        )

        dense = pd.DataFrame(
            np.eye(10, dtype=dtype),
            index=labels,
            columns=labels,
        )
        tm.assert_frame_equal(actual, dense.astype(sparse_dtype))

    def test_to_coo(self):
        # The COO matrix must have no entries that differ from one built
        # from the frame's dense values.
        pytest.importorskip("scipy")
        import scipy.sparse

        frame = pd.DataFrame({
            "A": [0, 1, 0],
            "B": [1, 0, 0],
        }, dtype='Sparse[int64, 0]')
        actual = frame.sparse.to_coo()
        reference = scipy.sparse.coo_matrix(np.asarray(frame))
        mismatches = (actual != reference).nnz
        assert mismatches == 0

    def test_to_dense(self):
        # Columns mix dtypes and fill values; all must densify to the
        # plain values while keeping the index.
        frame = pd.DataFrame({
            "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)),
            "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)),
            "C": pd.SparseArray([1., 0.],
                                dtype=pd.SparseDtype('float64', 0.0)),
        }, index=['b', 'a'])
        dense = pd.DataFrame({
            'A': [1, 0],
            'B': [1, 0],
            'C': [1.0, 0.0],
        }, index=['b', 'a'])
        tm.assert_frame_equal(frame.sparse.to_dense(), dense)

    def test_density(self):
        # Each column holds 3 non-fill values out of 4, so the mean
        # density over both columns is 0.75.
        frame = pd.DataFrame({
            'A': pd.SparseArray([1, 0, 2, 1], fill_value=0),
            'B': pd.SparseArray([0, 1, 1, 1], fill_value=0),
        })
        observed = frame.sparse.density
        assert observed == 0.75

0 commit comments

Comments
 (0)