Skip to content

Commit cc7367c

Browse files
CLN: clean-up internal sparse imports + restructure sparse submodule (#28516)
1 parent a6fe803 commit cc7367c

File tree

14 files changed

+695
-666
lines changed

14 files changed

+695
-666
lines changed

doc/source/whatsnew/v1.0.0.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ Deprecations
125125
Removed SparseSeries and SparseDataFrame
126126
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
127127

128-
``SparseSeries`` and ``SparseDataFrame`` have been removed (:issue:`28425`).
129-
We recommend using a ``Series`` or ``DataFrame`` with sparse values instead.
130-
See :ref:`sparse.migration` for help with migrating existing code.
128+
``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method
129+
have been removed (:issue:`28425`). We recommend using a ``Series`` or
130+
``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help
131+
with migrating existing code.
131132

132133
Removal of prior version deprecations/changes
133134
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@
114114
DataFrame,
115115
)
116116

117-
from pandas.core.sparse.api import SparseArray, SparseDtype
117+
from pandas.core.arrays.sparse import SparseArray, SparseDtype
118118

119119
from pandas.tseries.api import infer_freq
120120
from pandas.tseries import offsets

pandas/core/arrays/sparse/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# flake8: noqa: F401
2+
3+
from .accessor import SparseAccessor, SparseFrameAccessor
4+
from .array import BlockIndex, IntIndex, SparseArray, _make_index
5+
from .dtype import SparseDtype

pandas/core/arrays/sparse/accessor.py

+336
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
"""Sparse accessor"""
2+
3+
import numpy as np
4+
5+
from pandas.compat._optional import import_optional_dependency
6+
7+
from pandas.core.dtypes.cast import find_common_type
8+
9+
from pandas.core.accessor import PandasDelegate, delegate_names
10+
11+
from .array import SparseArray
12+
from .dtype import SparseDtype
13+
14+
15+
class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    On construction the wrapped object is stored on ``_parent`` and handed to
    ``_validate``; subclasses override ``_validate`` to reject unsupported data.
    """

    # Message used by subclasses when the wrapped data is rejected.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def _validate(self, data):
        """
        Check that ``data`` may be used with this accessor.

        The base implementation always raises ``NotImplementedError``;
        concrete accessors must provide their own check.
        """
        raise NotImplementedError

    def __init__(self, data=None):
        # Keep a reference first so that it is set even when validation is
        # what fails, then let the subclass inspect the object.
        self._parent = data
        self._validate(data)
24+
25+
26+
@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for data with a sparse dtype, including conversion to and from
    scipy.sparse COO matrices.

    The ``npoints``, ``density``, ``fill_value`` and ``sp_values`` properties
    are delegated to the array of the parent object.
    """

    def _validate(self, data):
        # Only objects whose dtype is a SparseDtype may use the accessor.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        # Delegated properties are read straight off the parent's array.
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Name the offending method so the failure is diagnosable.
            raise ValueError(
                "Cannot delegate method {name!r}; only 'from_coo' and 'to_coo' "
                "are supported.".format(name=name)
            )

    @classmethod
    def from_coo(cls, A, dense_index=False):
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index of the resulting Series consists of
            only the coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted (row, col)
            coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse
        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
        ...                       shape=(3, 4))
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
            with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0., 0., 1., 2.],
                [ 3., 0., 0., 0.],
                [ 0., 0., 0., 0.]])
        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series
        from pandas import Series

        result = _coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the helper's array and index in a plain Series without
        # copying the data.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
        ...                                      (1, 2, 'a', 1),
        ...                                      (1, 1, 'b', 0),
        ...                                      (1, 1, 'b', 1),
        ...                                      (2, 1, 'b', 0),
        ...                                      (2, 1, 'b', 1)],
        ...                                     names=['A', 'B', 'C', 'D'])
        >>> ss = s.astype("Sparse")
        >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
        ...                                     column_levels=['C', 'D'],
        ...                                     sort_labels=True)
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
            with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0., 0., 1., 3.],
                [ 3., 0., 0., 0.],
                [ 0., 0., 0., 0.]])
        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo

        A, rows, columns = _sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self):
        """
        Convert a Series from sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        Series
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # The parent's index and name are carried over to the dense result.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
        )
186+
187+
188+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must have a SparseDtype for the accessor to apply.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
        # Build the frame keyed by position and attach the labels afterwards.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key the dense arrays by position rather than by label so that
        # duplicate column labels cannot overwrite each other in the dict;
        # the real labels are attached afterwards (same approach as
        # from_spmatrix).
        data = {
            pos: column.array.to_dense()
            for pos, (_, column) in enumerate(self._parent.items())
        }
        result = DataFrame(data, index=self._parent.index)
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Only the stored (``sp_values``) entries of each column are written to
        the matrix; the columns' ``fill_value`` is not consulted.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        if self._parent.shape[1] == 0:
            # With no columns there are no dtypes to unify and nothing to
            # concatenate; return an empty matrix of the right shape.
            return coo_matrix(self._parent.shape)

        dtype = find_common_type(self._parent.dtypes)
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate positionally via items() so that duplicate column labels
        # still yield one Series per column.
        for col, (_, s) in enumerate(self._parent.items()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        # Computed as the mean of the per-column array densities.
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels for ``data`` and check their lengths.

        Parameters
        ----------
        data : object with a two-element ``shape``
        index, columns : sequence or None
            Labels to use; ``None`` is replaced by a default index of the
            matching length.

        Returns
        -------
        index, columns

        Raises
        ------
        ValueError
            If ``len(columns)`` or ``len(index)`` does not match ``data.shape``.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError(
                "Column length mismatch: {columns} vs. {K}".format(
                    columns=len(columns), K=K
                )
            )
        if len(index) != N:
            raise ValueError(
                "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N)
            )
        return index, columns

0 commit comments

Comments
 (0)