Skip to content

Commit d518404

Browse files
committed
Squashed commit of the following:
commit 8b136bf Merge: 3005aed 01d3dc2 Author: Tom Augspurger <[email protected]> Date: Fri Mar 15 16:03:23 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 3005aed Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:26:32 2019 -0500 isort? commit 318c06f Merge: 0922296 79205ea Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:25:45 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 0922296 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 21:35:51 2019 -0500 updates commit f433be8 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:54:07 2019 -0500 lint commit 6696f28 Merge: 534a379 1017382 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:53:13 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 534a379 Merge: 94a7baf 5c341dc Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:37:27 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 94a7baf Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:22:48 2019 -0500 fixups commit 6f619b5 Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 13:38:48 2019 -0500 32-bit compat commit 24f48c3 Author: Tom Augspurger <[email protected]> Date: Mon Mar 11 22:05:46 2019 -0500 API: DataFrame.sparse accessor Closes pandas-dev#25681
1 parent dd9c585 commit d518404

File tree

7 files changed

+391
-82
lines changed

7 files changed

+391
-82
lines changed

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
312312
DataFrame.boxplot
313313
DataFrame.hist
314314

315+
316+
.. _api.frame.sparse:
317+
318+
Sparse Accessor
319+
~~~~~~~~~~~~~~~
320+
321+
Sparse-dtype specific methods and attributes are provided under the
322+
``DataFrame.sparse`` accessor.
323+
324+
.. autosummary::
325+
:toctree: api/
326+
:template: autosummary/accessor_attribute.rst
327+
328+
DataFrame.sparse.density
329+
330+
.. autosummary::
331+
:toctree: api/
332+
333+
DataFrame.sparse.from_spmatrix
334+
DataFrame.sparse.to_coo
335+
DataFrame.sparse.to_dense
336+
337+
315338
Serialization / IO / Conversion
316339
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317340
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Other Enhancements
3333
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
3434
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
3535
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
36+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
3637
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3738
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
3839
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)

pandas/core/arrays/sparse.py

+250-11
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,55 @@ def _simple_new(
689689
new._dtype = dtype
690690
return new
691691

692+
@classmethod
693+
def from_spmatrix(cls, data):
694+
"""
695+
Create a SparseArray from a scipy.sparse matrix.
696+
697+
.. versionadded:: 0.25.0
698+
699+
Parameters
700+
----------
701+
data : scipy.sparse.spmatrix
702+
This should be a SciPy sparse matrix where the size
703+
of the second dimension is 1. In other words, a
704+
sparse matrix with a single column.
705+
706+
Returns
707+
-------
708+
SparseArray
709+
710+
Examples
711+
--------
712+
>>> import scipy.sparse
713+
>>> mat = scipy.sparse.coo_matrix((4, 1))
714+
>>> pd.SparseArray.from_spmatrix(mat)
715+
[0.0, 0.0, 0.0, 0.0]
716+
Fill: 0.0
717+
IntIndex
718+
Indices: array([], dtype=int32)
719+
"""
720+
length, ncol = data.shape
721+
722+
if ncol != 1:
723+
raise ValueError(
724+
"'data' must have a single column, not '{}'".format(ncol)
725+
)
726+
727+
# our sparse index classes require that the positions be strictly
728+
# increasing. So we need to sort idx, and reorder arr accordingly.
729+
arr = data.data
730+
idx, _ = data.nonzero()
731+
loc = np.argsort(idx)
732+
arr = arr.take(loc)
733+
idx.sort()
734+
735+
zero = np.array(0, dtype=arr.dtype).item()
736+
dtype = SparseDtype(arr.dtype, zero)
737+
index = IntIndex(length, idx)
738+
739+
return cls._simple_new(arr, index, dtype)
740+
692741
def __array__(self, dtype=None, copy=True):
693742
fill_value = self.fill_value
694743

@@ -1898,27 +1947,32 @@ def _make_index(length, indices, kind):
18981947
# ----------------------------------------------------------------------------
18991948
# Accessor
19001949

1950+
1951+
class BaseAccessor(object):
1952+
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
1953+
1954+
def __init__(self, data=None):
1955+
self._parent = data
1956+
self._validate(data)
1957+
1958+
def _validate(self, data):
1959+
raise NotImplementedError
1960+
1961+
19011962
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
19021963
'sp_values'],
19031964
typ='property')
1904-
class SparseAccessor(PandasDelegate):
1965+
class SparseAccessor(BaseAccessor, PandasDelegate):
19051966
"""
19061967
Accessor for Series with sparse values, including conversion to and from other sparse matrix data types.
19071968
"""
19081969

1909-
def __init__(self, data=None):
1910-
self._validate(data)
1911-
# Store the Series since we need that for to_coo
1912-
self._parent = data
1913-
1914-
@staticmethod
1915-
def _validate(data):
1970+
def _validate(self, data):
19161971
if not isinstance(data.dtype, SparseDtype):
1917-
msg = "Can only use the '.sparse' accessor with Sparse data."
1918-
raise AttributeError(msg)
1972+
raise AttributeError(self._validation_msg)
19191973

19201974
def _delegate_property_get(self, name, *args, **kwargs):
1921-
return getattr(self._parent.values, name)
1975+
return getattr(self._parent.array, name)
19221976

19231977
def _delegate_method(self, name, *args, **kwargs):
19241978
if name == 'from_coo':
@@ -2032,3 +2086,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20322086
column_levels,
20332087
sort_labels=sort_labels)
20342088
return A, rows, columns
2089+
2090+
def to_dense(self):
2091+
"""
2092+
Convert a Series from sparse values to dense.
2093+
2094+
.. versionadded:: 0.25.0
2095+
2096+
Returns
2097+
-------
2098+
Series
2099+
A Series with the same values, stored as a dense array.
2100+
2101+
Examples
2102+
--------
2103+
>>> series = pd.Series(pd.SparseArray([0, 1, 0]))
2104+
>>> series
2105+
0 0
2106+
1 1
2107+
2 0
2108+
dtype: Sparse[int64, 0]
2109+
2110+
>>> series.sparse.to_dense()
2111+
0 0
2112+
1 1
2113+
2 0
2114+
dtype: int64
2115+
"""
2116+
from pandas import Series
2117+
return Series(self._parent.array.to_dense(),
2118+
index=self._parent.index,
2119+
name=self._parent.name)
2120+
2121+
2122+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
2123+
"""
2124+
DataFrame accessor for sparse data.
2125+
2126+
.. versionadded:: 0.25.0
2127+
"""
2128+
2129+
def _validate(self, data):
2130+
dtypes = data.dtypes
2131+
if not all(isinstance(t, SparseDtype) for t in dtypes):
2132+
raise AttributeError(self._validation_msg)
2133+
2134+
@classmethod
2135+
def from_spmatrix(cls, data, index=None, columns=None):
2136+
"""
2137+
Create a new DataFrame from a scipy sparse matrix.
2138+
2139+
.. versionadded:: 0.25.0
2140+
2141+
Parameters
2142+
----------
2143+
data : scipy.sparse.spmatrix
2144+
Must be convertible to csc format.
2145+
index, columns : Index, optional
2146+
Row and column labels to use for the resulting DataFrame.
2147+
Defaults to a RangeIndex.
2148+
2149+
Returns
2150+
-------
2151+
DataFrame
2152+
Each column of the DataFrame is stored as a
2153+
:class:`SparseArray`.
2154+
2155+
Examples
2156+
--------
2157+
>>> import scipy.sparse
2158+
>>> mat = scipy.sparse.eye(3)
2159+
>>> pd.DataFrame.sparse.from_spmatrix(mat)
2160+
0 1 2
2161+
0 1.0 0.0 0.0
2162+
1 0.0 1.0 0.0
2163+
2 0.0 0.0 1.0
2164+
"""
2165+
from pandas import DataFrame
2166+
2167+
data = data.tocsc()
2168+
index, columns = cls._prep_index(data, index, columns)
2169+
sparrays = [
2170+
SparseArray.from_spmatrix(data[:, i])
2171+
for i in range(data.shape[1])
2172+
]
2173+
data = dict(zip(columns, sparrays))
2174+
return DataFrame(data, index=index)
2175+
2176+
def to_dense(self):
2177+
"""
2178+
Convert a DataFrame with sparse values to dense.
2179+
2180+
.. versionadded:: 0.25.0
2181+
2182+
Returns
2183+
-------
2184+
DataFrame
2185+
A DataFrame with the same values stored as dense arrays.
2186+
2187+
Examples
2188+
--------
2189+
>>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
2190+
>>> df.sparse.to_dense()
2191+
A
2192+
0 0
2193+
1 1
2194+
2 0
2195+
"""
2196+
from pandas import DataFrame
2197+
2198+
data = {k: v.array.to_dense()
2199+
for k, v in compat.iteritems(self._parent)}
2200+
return DataFrame(data,
2201+
index=self._parent.index,
2202+
columns=self._parent.columns)
2203+
2204+
def to_coo(self):
2205+
"""
2206+
Return the contents of the frame as a sparse SciPy COO matrix.
2207+
2208+
.. versionadded:: 0.25.0
2209+
2210+
Returns
2211+
-------
2212+
coo_matrix : scipy.sparse.spmatrix
2213+
If the caller is heterogeneous and contains booleans or objects,
2214+
the result will be of dtype=object. See Notes.
2215+
2216+
Notes
2217+
-----
2218+
The dtype will be the lowest-common-denominator type (implicit
2219+
upcasting); that is to say if the dtypes (even of numeric types)
2220+
are mixed, the one that accommodates all will be chosen.
2221+
2222+
e.g. If the dtypes are float16 and float32, dtype will be upcast to
2223+
float32. By numpy.find_common_type convention, mixing int64 and
2224+
uint64 will result in a float64 dtype.
2225+
"""
2226+
try:
2227+
from scipy.sparse import coo_matrix
2228+
except ImportError:
2229+
raise ImportError('Scipy is not installed')
2230+
2231+
dtype = find_common_type(self._parent.dtypes)
2232+
if isinstance(dtype, SparseDtype):
2233+
dtype = dtype.subtype
2234+
2235+
cols, rows, datas = [], [], []
2236+
for col, name in enumerate(self._parent):
2237+
s = self._parent[name]
2238+
row = s.array.sp_index.to_int_index().indices
2239+
cols.append(np.repeat(col, len(row)))
2240+
rows.append(row)
2241+
datas.append(s.array.sp_values.astype(dtype, copy=False))
2242+
2243+
cols = np.concatenate(cols)
2244+
rows = np.concatenate(rows)
2245+
datas = np.concatenate(datas)
2246+
return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)
2247+
2248+
@property
2249+
def density(self):
2250+
"""
2251+
Ratio of non-sparse points to total (dense) data points
2252+
represented in the DataFrame.
2253+
"""
2254+
return np.mean([column.array.density
2255+
for _, column in self._parent.iteritems()])
2256+
2257+
@staticmethod
2258+
def _prep_index(data, index, columns):
2259+
import pandas.core.indexes.base as ibase
2260+
2261+
N, K = data.shape
2262+
if index is None:
2263+
index = ibase.default_index(N)
2264+
if columns is None:
2265+
columns = ibase.default_index(K)
2266+
2267+
if len(columns) != K:
2268+
raise ValueError('Column length mismatch: {columns} vs. {K}'
2269+
.format(columns=len(columns), K=K))
2270+
if len(index) != N:
2271+
raise ValueError('Index length mismatch: {index} vs. {N}'
2272+
.format(index=len(index), N=N))
2273+
return index, columns

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
from pandas.compat import PY36, lmap, lzip, raise_with_traceback
3535
from pandas.compat.numpy import function as nv
36+
from pandas.core.arrays.sparse import SparseFrameAccessor
3637
from pandas.core.dtypes.cast import (
3738
maybe_upcast,
3839
cast_scalar_to_array,
@@ -8023,6 +8024,7 @@ def isin(self, values):
80238024
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80248025
hist = gfx.hist_frame
80258026
boxplot = gfx.boxplot_frame
8027+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80268028

80278029

80288030
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

0 commit comments

Comments
 (0)