Skip to content

Commit beb58d2

Browse files
committed
Squashed commit of the following:
commit 8b136bf Merge: 3005aed 01d3dc2 Author: Tom Augspurger <[email protected]> Date: Fri Mar 15 16:03:23 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 3005aed Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:26:32 2019 -0500 isort? commit 318c06f Merge: 0922296 79205ea Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:25:45 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 0922296 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 21:35:51 2019 -0500 updates commit f433be8 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:54:07 2019 -0500 lint commit 6696f28 Merge: 534a379 1017382 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:53:13 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 534a379 Merge: 94a7baf 5c341dc Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:37:27 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 94a7baf Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:22:48 2019 -0500 fixups commit 6f619b5 Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 13:38:48 2019 -0500 32-bit compat commit 24f48c3 Author: Tom Augspurger <[email protected]> Date: Mon Mar 11 22:05:46 2019 -0500 API: DataFrame.sparse accessor Closes pandas-dev#25681
1 parent 6c613c8 commit beb58d2

File tree

7 files changed

+391
-82
lines changed

7 files changed

+391
-82
lines changed

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
312312
DataFrame.boxplot
313313
DataFrame.hist
314314

315+
316+
.. _api.frame.sparse:
317+
318+
Sparse Accessor
319+
~~~~~~~~~~~~~~~
320+
321+
Sparse-dtype specific methods and attributes are provided under the
322+
``DataFrame.sparse`` accessor.
323+
324+
.. autosummary::
325+
:toctree: api/
326+
:template: autosummary/accessor_attribute.rst
327+
328+
DataFrame.sparse.density
329+
330+
.. autosummary::
331+
:toctree: api/
332+
333+
DataFrame.sparse.from_spmatrix
334+
DataFrame.sparse.to_coo
335+
DataFrame.sparse.to_dense
336+
337+
315338
Serialization / IO / Conversion
316339
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317340
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Other Enhancements
3333
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
3434
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
3535
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
36+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
3637
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3738
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
3839
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)

pandas/core/arrays/sparse.py

+250-11
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,55 @@ def _simple_new(
688688
new._dtype = dtype
689689
return new
690690

691+
@classmethod
692+
def from_spmatrix(cls, data):
693+
"""
694+
Create a SparseArray from a scipy.sparse matrix.
695+
696+
.. versionadded:: 0.25.0
697+
698+
Parameters
699+
----------
700+
data : scipy.sparse.spmatrix
701+
This should be a SciPy sparse matrix where the size
702+
of the second dimension is 1. In other words, a
703+
sparse matrix with a single column.
704+
705+
Returns
706+
-------
707+
SparseArray
708+
709+
Examples
710+
--------
711+
>>> import scipy.sparse
712+
>>> mat = scipy.sparse.coo_matrix((4, 1))
713+
>>> pd.SparseArray.from_spmatrix(mat)
714+
[0.0, 0.0, 0.0, 0.0]
715+
Fill: 0.0
716+
IntIndex
717+
Indices: array([], dtype=int32)
718+
"""
719+
length, ncol = data.shape
720+
721+
if ncol != 1:
722+
raise ValueError(
723+
"'data' must have a single column, not '{}'".format(ncol)
724+
)
725+
726+
# our sparse index classes require that the positions be strictly
727+
# increasing. So we need to sort idx, and reorder arr accordingly.
728+
arr = data.data
729+
idx, _ = data.nonzero()
730+
loc = np.argsort(idx)
731+
arr = arr.take(loc)
732+
idx.sort()
733+
734+
zero = np.array(0, dtype=arr.dtype).item()
735+
dtype = SparseDtype(arr.dtype, zero)
736+
index = IntIndex(length, idx)
737+
738+
return cls._simple_new(arr, index, dtype)
739+
691740
def __array__(self, dtype=None, copy=True):
692741
fill_value = self.fill_value
693742

@@ -1899,27 +1948,32 @@ def _make_index(length, indices, kind):
18991948
# ----------------------------------------------------------------------------
19001949
# Accessor
19011950

1951+
1952+
class BaseAccessor(object):
1953+
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
1954+
1955+
def __init__(self, data=None):
1956+
self._parent = data
1957+
self._validate(data)
1958+
1959+
def _validate(self, data):
1960+
raise NotImplementedError
1961+
1962+
19021963
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
19031964
'sp_values'],
19041965
typ='property')
1905-
class SparseAccessor(PandasDelegate):
1966+
class SparseAccessor(BaseAccessor, PandasDelegate):
19061967
"""
19071968
Accessor for sparse data, including conversion from other sparse matrix data types.
19081969
"""
19091970

1910-
def __init__(self, data=None):
1911-
self._validate(data)
1912-
# Store the Series since we need that for to_coo
1913-
self._parent = data
1914-
1915-
@staticmethod
1916-
def _validate(data):
1971+
def _validate(self, data):
19171972
if not isinstance(data.dtype, SparseDtype):
1918-
msg = "Can only use the '.sparse' accessor with Sparse data."
1919-
raise AttributeError(msg)
1973+
raise AttributeError(self._validation_msg)
19201974

19211975
def _delegate_property_get(self, name, *args, **kwargs):
1922-
return getattr(self._parent.values, name)
1976+
return getattr(self._parent.array, name)
19231977

19241978
def _delegate_method(self, name, *args, **kwargs):
19251979
if name == 'from_coo':
@@ -2033,3 +2087,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20332087
column_levels,
20342088
sort_labels=sort_labels)
20352089
return A, rows, columns
2090+
2091+
def to_dense(self):
2092+
"""
2093+
Convert a Series from sparse values to dense.
2094+
2095+
.. versionadded:: 0.25.0
2096+
2097+
Returns
2098+
-------
2099+
Series
2100+
A Series with the same values, stored as a dense array.
2101+
2102+
Examples
2103+
--------
2104+
>>> series = pd.Series(pd.SparseArray([0, 1, 0]))
2105+
>>> series
2106+
0 0
2107+
1 1
2108+
2 0
2109+
dtype: Sparse[int64, 0]
2110+
2111+
>>> series.sparse.to_dense()
2112+
0 0
2113+
1 1
2114+
2 0
2115+
dtype: int64
2116+
"""
2117+
from pandas import Series
2118+
return Series(self._parent.array.to_dense(),
2119+
index=self._parent.index,
2120+
name=self._parent.name)
2121+
2122+
2123+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
2124+
"""
2125+
DataFrame accessor for sparse data.
2126+
2127+
.. versionadded:: 0.25.0
2128+
"""
2129+
2130+
def _validate(self, data):
2131+
dtypes = data.dtypes
2132+
if not all(isinstance(t, SparseDtype) for t in dtypes):
2133+
raise AttributeError(self._validation_msg)
2134+
2135+
@classmethod
2136+
def from_spmatrix(cls, data, index=None, columns=None):
2137+
"""
2138+
Create a new DataFrame from a scipy sparse matrix.
2139+
2140+
.. versionadded:: 0.25.0
2141+
2142+
Parameters
2143+
----------
2144+
data : scipy.sparse.spmatrix
2145+
Must be convertible to csc format.
2146+
index, columns : Index, optional
2147+
Row and column labels to use for the resulting DataFrame.
2148+
Defaults to a RangeIndex.
2149+
2150+
Returns
2151+
-------
2152+
DataFrame
2153+
Each column of the DataFrame is stored as a
2154+
:class:`SparseArray`.
2155+
2156+
Examples
2157+
--------
2158+
>>> import scipy.sparse
2159+
>>> mat = scipy.sparse.eye(3)
2160+
>>> pd.DataFrame.sparse.from_spmatrix(mat)
2161+
0 1 2
2162+
0 1.0 0.0 0.0
2163+
1 0.0 1.0 0.0
2164+
2 0.0 0.0 1.0
2165+
"""
2166+
from pandas import DataFrame
2167+
2168+
data = data.tocsc()
2169+
index, columns = cls._prep_index(data, index, columns)
2170+
sparrays = [
2171+
SparseArray.from_spmatrix(data[:, i])
2172+
for i in range(data.shape[1])
2173+
]
2174+
data = dict(zip(columns, sparrays))
2175+
return DataFrame(data, index=index)
2176+
2177+
def to_dense(self):
2178+
"""
2179+
Convert a DataFrame with sparse values to dense.
2180+
2181+
.. versionadded:: 0.25.0
2182+
2183+
Returns
2184+
-------
2185+
DataFrame
2186+
A DataFrame with the same values stored as dense arrays.
2187+
2188+
Examples
2189+
--------
2190+
>>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
2191+
>>> df.sparse.to_dense()
2192+
A
2193+
0 0
2194+
1 1
2195+
2 0
2196+
"""
2197+
from pandas import DataFrame
2198+
2199+
data = {k: v.array.to_dense()
2200+
for k, v in compat.iteritems(self._parent)}
2201+
return DataFrame(data,
2202+
index=self._parent.index,
2203+
columns=self._parent.columns)
2204+
2205+
def to_coo(self):
2206+
"""
2207+
Return the contents of the frame as a sparse SciPy COO matrix.
2208+
2209+
.. versionadded:: 0.25.0
2210+
2211+
Returns
2212+
-------
2213+
coo_matrix : scipy.sparse.spmatrix
2214+
If the caller is heterogeneous and contains booleans or objects,
2215+
the result will be of dtype=object. See Notes.
2216+
2217+
Notes
2218+
-----
2219+
The dtype will be the lowest-common-denominator type (implicit
2220+
upcasting); that is to say if the dtypes (even of numeric types)
2221+
are mixed, the one that accommodates all will be chosen.
2222+
2223+
e.g. If the dtypes are float16 and float32, dtype will be upcast to
2224+
float32. By numpy.find_common_type convention, mixing int64 and
2225+
uint64 will result in a float64 dtype.
2226+
"""
2227+
try:
2228+
from scipy.sparse import coo_matrix
2229+
except ImportError:
2230+
raise ImportError('Scipy is not installed')
2231+
2232+
dtype = find_common_type(self._parent.dtypes)
2233+
if isinstance(dtype, SparseDtype):
2234+
dtype = dtype.subtype
2235+
2236+
cols, rows, datas = [], [], []
2237+
for col, name in enumerate(self._parent):
2238+
s = self._parent[name]
2239+
row = s.array.sp_index.to_int_index().indices
2240+
cols.append(np.repeat(col, len(row)))
2241+
rows.append(row)
2242+
datas.append(s.array.sp_values.astype(dtype, copy=False))
2243+
2244+
cols = np.concatenate(cols)
2245+
rows = np.concatenate(rows)
2246+
datas = np.concatenate(datas)
2247+
return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)
2248+
2249+
@property
2250+
def density(self):
2251+
"""
2252+
Ratio of non-sparse points to total (dense) data points
2253+
represented in the DataFrame.
2254+
"""
2255+
return np.mean([column.array.density
2256+
for _, column in self._parent.iteritems()])
2257+
2258+
@staticmethod
2259+
def _prep_index(data, index, columns):
2260+
import pandas.core.indexes.base as ibase
2261+
2262+
N, K = data.shape
2263+
if index is None:
2264+
index = ibase.default_index(N)
2265+
if columns is None:
2266+
columns = ibase.default_index(K)
2267+
2268+
if len(columns) != K:
2269+
raise ValueError('Column length mismatch: {columns} vs. {K}'
2270+
.format(columns=len(columns), K=K))
2271+
if len(index) != N:
2272+
raise ValueError('Index length mismatch: {index} vs. {N}'
2273+
.format(index=len(index), N=N))
2274+
return index, columns

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from pandas import compat
3737
from pandas.compat import PY36, lmap, lzip, raise_with_traceback
3838
from pandas.compat.numpy import function as nv
39+
from pandas.core.arrays.sparse import SparseFrameAccessor
3940
from pandas.core.dtypes.cast import (
4041
maybe_upcast,
4142
cast_scalar_to_array,
@@ -8034,6 +8035,7 @@ def isin(self, values):
80348035
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80358036
hist = gfx.hist_frame
80368037
boxplot = gfx.boxplot_frame
8038+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80378039

80388040

80398041
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

0 commit comments

Comments
 (0)