Skip to content

Commit 48b8a8d

Browse files
committed
Squashed commit of the following:
commit 8b136bf Merge: 3005aed 01d3dc2 Author: Tom Augspurger <[email protected]> Date: Fri Mar 15 16:03:23 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 3005aed Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:26:32 2019 -0500 isort? commit 318c06f Merge: 0922296 79205ea Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:25:45 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 0922296 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 21:35:51 2019 -0500 updates commit f433be8 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:54:07 2019 -0500 lint commit 6696f28 Merge: 534a379 1017382 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:53:13 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 534a379 Merge: 94a7baf 5c341dc Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:37:27 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 94a7baf Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:22:48 2019 -0500 fixups commit 6f619b5 Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 13:38:48 2019 -0500 32-bit compat commit 24f48c3 Author: Tom Augspurger <[email protected]> Date: Mon Mar 11 22:05:46 2019 -0500 API: DataFrame.sparse accessor Closes pandas-dev#25681
1 parent e02ec8f commit 48b8a8d

File tree

7 files changed

+391
-82
lines changed

7 files changed

+391
-82
lines changed

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
312312
DataFrame.boxplot
313313
DataFrame.hist
314314

315+
316+
.. _api.frame.sparse:
317+
318+
Sparse Accessor
319+
~~~~~~~~~~~~~~~
320+
321+
Sparse-dtype specific methods and attributes are provided under the
322+
``DataFrame.sparse`` accessor.
323+
324+
.. autosummary::
325+
:toctree: api/
326+
:template: autosummary/accessor_attribute.rst
327+
328+
DataFrame.sparse.density
329+
330+
.. autosummary::
331+
:toctree: api/
332+
333+
DataFrame.sparse.from_spmatrix
334+
DataFrame.sparse.to_coo
335+
DataFrame.sparse.to_dense
336+
337+
315338
Serialization / IO / Conversion
316339
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317340
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Other Enhancements
3333
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
3434
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
3535
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
36+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
3637
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3738
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
3839
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)

pandas/core/arrays/sparse.py

+250-11
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,55 @@ def _simple_new(
689689
new._dtype = dtype
690690
return new
691691

692+
@classmethod
693+
def from_spmatrix(cls, data):
694+
"""
695+
Create a SparseArray from a scipy.sparse matrix.
696+
697+
.. versionadded:: 0.25.0
698+
699+
Parameters
700+
----------
701+
data : scipy.sparse.spmatrix
702+
This should be a SciPy sparse matrix where the size
703+
of the second dimension is 1. In other words, a
704+
sparse matrix with a single column.
705+
706+
Returns
707+
-------
708+
SparseArray
709+
710+
Examples
711+
--------
712+
>>> import scipy.sparse
713+
>>> mat = scipy.sparse.coo_matrix((4, 1))
714+
>>> pd.SparseArray.from_spmatrix(mat)
715+
[0.0, 0.0, 0.0, 0.0]
716+
Fill: 0.0
717+
IntIndex
718+
Indices: array([], dtype=int32)
719+
"""
720+
length, ncol = data.shape
721+
722+
if ncol != 1:
723+
raise ValueError(
724+
"'data' must have a single column, not '{}'".format(ncol)
725+
)
726+
727+
# our sparse index classes require that the positions be strictly
728+
# increasing. So we need to sort idx, and reorder arr accordingly.
729+
arr = data.data
730+
idx, _ = data.nonzero()
731+
loc = np.argsort(idx)
732+
arr = arr.take(loc)
733+
idx.sort()
734+
735+
zero = np.array(0, dtype=arr.dtype).item()
736+
dtype = SparseDtype(arr.dtype, zero)
737+
index = IntIndex(length, idx)
738+
739+
return cls._simple_new(arr, index, dtype)
740+
692741
def __array__(self, dtype=None, copy=True):
693742
fill_value = self.fill_value
694743

@@ -1900,27 +1949,32 @@ def _make_index(length, indices, kind):
19001949
# ----------------------------------------------------------------------------
19011950
# Accessor
19021951

1952+
1953+
class BaseAccessor(object):
1954+
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
1955+
1956+
def __init__(self, data=None):
1957+
self._parent = data
1958+
self._validate(data)
1959+
1960+
def _validate(self, data):
1961+
raise NotImplementedError
1962+
1963+
19031964
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
19041965
'sp_values'],
19051966
typ='property')
1906-
class SparseAccessor(PandasDelegate):
1967+
class SparseAccessor(BaseAccessor, PandasDelegate):
19071968
"""
19081969
Accessor for sparse data, with conversion to and from other sparse matrix data types.
19091970
"""
19101971

1911-
def __init__(self, data=None):
1912-
self._validate(data)
1913-
# Store the Series since we need that for to_coo
1914-
self._parent = data
1915-
1916-
@staticmethod
1917-
def _validate(data):
1972+
def _validate(self, data):
19181973
if not isinstance(data.dtype, SparseDtype):
1919-
msg = "Can only use the '.sparse' accessor with Sparse data."
1920-
raise AttributeError(msg)
1974+
raise AttributeError(self._validation_msg)
19211975

19221976
def _delegate_property_get(self, name, *args, **kwargs):
1923-
return getattr(self._parent.values, name)
1977+
return getattr(self._parent.array, name)
19241978

19251979
def _delegate_method(self, name, *args, **kwargs):
19261980
if name == 'from_coo':
@@ -2034,3 +2088,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20342088
column_levels,
20352089
sort_labels=sort_labels)
20362090
return A, rows, columns
2091+
2092+
def to_dense(self):
2093+
"""
2094+
Convert a Series from sparse values to dense.
2095+
2096+
.. versionadded:: 0.25.0
2097+
2098+
Returns
2099+
-------
2100+
Series
2101+
A Series with the same values, stored as a dense array.
2102+
2103+
Examples
2104+
--------
2105+
>>> series = pd.Series(pd.SparseArray([0, 1, 0]))
2106+
>>> series
2107+
0 0
2108+
1 1
2109+
2 0
2110+
dtype: Sparse[int64, 0]
2111+
2112+
>>> series.sparse.to_dense()
2113+
0 0
2114+
1 1
2115+
2 0
2116+
dtype: int64
2117+
"""
2118+
from pandas import Series
2119+
return Series(self._parent.array.to_dense(),
2120+
index=self._parent.index,
2121+
name=self._parent.name)
2122+
2123+
2124+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
2125+
"""
2126+
DataFrame accessor for sparse data.
2127+
2128+
.. versionadded:: 0.25.0
2129+
"""
2130+
2131+
def _validate(self, data):
2132+
dtypes = data.dtypes
2133+
if not all(isinstance(t, SparseDtype) for t in dtypes):
2134+
raise AttributeError(self._validation_msg)
2135+
2136+
@classmethod
2137+
def from_spmatrix(cls, data, index=None, columns=None):
2138+
"""
2139+
Create a new DataFrame from a scipy sparse matrix.
2140+
2141+
.. versionadded:: 0.25.0
2142+
2143+
Parameters
2144+
----------
2145+
data : scipy.sparse.spmatrix
2146+
Must be convertible to csc format.
2147+
index, columns : Index, optional
2148+
Row and column labels to use for the resulting DataFrame.
2149+
Defaults to a RangeIndex.
2150+
2151+
Returns
2152+
-------
2153+
DataFrame
2154+
Each column of the DataFrame is stored as a
2155+
:class:`SparseArray`.
2156+
2157+
Examples
2158+
--------
2159+
>>> import scipy.sparse
2160+
>>> mat = scipy.sparse.eye(3)
2161+
>>> pd.DataFrame.sparse.from_spmatrix(mat)
2162+
0 1 2
2163+
0 1.0 0.0 0.0
2164+
1 0.0 1.0 0.0
2165+
2 0.0 0.0 1.0
2166+
"""
2167+
from pandas import DataFrame
2168+
2169+
data = data.tocsc()
2170+
index, columns = cls._prep_index(data, index, columns)
2171+
sparrays = [
2172+
SparseArray.from_spmatrix(data[:, i])
2173+
for i in range(data.shape[1])
2174+
]
2175+
data = dict(zip(columns, sparrays))
2176+
return DataFrame(data, index=index)
2177+
2178+
def to_dense(self):
2179+
"""
2180+
Convert a DataFrame with sparse values to dense.
2181+
2182+
.. versionadded:: 0.25.0
2183+
2184+
Returns
2185+
-------
2186+
DataFrame
2187+
A DataFrame with the same values stored as dense arrays.
2188+
2189+
Examples
2190+
--------
2191+
>>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
2192+
>>> df.sparse.to_dense()
2193+
A
2194+
0 0
2195+
1 1
2196+
2 0
2197+
"""
2198+
from pandas import DataFrame
2199+
2200+
data = {k: v.array.to_dense()
2201+
for k, v in compat.iteritems(self._parent)}
2202+
return DataFrame(data,
2203+
index=self._parent.index,
2204+
columns=self._parent.columns)
2205+
2206+
def to_coo(self):
2207+
"""
2208+
Return the contents of the frame as a sparse SciPy COO matrix.
2209+
2210+
.. versionadded:: 0.25.0
2211+
2212+
Returns
2213+
-------
2214+
coo_matrix : scipy.sparse.spmatrix
2215+
If the caller is heterogeneous and contains booleans or objects,
2216+
the result will be of dtype=object. See Notes.
2217+
2218+
Notes
2219+
-----
2220+
The dtype will be the lowest-common-denominator type (implicit
2221+
upcasting); that is to say if the dtypes (even of numeric types)
2222+
are mixed, the one that accommodates all will be chosen.
2223+
2224+
e.g. If the dtypes are float16 and float32, dtype will be upcast to
2225+
float32. By numpy.find_common_type convention, mixing int64 and
2226+
uint64 will result in a float64 dtype.
2227+
"""
2228+
try:
2229+
from scipy.sparse import coo_matrix
2230+
except ImportError:
2231+
raise ImportError('Scipy is not installed')
2232+
2233+
dtype = find_common_type(self._parent.dtypes)
2234+
if isinstance(dtype, SparseDtype):
2235+
dtype = dtype.subtype
2236+
2237+
cols, rows, datas = [], [], []
2238+
for col, name in enumerate(self._parent):
2239+
s = self._parent[name]
2240+
row = s.array.sp_index.to_int_index().indices
2241+
cols.append(np.repeat(col, len(row)))
2242+
rows.append(row)
2243+
datas.append(s.array.sp_values.astype(dtype, copy=False))
2244+
2245+
cols = np.concatenate(cols)
2246+
rows = np.concatenate(rows)
2247+
datas = np.concatenate(datas)
2248+
return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)
2249+
2250+
@property
2251+
def density(self):
2252+
"""
2253+
Ratio of non-sparse points to total (dense) data points
2254+
represented in the DataFrame.
2255+
"""
2256+
return np.mean([column.array.density
2257+
for _, column in self._parent.iteritems()])
2258+
2259+
@staticmethod
2260+
def _prep_index(data, index, columns):
2261+
import pandas.core.indexes.base as ibase
2262+
2263+
N, K = data.shape
2264+
if index is None:
2265+
index = ibase.default_index(N)
2266+
if columns is None:
2267+
columns = ibase.default_index(K)
2268+
2269+
if len(columns) != K:
2270+
raise ValueError('Column length mismatch: {columns} vs. {K}'
2271+
.format(columns=len(columns), K=K))
2272+
if len(index) != N:
2273+
raise ValueError('Index length mismatch: {index} vs. {N}'
2274+
.format(index=len(index), N=N))
2275+
return index, columns

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from pandas import compat
3737
from pandas.compat import PY36, lmap, lzip, raise_with_traceback
3838
from pandas.compat.numpy import function as nv
39+
from pandas.core.arrays.sparse import SparseFrameAccessor
3940
from pandas.core.dtypes.cast import (
4041
maybe_upcast,
4142
cast_scalar_to_array,
@@ -8034,6 +8035,7 @@ def isin(self, values):
80348035
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80358036
hist = gfx.hist_frame
80368037
boxplot = gfx.boxplot_frame
8038+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80378039

80388040

80398041
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

0 commit comments

Comments
 (0)