Skip to content

Commit 44b743d

Browse files
TomAugspurger authored and tm9k1 committed
API: Add sparse Accessor (pandas-dev#23183)
1 parent eb02b72 commit 44b743d

File tree

9 files changed

+281
-95
lines changed

9 files changed

+281
-95
lines changed

doc/source/api.rst

+16
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,22 @@ Sparse
851851
SparseSeries.to_coo
852852
SparseSeries.from_coo
853853

854+
.. autosummary::
855+
:toctree: generated/
856+
:template: autosummary/accessor_attribute.rst
857+
858+
Series.sparse.npoints
859+
Series.sparse.density
860+
Series.sparse.fill_value
861+
Series.sparse.sp_values
862+
863+
864+
.. autosummary::
865+
:toctree: generated/
866+
867+
Series.sparse.from_coo
868+
Series.sparse.to_coo
869+
854870
.. _api.dataframe:
855871

856872
DataFrame

doc/source/sparse.rst

+20
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling
6262
6363
sts.to_dense()
6464
65+
.. _sparse.accessor:
66+
67+
Sparse Accessor
68+
---------------
69+
70+
.. versionadded:: 0.24.0
71+
72+
Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
73+
for categorical data, and ``.dt`` for datetime-like data. This namespace provides
74+
attributes and methods that are specific to sparse data.
75+
76+
.. ipython:: python
77+
78+
s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
79+
s.sparse.density
80+
s.sparse.fill_value
81+
82+
This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
83+
class itself for creating a Series with sparse data from a scipy COO matrix.
84+
6585
.. _sparse.array:
6686

6787
SparseArray

doc/source/whatsnew/v0.24.0.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -533,14 +533,20 @@ changes were made:
533533
- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
534534
- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
535535

536-
537536
Some new warnings are issued for operations that require or are likely to materialize a large dense array:
538537

539538
- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array.
540539
- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used.
541540

542541
In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`.
543542

543+
Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`.
544+
545+
.. ipython:: python
546+
547+
s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
548+
s.sparse.density
549+
544550
.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:
545551

546552
Raise ValueError in ``DataFrame.to_dict(orient='index')``

pandas/core/accessor.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
113113
114114
Parameters
115115
----------
116-
delegate : the class to get methods/properties & doc-strings
117-
acccessors : string list of accessors to add
118-
typ : 'property' or 'method'
116+
delegate : object
117+
the class to get methods/properties & doc-strings
118+
accessors : Sequence[str]
119+
List of accessors to add
120+
typ : {'property', 'method'}
119121
overwrite : boolean, default False
120122
overwrite the method/property in the target class if it exists
121123
122124
Returns
123125
-------
124-
decorator
126+
callable
127+
A class decorator.
125128
126129
Examples
127130
--------

pandas/core/arrays/sparse.py

+174
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.errors import PerformanceWarning
1818
from pandas.compat.numpy import function as nv
1919

20+
from pandas.core.accessor import PandasDelegate, delegate_names
2021
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
2122
import pandas.core.common as com
2223
from pandas.core.dtypes.base import ExtensionDtype
@@ -178,6 +179,7 @@ def _is_boolean(self):
178179

179180
@property
180181
def kind(self):
182+
"""The sparse kind. Either 'integer' or 'block'."""
181183
return self.subtype.kind
182184

183185
@property
@@ -648,10 +650,22 @@ def _from_factorized(cls, values, original):
648650
# ------------------------------------------------------------------------
649651
@property
650652
def sp_index(self):
653+
"""
654+
The SparseIndex containing the location of non- ``fill_value`` points.
655+
"""
651656
return self._sparse_index
652657

653658
@property
654659
def sp_values(self):
660+
"""
661+
An ndarray containing the non- ``fill_value`` values.
662+
663+
Examples
664+
--------
665+
>>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
666+
>>> s.sp_values
667+
array([1, 2])
668+
"""
655669
return self._sparse_values
656670

657671
@property
@@ -704,6 +718,31 @@ def _fill_value_matches(self, fill_value):
704718
def nbytes(self):
705719
return self.sp_values.nbytes + self.sp_index.nbytes
706720

721+
@property
722+
def density(self):
723+
"""The fraction of non- ``fill_value`` points, as a decimal.
724+
725+
Examples
726+
--------
727+
>>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
728+
>>> s.density
729+
0.6
730+
"""
731+
r = float(self.sp_index.npoints) / float(self.sp_index.length)
732+
return r
733+
734+
@property
735+
def npoints(self):
736+
"""The number of non- ``fill_value`` points.
737+
738+
Examples
739+
--------
740+
>>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
741+
>>> s.npoints
742+
3
743+
"""
744+
return self.sp_index.npoints
745+
707746
@property
708747
def values(self):
709748
"""
@@ -1744,3 +1783,138 @@ def _make_index(length, indices, kind):
17441783
else: # pragma: no cover
17451784
raise ValueError('must be block or integer type')
17461785
return index
1786+
1787+
1788+
# ----------------------------------------------------------------------------
1789+
# Accessor
1790+
1791+
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
1792+
'sp_values'],
1793+
typ='property')
1794+
class SparseAccessor(PandasDelegate):
1795+
def __init__(self, data=None):
1796+
self._validate(data)
1797+
# Store the Series since we need that for to_coo
1798+
self._parent = data
1799+
1800+
@staticmethod
1801+
def _validate(data):
1802+
if not isinstance(data.dtype, SparseDtype):
1803+
msg = "Can only use the '.sparse' accessor with Sparse data."
1804+
raise AttributeError(msg)
1805+
1806+
def _delegate_property_get(self, name, *args, **kwargs):
1807+
return getattr(self._parent.values, name)
1808+
1809+
def _delegate_method(self, name, *args, **kwargs):
1810+
if name == 'from_coo':
1811+
return self.from_coo(*args, **kwargs)
1812+
elif name == 'to_coo':
1813+
return self.to_coo(*args, **kwargs)
1814+
else:
1815+
raise ValueError
1816+
1817+
@classmethod
1818+
def from_coo(cls, A, dense_index=False):
1819+
"""
1820+
Create a Series with sparse values from a scipy.sparse.coo_matrix.
1821+
1822+
Parameters
1823+
----------
1824+
A : scipy.sparse.coo_matrix
1825+
dense_index : bool, default False
1826+
If False (default), the Series index consists of only the
1827+
coords of the non-null entries of the original coo_matrix.
1828+
If True, the Series index consists of the full sorted
1829+
(row, col) coordinates of the coo_matrix.
1830+
1831+
Returns
1832+
-------
1833+
s : Series
1834+
1835+
Examples
1836+
--------
1837+
>>> from scipy import sparse
1838+
>>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
1839+
shape=(3, 4))
1840+
>>> A
1841+
<3x4 sparse matrix of type '<class 'numpy.float64'>'
1842+
with 3 stored elements in COOrdinate format>
1843+
>>> A.todense()
1844+
matrix([[ 0., 0., 1., 2.],
1845+
[ 3., 0., 0., 0.],
1846+
[ 0., 0., 0., 0.]])
1847+
>>> ss = pd.Series.sparse.from_coo(A)
1848+
>>> ss
1849+
0 2 1
1850+
3 2
1851+
1 0 3
1852+
dtype: float64
1853+
BlockIndex
1854+
Block locations: array([0], dtype=int32)
1855+
Block lengths: array([3], dtype=int32)
1856+
"""
1857+
from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
1858+
from pandas import Series
1859+
1860+
result = _coo_to_sparse_series(A, dense_index=dense_index)
1861+
# SparseSeries -> Series[sparse]
1862+
result = Series(result.values, index=result.index, copy=False)
1863+
1864+
return result
1865+
1866+
def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
1867+
"""
1868+
Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
1869+
1870+
Use row_levels and column_levels to determine the row and column
1871+
coordinates respectively. row_levels and column_levels are the names
1872+
(labels) or numbers of the levels. {row_levels, column_levels} must be
1873+
a partition of the MultiIndex level names (or numbers).
1874+
1875+
Parameters
1876+
----------
1877+
row_levels : tuple/list
1878+
column_levels : tuple/list
1879+
sort_labels : bool, default False
1880+
Sort the row and column labels before forming the sparse matrix.
1881+
1882+
Returns
1883+
-------
1884+
y : scipy.sparse.coo_matrix
1885+
rows : list (row labels)
1886+
columns : list (column labels)
1887+
1888+
Examples
1889+
--------
1890+
>>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
1891+
>>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
1892+
(1, 2, 'a', 1),
1893+
(1, 1, 'b', 0),
1894+
(1, 1, 'b', 1),
1895+
(2, 1, 'b', 0),
1896+
(2, 1, 'b', 1)],
1897+
names=['A', 'B', 'C', 'D'])
1898+
>>> ss = s.to_sparse()
1899+
>>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
1900+
column_levels=['C', 'D'],
1901+
sort_labels=True)
1902+
>>> A
1903+
<3x4 sparse matrix of type '<class 'numpy.float64'>'
1904+
with 3 stored elements in COOrdinate format>
1905+
>>> A.todense()
1906+
matrix([[ 0., 0., 1., 3.],
1907+
[ 3., 0., 0., 0.],
1908+
[ 0., 0., 0., 0.]])
1909+
>>> rows
1910+
[(1, 1), (1, 2), (2, 1)]
1911+
>>> columns
1912+
[('a', 0), ('a', 1), ('b', 0), ('b', 1)]
1913+
"""
1914+
from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
1915+
1916+
A, rows, columns = _sparse_series_to_coo(self._parent,
1917+
row_levels,
1918+
column_levels,
1919+
sort_labels=sort_labels)
1920+
return A, rows, columns

pandas/core/indexes/accessors.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
datetimelike delegation
33
"""
4-
54
import numpy as np
65

76
from pandas.core.dtypes.generic import ABCSeries

pandas/core/series.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pandas.core.accessor import CachedAccessor
2727
from pandas.core.arrays import ExtensionArray, period_array
2828
from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
29+
from pandas.core.arrays.sparse import SparseAccessor
2930
from pandas.core.config import get_option
3031
from pandas.core.dtypes.cast import (
3132
construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
@@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
142143
Copy input data
143144
"""
144145
_metadata = ['name']
145-
_accessors = {'dt', 'cat', 'str'}
146+
_accessors = {'dt', 'cat', 'str', 'sparse'}
146147
_deprecations = generic.NDFrame._deprecations | frozenset(
147148
['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value',
148149
'from_csv', 'valid'])
@@ -4151,6 +4152,7 @@ def to_period(self, freq=None, copy=True):
41514152
dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
41524153
cat = CachedAccessor("cat", CategoricalAccessor)
41534154
plot = CachedAccessor("plot", gfx.SeriesPlotMethods)
4155+
sparse = CachedAccessor("sparse", SparseAccessor)
41544156

41554157
# ----------------------------------------------------------------------
41564158
# Add plotting methods to Series

0 commit comments

Comments
 (0)