Skip to content

API: DataFrame.sparse accessor #25682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
May 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
DataFrame.boxplot
DataFrame.hist


.. _api.frame.sparse:

Sparse Accessor
~~~~~~~~~~~~~~~

Sparse-dtype specific methods and attributes are provided under the
``DataFrame.sparse`` accessor.

.. autosummary::
:toctree: api/
:template: autosummary/accessor_attribute.rst

DataFrame.sparse.density

.. autosummary::
:toctree: api/

DataFrame.sparse.from_spmatrix
DataFrame.sparse.to_coo
DataFrame.sparse.to_dense


Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Other Enhancements
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)
Expand Down
263 changes: 252 additions & 11 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,55 @@ def _simple_new(
new._dtype = dtype
return new

@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing, and every stored value needs exactly one position.
    # Take both from a single canonical CSC copy: ``data.nonzero()``
    # skips explicitly stored zeros, which would leave the positions
    # out of step with ``data.data``. Working on a private copy also
    # keeps the in-place index sort from touching the caller's matrix.
    data = data.tocsc(copy=True)
    data.sort_indices()
    arr = data.data
    idx = data.indices

    # The fill value is the zero of the matrix's own dtype.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)

def __array__(self, dtype=None, copy=True):
fill_value = self.fill_value

Expand Down Expand Up @@ -1906,27 +1955,32 @@ def _make_index(length, indices, kind):
# ----------------------------------------------------------------------------
# Accessor


class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    Stores the accessed object on ``_parent`` and then runs the
    ``_validate`` hook on it.
    """

    # Message used by subclasses when the accessed data is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation runs.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validation hook; the base implementation is not implemented."""
        raise NotImplementedError


@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
'sp_values'],
typ='property')
class SparseAccessor(PandasDelegate):
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for SparseSparse from other sparse matrix data types.
"""

def __init__(self, data=None):
self._validate(data)
# Store the Series since we need that for to_coo
self._parent = data

@staticmethod
def _validate(data):
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
msg = "Can only use the '.sparse' accessor with Sparse data."
raise AttributeError(msg)
raise AttributeError(self._validation_msg)

def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.values, name)
return getattr(self._parent.array, name)

def _delegate_method(self, name, *args, **kwargs):
if name == 'from_coo':
Expand Down Expand Up @@ -2040,3 +2094,190 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
column_levels,
sort_labels=sort_labels)
return A, rows, columns

def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name, stored as a
        dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.

.. versionadded :: 0.25.0
"""

def _validate(self, data):
dtypes = data.dtypes
if not all(isinstance(t, SparseDtype) for t in dtypes):
raise AttributeError(self._validation_msg)

@classmethod
def from_spmatrix(cls, data, index=None, columns=None):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am assuming you are defining this here because then we can simply deprecate SparseDataFrame as this is much simpler / direct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, this is the replacement for SparseDataFrame(sp_matrix).

Create a new DataFrame from a scipy sparse matrix.

.. versionadded:: 0.25.0

Parameters
----------
data : scipy.sparse.spmatrix
Must be convertible to csc format.
index, columns : Index, optional
Row and column labels to use for the resulting DataFrame.
Defaults to a RangeIndex.

Returns
-------
DataFrame
Each column of the DataFrame is stored as a
:class:`SparseArray`.

Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
"""
from pandas import DataFrame

data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
sparrays = [
SparseArray.from_spmatrix(data[:, i])
for i in range(data.shape[1])
]
data = dict(enumerate(sparrays))
result = DataFrame(data, index=index)
result.columns = columns
return result

def to_dense(self):
"""
Convert a DataFrame with sparse values to dense.

.. versionadded:: 0.25.0

Returns
-------
DataFrame
A DataFrame with the same values stored as dense arrays.

Examples
--------
>>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
>>> df.sparse.to_dense()
A
0 0
1 1
2 0
"""
from pandas import DataFrame

data = {k: v.array.to_dense()
for k, v in self._parent.items()}
return DataFrame(data,
index=self._parent.index,
columns=self._parent.columns)

def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.

.. versionadded:: 0.25.0

Returns
-------
coo_matrix : scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.

Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.

e.g. If the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
and uint64 will result in a float64 dtype.
"""
try:
from scipy.sparse import coo_matrix
except ImportError:
raise ImportError('Scipy is not installed')

dtype = find_common_type(self._parent.dtypes)
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype

cols, rows, datas = [], [], []
for col, name in enumerate(self._parent):
s = self._parent[name]
row = s.array.sp_index.to_int_index().indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
datas.append(s.array.sp_values.astype(dtype, copy=False))

cols = np.concatenate(cols)
rows = np.concatenate(rows)
datas = np.concatenate(datas)
return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

@property
def density(self) -> float:
"""
Ratio of non-sparse points to total (dense) data points
represented in the DataFrame.
"""
return np.mean([column.array.density
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would not taking the mean, and returning a Series instead, be more useful?

for _, column in self._parent.items()])

@staticmethod
def _prep_index(data, index, columns):
import pandas.core.indexes.base as ibase

N, K = data.shape
if index is None:
index = ibase.default_index(N)
if columns is None:
columns = ibase.default_index(K)

if len(columns) != K:
raise ValueError('Column length mismatch: {columns} vs. {K}'
.format(columns=len(columns), K=K))
if len(index) != N:
raise ValueError('Index length mismatch: {index} vs. {N}'
.format(index=len(index), N=N))
return index, columns
2 changes: 2 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

from pandas.compat import PY36, raise_with_traceback
from pandas.compat.numpy import function as nv
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -8027,6 +8028,7 @@ def isin(self, values):
plot = CachedAccessor("plot", gfx.FramePlotMethods)
hist = gfx.hist_frame
boxplot = gfx.boxplot_frame
sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
Expand Down
Loading