Skip to content

CLN: clean-up internal sparse imports + restructure sparse submodule #28516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,10 @@ Deprecations
Removed SparseSeries and SparseDataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

``SparseSeries`` and ``SparseDataFrame`` have been removed (:issue:`28425`).
We recommend using a ``Series`` or ``DataFrame`` with sparse values instead.
See :ref:`sparse.migration` for help with migrating existing code.
``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method
have been removed (:issue:`28425`). We recommend using a ``Series`` or
``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help
with migrating existing code.

Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
DataFrame,
)

from pandas.core.sparse.api import SparseArray, SparseDtype
from pandas.core.arrays.sparse import SparseArray, SparseDtype

from pandas.tseries.api import infer_freq
from pandas.tseries import offsets
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/sparse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# flake8: noqa: F401

from .accessor import SparseAccessor, SparseFrameAccessor
from .array import BlockIndex, IntIndex, SparseArray, _make_index
from .dtype import SparseDtype
336 changes: 336 additions & 0 deletions pandas/core/arrays/sparse/accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
"""Sparse accessor"""

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import find_common_type

from pandas.core.accessor import PandasDelegate, delegate_names

from .array import SparseArray
from .dtype import SparseDtype


class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    Stores the wrapped object on ``_parent`` and then runs ``_validate``
    on it. Subclasses must override ``_validate``.
    """

    # Message subclasses use when the wrapped data is not sparse.
    _validation_msg = (
        "Can only use the '.sparse' accessor "
        "with Sparse data."
    )

    def __init__(self, data=None):
        # The parent is stored before validation runs, so ``_validate``
        # implementations may read ``self._parent`` as well as ``data``.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Check that ``data`` may be used with the accessor (abstract)."""
        raise NotImplementedError


@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data on a Series, and for conversion to and from
    other sparse matrix data types.
    """

    def _validate(self, data):
        # The accessor only applies when the values carry a SparseDtype.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        # Delegated properties are read straight off the backing array.
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Same exception type as before, but say which name was rejected.
            raise ValueError(
                "Cannot delegate method '{name}' through the '.sparse' "
                "accessor.".format(name=name)
            )

    @classmethod
    def from_coo(cls, A, dense_index=False):
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the Series index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the Series index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse
        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
        ...                       shape=(3, 4))
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  2.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series
        from pandas import Series

        result = _coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the helper's result as a plain Series over the same array.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
        ...                                      (1, 2, 'a', 1),
        ...                                      (1, 1, 'b', 0),
        ...                                      (1, 1, 'b', 1),
        ...                                      (2, 1, 'b', 0),
        ...                                      (2, 1, 'b', 1)],
        ...                                     names=['A', 'B', 'C', 'D'])
        >>> ss = s.astype("Sparse")
        >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
        ...                                     column_levels=['C', 'D'],
        ...                                     sort_labels=True)
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  3.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo

        A, rows, columns = _sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self):
        """
        Convert a Series from sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        Series
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Keep the parent's index and name; only the storage changes.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
        )


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must be sparse for the accessor to apply.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])
        ]
        # Build with positional keys and label afterwards, so the requested
        # column labels never act as dictionary keys.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key the columns by position rather than by label: with duplicate
        # column labels a label-keyed dict would collapse them and one
        # column's data would silently replace another's.
        dense_columns = {
            position: column.array.to_dense()
            for position, (_, column) in enumerate(self._parent.items())
        }
        result = DataFrame(dense_columns, index=self._parent.index)
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Only the explicitly stored values (``sp_values``) of each column are
        written to the matrix; positions holding a column's fill value are
        not stored in the result.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        # Pass a plain list: the dtypes Series is indexed by column label,
        # which makes positional access and truth-testing on it unreliable.
        dtype = find_common_type(self._parent.dtypes.tolist())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Walk the columns positionally; looking them up by label would
        # return a DataFrame instead of a Series for duplicate labels.
        for col, (_, s) in enumerate(self._parent.items()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        # Computed as the mean of the per-column densities.
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default labels for ``data`` and check the label lengths.

        ``index`` and ``columns`` default to ``ibase.default_index`` of the
        matching dimension of ``data.shape``. A ValueError is raised when a
        supplied length does not match that dimension.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError(
                "Column length mismatch: {columns} vs. {K}".format(
                    columns=len(columns), K=K
                )
            )
        if len(index) != N:
            raise ValueError(
                "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N)
            )
        return index, columns
Loading