diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a5af4e727391a..b890278d9ca30 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -96,9 +96,10 @@ Deprecations Removed SparseSeries and SparseDataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``SparseSeries`` and ``SparseDataFrame`` have been removed (:issue:`28425`). -We recommend using a ``Series`` or ``DataFrame`` with sparse values instead. -See :ref:`sparse.migration` for help with migrating existing code. +``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method +have been removed (:issue:`28425`). We recommend using a ``Series`` or +``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help +with migrating existing code. Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/__init__.py b/pandas/__init__.py index 59ecc7f609ae9..6d0c55a45ed46 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -114,7 +114,7 @@ DataFrame, ) -from pandas.core.sparse.api import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseArray, SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py new file mode 100644 index 0000000000000..75f3819fb19fd --- /dev/null +++ b/pandas/core/arrays/sparse/__init__.py @@ -0,0 +1,5 @@ +# flake8: noqa: F401 + +from .accessor import SparseAccessor, SparseFrameAccessor +from .array import BlockIndex, IntIndex, SparseArray, _make_index +from .dtype import SparseDtype diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py new file mode 100644 index 0000000000000..57fd6d284af31 --- /dev/null +++ b/pandas/core/arrays/sparse/accessor.py @@ -0,0 +1,336 @@ +"""Sparse accessor""" + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import find_common_type + +from pandas.core.accessor import PandasDelegate, delegate_names + +from .array import SparseArray +from .dtype import SparseDtype + + +class BaseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + +@delegate_names( + SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property" +) +class SparseAccessor(BaseAccessor, PandasDelegate): + """ + Accessor for SparseSparse from other sparse matrix data types. + """ + + def _validate(self, data): + if not isinstance(data.dtype, SparseDtype): + raise AttributeError(self._validation_msg) + + def _delegate_property_get(self, name, *args, **kwargs): + return getattr(self._parent.array, name) + + def _delegate_method(self, name, *args, **kwargs): + if name == "from_coo": + return self.from_coo(*args, **kwargs) + elif name == "to_coo": + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a Series with sparse values from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the + coords of the non-null entries of the original coo_matrix. 
+ If True, the SparseSeries index consists of the full sorted + (row, col) coordinates of the coo_matrix. + + Returns + ------- + s : Series + A Series with sparse values. + + Examples + -------- + >>> from scipy import sparse + >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 2.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> ss = pd.Series.sparse.from_coo(A) + >>> ss + 0 2 1 + 3 2 + 1 0 3 + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([3], dtype=int32) + """ + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series + from pandas import Series + + result = _coo_to_sparse_series(A, dense_index=dense_index) + result = Series(result.array, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a Series with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + >>> ss = s.astype("Sparse") + >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], + ... column_levels=['C', 'D'], + ... sort_labels=True) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 3.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo + + A, rows, columns = _sparse_series_to_coo( + self._parent, row_levels, column_levels, sort_labels=sort_labels + ) + return A, rows, columns + + def to_dense(self): + """ + Convert a Series from sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series: + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ + from pandas import Series + + return Series( + self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name, + ) + + +class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + .. versionadded:: 0.25.0 + """ + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + """ + Create a new DataFrame from a scipy sparse matrix. + + .. 
versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + Each column of the DataFrame is stored as a + :class:`SparseArray`. + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] + data = dict(enumerate(sparrays)) + result = DataFrame(data, index=index) + result.columns = columns + return result + + def to_dense(self): + """ + Convert a DataFrame with sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + DataFrame + A DataFrame with the same values stored as dense arrays. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() for k, v in self._parent.items()} + return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.25.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + and uint64 will result in a float64 dtype. + """ + import_optional_dependency("scipy") + from scipy.sparse import coo_matrix + + dtype = find_common_type(self._parent.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, datas = [], [], [] + for col, name in enumerate(self._parent): + s = self._parent[name] + row = s.array.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.array.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) + + @property + def density(self) -> float: + """ + Ratio of non-sparse points to total (dense) data points + represented in the DataFrame. + """ + return np.mean([column.array.density for _, column in self._parent.items()]) + + @staticmethod + def _prep_index(data, index, columns): + import pandas.core.indexes.base as ibase + + N, K = data.shape + if index is None: + index = ibase.default_index(N) + if columns is None: + columns = ibase.default_index(K) + + if len(columns) != K: + raise ValueError( + "Column length mismatch: {columns} vs. {K}".format( + columns=len(columns), K=K + ) + ) + if len(index) != N: + raise ValueError( + "Index length mismatch: {index} vs. 
{N}".format(index=len(index), N=N) + ) + return index, columns diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse/array.py similarity index 71% rename from pandas/core/arrays/sparse.py rename to pandas/core/arrays/sparse/array.py index c88289c3a4592..5acc922734529 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse/array.py @@ -4,7 +4,6 @@ from collections import abc import numbers import operator -import re from typing import Any, Callable import warnings @@ -15,11 +14,9 @@ from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT import pandas.compat as compat -from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( astype_nansafe, construct_1d_arraylike_from_scalar, @@ -37,7 +34,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndexClass, @@ -46,8 +42,6 @@ ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna -from pandas._typing import Dtype -from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.base import PandasObject @@ -58,329 +52,7 @@ import pandas.io.formats.printing as printing - -# ---------------------------------------------------------------------------- -# Dtype -@register_extension_dtype -class SparseDtype(ExtensionDtype): - """ - Dtype for data stored in :class:`SparseArray`. - - This dtype implements the pandas ExtensionDtype interface. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 - The dtype of the underlying array storing the non-fill value values. - fill_value : scalar, optional - The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. - - =========== ========== - dtype na_value - =========== ========== - float ``np.nan`` - int ``0`` - bool ``False`` - datetime64 ``pd.NaT`` - timedelta64 ``pd.NaT`` - =========== ========== - - The default value may be overridden by specifying a `fill_value`. - - Attributes - ---------- - None - - Methods - ------- - None - """ - - # We include `_is_na_fill_value` in the metadata to avoid hash collisions - # between SparseDtype(float, 0.0) and SparseDtype(float, nan). - # Without is_na_fill_value in the comparison, those would be equal since - # hash(nan) is (sometimes?) 0. - _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: - - if isinstance(dtype, type(self)): - if fill_value is None: - fill_value = dtype.fill_value - dtype = dtype.subtype - - dtype = pandas_dtype(dtype) - if is_string_dtype(dtype): - dtype = np.dtype("object") - - if fill_value is None: - fill_value = na_value_for_dtype(dtype) - - if not is_scalar(fill_value): - raise ValueError( - "fill_value must be a scalar. Got {} instead".format(fill_value) - ) - self._dtype = dtype - self._fill_value = fill_value - - def __hash__(self): - # Python3 doesn't inherit __hash__ when a base class overrides - # __eq__, so we explicitly do it here. 
- return super().__hash__() - - def __eq__(self, other): - # We have to override __eq__ to handle NA values in _metadata. - # The base class does simple == checks, which fail for NA. - if isinstance(other, str): - try: - other = self.construct_from_string(other) - except TypeError: - return False - - if isinstance(other, type(self)): - subtype = self.subtype == other.subtype - if self._is_na_fill_value: - # this case is complicated by two things: - # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) - # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) - # i.e. we want to treat any floating-point NaN as equal, but - # not a floating-point NaN and a datetime NaT. - fill_value = ( - other._is_na_fill_value - and isinstance(self.fill_value, type(other.fill_value)) - or isinstance(other.fill_value, type(self.fill_value)) - ) - else: - fill_value = self.fill_value == other.fill_value - - return subtype and fill_value - return False - - @property - def fill_value(self): - """ - The fill value of the array. - - Converting the SparseArray to a dense ndarray will fill the - array with this value. - - .. warning:: - - It's possible to end up with a SparseArray that has ``fill_value`` - values in ``sp_values``. This can occur, for example, when setting - ``SparseArray.fill_value`` directly. - """ - return self._fill_value - - @property - def _is_na_fill_value(self): - return isna(self.fill_value) - - @property - def _is_numeric(self): - return not is_object_dtype(self.subtype) - - @property - def _is_boolean(self): - return is_bool_dtype(self.subtype) - - @property - def kind(self): - """ - The sparse kind. Either 'integer', or 'block'. - """ - return self.subtype.kind - - @property - def type(self): - return self.subtype.type - - @property - def subtype(self): - return self._dtype - - @property - def name(self): - return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value) - - def __repr__(self): - return self.name - - @classmethod - def construct_array_type(cls): - return SparseArray - - @classmethod - def construct_from_string(cls, string): - """ - Construct a SparseDtype from a string form. - - Parameters - ---------- - string : str - Can take the following forms. - - string dtype - ================ ============================ - 'int' SparseDtype[np.int64, 0] - 'Sparse' SparseDtype[np.float64, nan] - 'Sparse[int]' SparseDtype[np.int64, 0] - 'Sparse[int, 0]' SparseDtype[np.int64, 0] - ================ ============================ - - It is not possible to specify non-default fill values - with a string. An argument like ``'Sparse[int, 1]'`` - will raise a ``TypeError`` because the default fill value - for integers is 0. - - Returns - ------- - SparseDtype - """ - msg = "Could not construct SparseDtype from '{}'".format(string) - if string.startswith("Sparse"): - try: - sub_type, has_fill_value = cls._parse_subtype(string) - except ValueError: - raise TypeError(msg) - else: - result = SparseDtype(sub_type) - msg = ( - "Could not construct SparseDtype from '{}'.\n\nIt " - "looks like the fill_value in the string is not " - "the default for the dtype. Non-default fill_values " - "are not supported. Use the 'SparseDtype()' " - "constructor instead." 
- ) - if has_fill_value and str(result) != string: - raise TypeError(msg.format(string)) - return result - else: - raise TypeError(msg) - - @staticmethod - def _parse_subtype(dtype): - """ - Parse a string to get the subtype - - Parameters - ---------- - dtype : str - A string like - - * Sparse[subtype] - * Sparse[subtype, fill_value] - - Returns - ------- - subtype : str - - Raises - ------ - ValueError - When the subtype cannot be extracted. - """ - xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") - m = xpr.match(dtype) - has_fill_value = False - if m: - subtype = m.groupdict()["subtype"] - has_fill_value = m.groupdict()["fill_value"] or has_fill_value - elif dtype == "Sparse": - subtype = "float64" - else: - raise ValueError("Cannot parse {}".format(dtype)) - return subtype, has_fill_value - - @classmethod - def is_dtype(cls, dtype): - dtype = getattr(dtype, "dtype", dtype) - if isinstance(dtype, str) and dtype.startswith("Sparse"): - sub_type, _ = cls._parse_subtype(dtype) - dtype = np.dtype(sub_type) - elif isinstance(dtype, cls): - return True - return isinstance(dtype, np.dtype) or dtype == "Sparse" - - def update_dtype(self, dtype): - """ - Convert the SparseDtype to a new dtype. - - This takes care of converting the ``fill_value``. - - Parameters - ---------- - dtype : Union[str, numpy.dtype, SparseDtype] - The new dtype to use. - - * For a SparseDtype, it is simply returned - * For a NumPy dtype (or str), the current fill value - is converted to the new dtype, and a SparseDtype - with `dtype` and the new fill value is returned. - - Returns - ------- - SparseDtype - A new SparseDtype with the corret `dtype` and fill value - for that `dtype`. - - Raises - ------ - ValueError - When the current fill value cannot be converted to the - new `dtype` (e.g. trying to convert ``np.nan`` to an - integer dtype). - - - Examples - -------- - >>> SparseDtype(int, 0).update_dtype(float) - Sparse[float64, 0.0] - - >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) - Sparse[float64, nan] - """ - cls = type(self) - dtype = pandas_dtype(dtype) - - if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() - dtype = cls(dtype, fill_value=fill_value) - - return dtype - - @property - def _subtype_with_str(self): - """ - Whether the SparseDtype's subtype should be considered ``str``. - - Typically, pandas will store string data in an object-dtype array. - When converting values to a dtype, e.g. in ``.astype``, we need to - be more specific, we need the actual underlying type. - - Returns - ------- - - >>> SparseDtype(int, 1)._subtype_with_str - dtype('int64') - - >>> SparseDtype(object, 1)._subtype_with_str - dtype('O') - - >>> dtype = SparseDtype(str, '') - >>> dtype.subtype - dtype('O') - - >>> dtype._subtype_with_str - str - """ - if isinstance(self.fill_value, str): - return type(self.fill_value) - return self.subtype - +from .dtype import SparseDtype # ---------------------------------------------------------------------------- # Array @@ -1925,331 +1597,3 @@ def _make_index(length, indices, kind): else: # pragma: no cover raise ValueError("must be block or integer type") return index - - -# ---------------------------------------------------------------------------- -# Accessor - - -class BaseAccessor: - _validation_msg = "Can only use the '.sparse' accessor with Sparse data." 
- - def __init__(self, data=None): - self._parent = data - self._validate(data) - - def _validate(self, data): - raise NotImplementedError - - -@delegate_names( - SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property" -) -class SparseAccessor(BaseAccessor, PandasDelegate): - """ - Accessor for SparseSparse from other sparse matrix data types. - """ - - def _validate(self, data): - if not isinstance(data.dtype, SparseDtype): - raise AttributeError(self._validation_msg) - - def _delegate_property_get(self, name, *args, **kwargs): - return getattr(self._parent.array, name) - - def _delegate_method(self, name, *args, **kwargs): - if name == "from_coo": - return self.from_coo(*args, **kwargs) - elif name == "to_coo": - return self.to_coo(*args, **kwargs) - else: - raise ValueError - - @classmethod - def from_coo(cls, A, dense_index=False): - """ - Create a Series with sparse values from a scipy.sparse.coo_matrix. - - Parameters - ---------- - A : scipy.sparse.coo_matrix - dense_index : bool, default False - If False (default), the SparseSeries index consists of only the - coords of the non-null entries of the original coo_matrix. - If True, the SparseSeries index consists of the full sorted - (row, col) coordinates of the coo_matrix. - - Returns - ------- - s : Series - A Series with sparse values. - - Examples - -------- - >>> from scipy import sparse - >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 2.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> ss = pd.Series.sparse.from_coo(A) - >>> ss - 0 2 1 - 3 2 - 1 0 3 - dtype: float64 - BlockIndex - Block locations: array([0], dtype=int32) - Block lengths: array([3], dtype=int32) - """ - from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series - from pandas import Series - - result = _coo_to_sparse_series(A, dense_index=dense_index) - result = Series(result.array, index=result.index, copy=False) - - return result - - def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): - """ - Create a scipy.sparse.coo_matrix from a Series with MultiIndex. - - Use row_levels and column_levels to determine the row and column - coordinates respectively. row_levels and column_levels are the names - (labels) or numbers of the levels. {row_levels, column_levels} must be - a partition of the MultiIndex level names (or numbers). - - Parameters - ---------- - row_levels : tuple/list - column_levels : tuple/list - sort_labels : bool, default False - Sort the row and column labels before forming the sparse matrix. - - Returns - ------- - y : scipy.sparse.coo_matrix - rows : list (row labels) - columns : list (column labels) - - Examples - -------- - >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - >>> ss = s.astype("Sparse") - >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], - ... column_levels=['C', 'D'], - ... 
sort_labels=True) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 3.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> rows - [(1, 1), (1, 2), (2, 1)] - >>> columns - [('a', 0), ('a', 1), ('b', 0), ('b', 1)] - """ - from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo - - A, rows, columns = _sparse_series_to_coo( - self._parent, row_levels, column_levels, sort_labels=sort_labels - ) - return A, rows, columns - - def to_dense(self): - """ - Convert a Series from sparse values to dense. - - .. versionadded:: 0.25.0 - - Returns - ------- - Series: - A Series with the same values, stored as a dense array. - - Examples - -------- - >>> series = pd.Series(pd.SparseArray([0, 1, 0])) - >>> series - 0 0 - 1 1 - 2 0 - dtype: Sparse[int64, 0] - - >>> series.sparse.to_dense() - 0 0 - 1 1 - 2 0 - dtype: int64 - """ - from pandas import Series - - return Series( - self._parent.array.to_dense(), - index=self._parent.index, - name=self._parent.name, - ) - - -class SparseFrameAccessor(BaseAccessor, PandasDelegate): - """ - DataFrame accessor for sparse data. - - .. versionadded:: 0.25.0 - """ - - def _validate(self, data): - dtypes = data.dtypes - if not all(isinstance(t, SparseDtype) for t in dtypes): - raise AttributeError(self._validation_msg) - - @classmethod - def from_spmatrix(cls, data, index=None, columns=None): - """ - Create a new DataFrame from a scipy sparse matrix. - - .. versionadded:: 0.25.0 - - Parameters - ---------- - data : scipy.sparse.spmatrix - Must be convertible to csc format. - index, columns : Index, optional - Row and column labels to use for the resulting DataFrame. - Defaults to a RangeIndex. - - Returns - ------- - DataFrame - Each column of the DataFrame is stored as a - :class:`SparseArray`. - - Examples - -------- - >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3) - >>> pd.DataFrame.sparse.from_spmatrix(mat) - 0 1 2 - 0 1.0 0.0 0.0 - 1 0.0 1.0 0.0 - 2 0.0 0.0 1.0 - """ - from pandas import DataFrame - - data = data.tocsc() - index, columns = cls._prep_index(data, index, columns) - sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] - data = dict(enumerate(sparrays)) - result = DataFrame(data, index=index) - result.columns = columns - return result - - def to_dense(self): - """ - Convert a DataFrame with sparse values to dense. - - .. versionadded:: 0.25.0 - - Returns - ------- - DataFrame - A DataFrame with the same values stored as dense arrays. - - Examples - -------- - >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) - >>> df.sparse.to_dense() - A - 0 0 - 1 1 - 2 0 - """ - from pandas import DataFrame - - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) - - def to_coo(self): - """ - Return the contents of the frame as a sparse SciPy COO matrix. - - .. versionadded:: 0.25.0 - - Returns - ------- - coo_matrix : scipy.sparse.spmatrix - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - Notes - ----- - The dtype will be the lowest-common-denominator type (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. By numpy.find_common_type convention, mixing int64 and - and uint64 will result in a float64 dtype. 
- """ - import_optional_dependency("scipy") - from scipy.sparse import coo_matrix - - dtype = find_common_type(self._parent.dtypes) - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - - cols, rows, datas = [], [], [] - for col, name in enumerate(self._parent): - s = self._parent[name] - row = s.array.sp_index.to_int_index().indices - cols.append(np.repeat(col, len(row))) - rows.append(row) - datas.append(s.array.sp_values.astype(dtype, copy=False)) - - cols = np.concatenate(cols) - rows = np.concatenate(rows) - datas = np.concatenate(datas) - return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) - - @property - def density(self) -> float: - """ - Ratio of non-sparse points to total (dense) data points - represented in the DataFrame. - """ - return np.mean([column.array.density for _, column in self._parent.items()]) - - @staticmethod - def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase - - N, K = data.shape - if index is None: - index = ibase.default_index(N) - if columns is None: - columns = ibase.default_index(K) - - if len(columns) != K: - raise ValueError( - "Column length mismatch: {columns} vs. {K}".format( - columns=len(columns), K=K - ) - ) - if len(index) != N: - raise ValueError( - "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N) - ) - return index, columns diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py new file mode 100644 index 0000000000000..6fd73ae14fff1 --- /dev/null +++ b/pandas/core/arrays/sparse/dtype.py @@ -0,0 +1,343 @@ +"""Sparse Dtype""" + +import re +from typing import Any + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna, na_value_for_dtype + +from pandas._typing import Dtype + + +@register_extension_dtype +class SparseDtype(ExtensionDtype): + """ + Dtype for data stored in :class:`SparseArray`. + + This dtype implements the pandas ExtensionDtype interface. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 + The dtype of the underlying array storing the non-fill value values. + fill_value : scalar, optional + The scalar value not stored in the SparseArray. By default, this + depends on `dtype`. + + =========== ========== + dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The default value may be overridden by specifying a `fill_value`. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + # We include `_is_na_fill_value` in the metadata to avoid hash collisions + # between SparseDtype(float, 0.0) and SparseDtype(float, nan). + # Without is_na_fill_value in the comparison, those would be equal since + # hash(nan) is (sometimes?) 0. 
+ _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") + + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + + if isinstance(dtype, type(self)): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype): + dtype = np.dtype("object") + + if fill_value is None: + fill_value = na_value_for_dtype(dtype) + + if not is_scalar(fill_value): + raise ValueError( + "fill_value must be a scalar. Got {} instead".format(fill_value) + ) + self._dtype = dtype + self._fill_value = fill_value + + def __hash__(self): + # Python3 doesn't inherit __hash__ when a base class overrides + # __eq__, so we explicitly do it here. + return super().__hash__() + + def __eq__(self, other): + # We have to override __eq__ to handle NA values in _metadata. + # The base class does simple == checks, which fail for NA. + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + + if isinstance(other, type(self)): + subtype = self.subtype == other.subtype + if self._is_na_fill_value: + # this case is complicated by two things: + # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) + # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) + # i.e. we want to treat any floating-point NaN as equal, but + # not a floating-point NaN and a datetime NaT. + fill_value = ( + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) + ) + else: + fill_value = self.fill_value == other.fill_value + + return subtype and fill_value + return False + + @property + def fill_value(self): + """ + The fill value of the array. + + Converting the SparseArray to a dense ndarray will fill the + array with this value. + + .. warning:: + + It's possible to end up with a SparseArray that has ``fill_value`` + values in ``sp_values``. This can occur, for example, when setting + ``SparseArray.fill_value`` directly. + """ + return self._fill_value + + @property + def _is_na_fill_value(self): + return isna(self.fill_value) + + @property + def _is_numeric(self): + return not is_object_dtype(self.subtype) + + @property + def _is_boolean(self): + return is_bool_dtype(self.subtype) + + @property + def kind(self): + """ + The sparse kind. Either 'integer', or 'block'. + """ + return self.subtype.kind + + @property + def type(self): + return self.subtype.type + + @property + def subtype(self): + return self._dtype + + @property + def name(self): + return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value) + + def __repr__(self): + return self.name + + @classmethod + def construct_array_type(cls): + from .array import SparseArray + + return SparseArray + + @classmethod + def construct_from_string(cls, string): + """ + Construct a SparseDtype from a string form. + + Parameters + ---------- + string : str + Can take the following forms. + + string dtype + ================ ============================ + 'int' SparseDtype[np.int64, 0] + 'Sparse' SparseDtype[np.float64, nan] + 'Sparse[int]' SparseDtype[np.int64, 0] + 'Sparse[int, 0]' SparseDtype[np.int64, 0] + ================ ============================ + + It is not possible to specify non-default fill values + with a string. An argument like ``'Sparse[int, 1]'`` + will raise a ``TypeError`` because the default fill value + for integers is 0. 
+
+        Returns
+        -------
+        SparseDtype
+        """
+        msg = "Could not construct SparseDtype from '{}'".format(string)
+        if string.startswith("Sparse"):
+            try:
+                sub_type, has_fill_value = cls._parse_subtype(string)
+            except ValueError:
+                raise TypeError(msg)
+            else:
+                result = SparseDtype(sub_type)
+                msg = (
+                    "Could not construct SparseDtype from '{}'.\n\nIt "
+                    "looks like the fill_value in the string is not "
+                    "the default for the dtype. Non-default fill_values "
+                    "are not supported. Use the 'SparseDtype()' "
+                    "constructor instead."
+                )
+                if has_fill_value and str(result) != string:
+                    raise TypeError(msg.format(string))
+                return result
+        else:
+            raise TypeError(msg)
+
+    @staticmethod
+    def _parse_subtype(dtype):
+        """
+        Parse a string to get the subtype
+
+        Parameters
+        ----------
+        dtype : str
+            A string like
+
+            * Sparse[subtype]
+            * Sparse[subtype, fill_value]
+
+        Returns
+        -------
+        subtype : str
+
+        Raises
+        ------
+        ValueError
+            When the subtype cannot be extracted.
+        """
+        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
+        m = xpr.match(dtype)
+        has_fill_value = False
+        if m:
+            subtype = m.groupdict()["subtype"]
+            has_fill_value = m.groupdict()["fill_value"] or has_fill_value
+        elif dtype == "Sparse":
+            subtype = "float64"
+        else:
+            raise ValueError("Cannot parse {}".format(dtype))
+        return subtype, has_fill_value
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        dtype = getattr(dtype, "dtype", dtype)
+        if isinstance(dtype, str) and dtype.startswith("Sparse"):
+            sub_type, _ = cls._parse_subtype(dtype)
+            dtype = np.dtype(sub_type)
+        elif isinstance(dtype, cls):
+            return True
+        return isinstance(dtype, np.dtype) or dtype == "Sparse"
+
+    def update_dtype(self, dtype):
+        """
+        Convert the SparseDtype to a new dtype.
+
+        This takes care of converting the ``fill_value``.
+
+        Parameters
+        ----------
+        dtype : Union[str, numpy.dtype, SparseDtype]
+            The new dtype to use.
+
+            * For a SparseDtype, it is simply returned
+            * For a NumPy dtype (or str), the current fill value
+              is converted to the new dtype, and a SparseDtype
+              with `dtype` and the new fill value is returned.
+
+        Returns
+        -------
+        SparseDtype
+            A new SparseDtype with the correct `dtype` and fill value
+            for that `dtype`.
+
+        Raises
+        ------
+        ValueError
+            When the current fill value cannot be converted to the
+            new `dtype` (e.g. trying to convert ``np.nan`` to an
+            integer dtype).
+
+        Examples
+        --------
+        >>> SparseDtype(int, 0).update_dtype(float)
+        Sparse[float64, 0.0]
+
+        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
+        Sparse[float64, nan]
+        """
+        cls = type(self)
+        dtype = pandas_dtype(dtype)
+
+        if not isinstance(dtype, cls):
+            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
+            dtype = cls(dtype, fill_value=fill_value)
+
+        return dtype
+
+    @property
+    def _subtype_with_str(self):
+        """
+        Whether the SparseDtype's subtype should be considered ``str``.
+
+        Typically, pandas will store string data in an object-dtype array.
+        When converting values to a dtype, e.g. in ``.astype``, we need to
+        be more specific, we need the actual underlying type.
+
+        Returns
+        -------
+
+        >>> SparseDtype(int, 1)._subtype_with_str
+        dtype('int64')
+
+        >>> SparseDtype(object, 1)._subtype_with_str
+        dtype('O')
+
+        >>> dtype = SparseDtype(str, '')
+        >>> dtype.subtype
+        dtype('O')
+
+        >>> dtype._subtype_with_str
+        str
+        """
+        if isinstance(self.fill_value, str):
+            return type(self.fill_value)
+        return self.subtype
diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
similarity index 100%
rename from pandas/core/sparse/scipy_sparse.py
rename to pandas/core/arrays/sparse/scipy_sparse.py
diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
index 071a8db707b69..f1d2803ce5505 100644
--- a/pandas/tests/arrays/sparse/test_arithmetics.py
+++ b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas.core import ops
-from pandas.core.sparse.api import SparseDtype
+from pandas.core.arrays.sparse import SparseDtype
 import pandas.util.testing as tm
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index 5d5ee565c7891..c02d8ae4e7429 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -10,7 +10,7 @@
 import pandas as pd
 from pandas import isna
-from pandas.core.sparse.api import SparseArray, SparseDtype
+from pandas.core.arrays.sparse import SparseArray, SparseDtype
 import pandas.util.testing as tm
 from pandas.util.testing import assert_almost_equal
diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py
similarity index 100%
rename from pandas/tests/sparse/test_combine_concat.py
rename to pandas/tests/arrays/sparse/test_combine_concat.py
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index db8f62962f0b0..aa8d2afca11e6 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -2,7 +2,7 @@
 import pytest
 import pandas as pd
-from pandas.core.sparse.api import SparseDtype
+from pandas.core.arrays.sparse import SparseDtype
 @pytest.mark.parametrize(
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 3288c9c584565..036b0213973d6 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -30,7 +30,7 @@
 import pandas as pd
 from pandas import Categorical, CategoricalIndex, IntervalIndex, Series, date_range
-from pandas.core.sparse.api import SparseDtype
+from pandas.core.arrays.sparse import SparseDtype
 import pandas.util.testing as tm
diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
index 9d08981d39894..5e80c317a587b 100644
--- a/pandas/tests/reshape/test_reshape.py
+++ b/pandas/tests/reshape/test_reshape.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from pandas import Categorical, DataFrame, Index, Series, get_dummies
-from pandas.core.sparse.api import SparseArray, SparseDtype
+from pandas.core.arrays.sparse import SparseArray, SparseDtype
 import pandas.util.testing as tm
 from pandas.util.testing import assert_frame_equal
diff --git a/pandas/tests/sparse/__init__.py b/pandas/tests/sparse/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
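For reviewers, a short usage sketch of the accessor-based API that replaces SparseSeries, SparseDataFrame and DataFrame.to_sparse. This is illustrative only and not part of the patch; it assumes pandas >= 0.25 with SciPy installed, and the internal import near the end only resolves once this patch is applied. Exact reprs may differ across versions.

import numpy as np
import pandas as pd
import scipy.sparse

# Replacement for SparseDataFrame / DataFrame.to_sparse: build a DataFrame
# whose columns are backed by SparseArray.
mat = scipy.sparse.eye(3)
df = pd.DataFrame.sparse.from_spmatrix(mat, columns=["a", "b", "c"])
print(df.dtypes)          # each column is Sparse[float64, 0.0]
print(df.sparse.density)  # fraction of stored (non-fill) values

# Round-trip back to dense values or to a scipy COO matrix.
dense = df.sparse.to_dense()
coo = df.sparse.to_coo()

# Series-level accessor: build a Series with sparse values from a coo_matrix.
A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
ss = pd.Series.sparse.from_coo(A)
print(ss)

# After this patch, internal pandas code imports from the new package location.
# Third-party code should prefer the public pd.arrays.SparseArray instead.
from pandas.core.arrays.sparse import SparseArray, SparseDtype  # internal path
arr = SparseArray([0, 0, 1, 2], fill_value=0)
print(SparseDtype("float64"))  # Sparse[float64, nan]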