From 24f48c3b380f6ec0a21944d553d1fa78ddd9d107 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 11 Mar 2019 22:05:46 -0500 Subject: [PATCH 01/13] API: DataFrame.sparse accessor Closes https://github.com/pandas-dev/pandas/issues/25681 --- doc/source/reference/frame.rst | 23 +++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/sparse.py | 164 +++++++++++++++++++- pandas/core/frame.py | 2 + pandas/core/sparse/frame.py | 27 +--- pandas/tests/arrays/sparse/test_accessor.py | 76 +++++++++ 6 files changed, 265 insertions(+), 28 deletions(-) create mode 100644 pandas/tests/arrays/sparse/test_accessor.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 568acd5207bd1..4f256e86710ae 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.``. DataFrame.boxplot DataFrame.hist + +.. _api.frame.sparse: + +Sparse Accessor +~~~~~~~~~~~~~~~ + +Sparse-dtype specific methods and attributes are provided under the +``DataFrame.sparse`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + DataFrame.sparse.density + +.. autosummary:: + :toctree: api/ + + DataFrame.sparse.from_spmatrix + DataFrame.sparse.to_coo + DataFrame.sparse.to_dense + + Serialization / IO / Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 284943cf49070..1efc97a38c282 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -26,6 +26,7 @@ Other Enhancements - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) +- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index fd7149edc8d7c..459c9866d0466 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype): new._dtype = dtype return new + @classmethod + def from_spmatrix(cls, data): + """ + Create a SparseArray from a scipy.sparse matrix. + + Parameters + ---------- + data : scipy.sparse.sp_matrix + This should be a 2-D SciPy sparse where the size + of the second dimension is 1. In other words, a + sparse matrix with a single column. + + Returns + ------- + SparseArray. + """ + assert data.ndim == 2 + + length, ncol = data.shape + + assert ncol == 1 + + arr = data.data + idx, _ = data.nonzero() + zero = np.array(0, dtype=arr.dtype).item() + dtype = SparseDtype(arr.dtype, zero) + index = IntIndex(length, idx) + + return cls._simple_new(arr, index, dtype) + def __array__(self, dtype=None, copy=True): fill_value = self.fill_value @@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind): # ---------------------------------------------------------------------------- # Accessor +_validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + @delegate_names(SparseArray, ['npoints', 'density', 'fill_value', 'sp_values'], typ='property') @@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate): """ def __init__(self, data=None): - self._validate(data) # Store the Series since we need that for to_coo self._parent = data + self._validate(data) - @staticmethod - def _validate(data): + def _validate(self, data): if not isinstance(data.dtype, SparseDtype): - msg = "Can only use the '.sparse' accessor with Sparse data." - raise AttributeError(msg) + raise AttributeError(_validation_msg) def _delegate_property_get(self, name, *args, **kwargs): return getattr(self._parent.values, name) @@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): column_levels, sort_labels=sort_labels) return A, rows, columns + + def to_dense(self): + from pandas import Series + return Series(self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name) + + +class SparseFrameAccessor(PandasDelegate): + + def __init__(self, data=None): + # Store the Series since we need that for to_coo + self._parent = data + self._validate(data) + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(_validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + """ + Create a new DataFrame from a scipy sparse matrix. + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + sparrays = [ + SparseArray.from_spmatrix(data[:, i]) + for i in range(data.shape[1]) + ] + data = dict(zip(columns, sparrays)) + return DataFrame(data, index=index) + + def to_dense(self): + """ + Convert to dense DataFrame + + Returns + ------- + df : DataFrame + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() + for k, v in compat.iteritems(self._parent)} + return DataFrame(data, + index=self._parent.index, + columns=self._parent.columns) + + def to_coo(self): + try: + from scipy.sparse import coo_matrix + except ImportError: + raise ImportError('Scipy is not installed') + + dtype = find_common_type(self._parent.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, datas = [], [], [] + for col, name in enumerate(self._parent): + s = self._parent[name] + row = s.array.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.array.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) + + @property + def density(self): + """ + Ratio of non-sparse points to total (dense) data points + represented in the DataFrame. + """ + return np.mean([column.array.density + for _, column in self._parent.iteritems()]) + + @staticmethod + def _prep_index(data, index, columns): + import pandas.core.indexes.base as ibase + + N, K = data.shape + if index is None: + index = ibase.default_index(N) + if columns is None: + columns = ibase.default_index(K) + + if len(columns) != K: + raise ValueError('Column length mismatch: {columns} vs. {K}' + .format(columns=len(columns), K=K)) + if len(index) != N: + raise ValueError('Index length mismatch: {index} vs. {N}' + .format(index=len(index), N=N)) + return index, columns diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3996728a1cc90..5f6259da684a7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,6 +36,7 @@ PY36, raise_with_traceback, Iterator, string_and_binary_types) from pandas.compat.numpy import function as nv +from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -8009,6 +8010,7 @@ def isin(self, values): plot = CachedAccessor("plot", gfx.FramePlotMethods) hist = gfx.hist_frame boxplot = gfx.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 2d54b82a3c844..14113538e2e95 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -14,12 +14,12 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import find_common_type, maybe_upcast +from pandas.core.dtypes.cast import maybe_upcast from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseArray import pandas.core.common as com from pandas.core.frame import DataFrame import pandas.core.generic as generic @@ -271,27 +271,8 @@ def to_coo(self): float32. By numpy.find_common_type convention, mixing int64 and and uint64 will result in a float64 dtype. """ - try: - from scipy.sparse import coo_matrix - except ImportError: - raise ImportError('Scipy is not installed') - - dtype = find_common_type(self.dtypes) - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - - cols, rows, datas = [], [], [] - for col, name in enumerate(self): - s = self[name] - row = s.sp_index.to_int_index().indices - cols.append(np.repeat(col, len(row))) - rows.append(row) - datas.append(s.sp_values.astype(dtype, copy=False)) - - cols = np.concatenate(cols) - rows = np.concatenate(rows) - datas = np.concatenate(datas) - return coo_matrix((datas, (rows, cols)), shape=self.shape) + from pandas.core.arrays.sparse import SparseFrameAccessor + return SparseFrameAccessor(self).to_coo() def __array_wrap__(self, result): return self._constructor( diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py new file mode 100644 index 0000000000000..af53f74dea962 --- /dev/null +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -0,0 +1,76 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +class TestSeriesAccessor(object): + # TODO: collect other accessor tests + def test_to_dense(self): + s = pd.Series([0, 1, 0, 10], dtype='Sparse[int]') + result = s.sparse.to_dense() + expected = pd.Series([0, 1, 0, 10]) + tm.assert_series_equal(result, expected) + + +class TestFrameAccessor(object): + @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) + @pytest.mark.parametrize("labels", [ + None, + list(string.ascii_letters[:10]), + ]) + @pytest.mark.parametrize('dtype', ['float64', 'int64']) + def test_from_spmatrix(self, format, labels, dtype): + pytest.importorskip("scipy") + import scipy.sparse + sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + + mat = scipy.sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix( + mat, index=labels, columns=labels + ) + expected = pd.DataFrame( + np.eye(10, dtype=dtype), + index=labels, + columns=labels, + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + + def test_to_coo(self): + pytest.importorskip("scipy") + import scipy.sparse + + df = pd.DataFrame({ + "A": [0, 1, 0], + "B": [1, 0, 0], + }, dtype='Sparse[int64, 0]') + result = df.sparse.to_coo() + expected = scipy.sparse.coo_matrix(np.asarray(df)) + assert (result != expected).nnz == 0 + + def test_to_dense(self): + df = pd.DataFrame({ + "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)), + "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)), + "C": pd.SparseArray([1., 0.], + dtype=pd.SparseDtype('float64', 0.0)), + }, index=['b', 'a']) + result = df.sparse.to_dense() + expected = pd.DataFrame({ + 'A': [1, 0], + 'B': [1, 0], + 'C': [1.0, 0.0], + }, index=['b', 'a']) + tm.assert_frame_equal(result, expected) + + def test_density(self): + df = pd.DataFrame({ + 'A': pd.SparseArray([1, 0, 2, 1], fill_value=0), + 'B': pd.SparseArray([0, 1, 1, 1], fill_value=0), + }) + res = df.sparse.density + expected = 0.75 + assert res == expected From 6f619b5b8aee971f4d20e5dfce6f592c29ef9776 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Mar 2019 13:38:48 -0500 Subject: [PATCH 02/13] 32-bit compat --- pandas/tests/arrays/sparse/test_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index af53f74dea962..2a5cb0116a6fd 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -10,7 +10,7 @@ class TestSeriesAccessor(object): # TODO: collect other accessor tests def test_to_dense(self): - s = pd.Series([0, 1, 0, 10], dtype='Sparse[int]') + s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') result = s.sparse.to_dense() expected = pd.Series([0, 1, 0, 10]) tm.assert_series_equal(result, expected) From 94a7bafc33af00f0202c46eb07248c5e397aa77f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Mar 2019 14:22:48 -0500 Subject: [PATCH 03/13] fixups --- pandas/core/arrays/sparse.py | 53 +++++++++++++-------- pandas/core/sparse/frame.py | 23 ++------- pandas/tests/arrays/sparse/test_accessor.py | 14 ++++-- pandas/tests/arrays/sparse/test_array.py | 18 +++++++ 4 files changed, 64 insertions(+), 44 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 459c9866d0466..d1f3f88ca0b5f 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -686,22 +686,34 @@ def from_spmatrix(cls, data): Parameters ---------- data : scipy.sparse.sp_matrix - This should be a 2-D SciPy sparse where the size + This should be a SciPy sparse matrix where the size of the second dimension is 1. In other words, a sparse matrix with a single column. Returns ------- - SparseArray. - """ - assert data.ndim == 2 + SparseArray + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.coo_matrix((4, 1)) + >>> pd.SparseArray.from_spmatrix(mat) + [0.0, 0.0, 0.0, 0.0] + Fill: 0.0 + IntIndex + Indices: array([], dtype=int32) + """ length, ncol = data.shape - assert ncol == 1 + if ncol != 1: + raise ValueError( + "'data' must have a single column, not '{}'".format(ncol) + ) arr = data.data idx, _ = data.nonzero() + idx.sort() zero = np.array(0, dtype=arr.dtype).item() dtype = SparseDtype(arr.dtype, zero) index = IntIndex(length, idx) @@ -1921,28 +1933,32 @@ def _make_index(length, indices, kind): # ---------------------------------------------------------------------------- # Accessor -_validation_msg = "Can only use the '.sparse' accessor with Sparse data." + +class BaseAccessor(object): + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError @delegate_names(SparseArray, ['npoints', 'density', 'fill_value', 'sp_values'], typ='property') -class SparseAccessor(PandasDelegate): +class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for SparseSparse from other sparse matrix data types. """ - def __init__(self, data=None): - # Store the Series since we need that for to_coo - self._parent = data - self._validate(data) - def _validate(self, data): if not isinstance(data.dtype, SparseDtype): - raise AttributeError(_validation_msg) + raise AttributeError(self._validation_msg) def _delegate_property_get(self, name, *args, **kwargs): - return getattr(self._parent.values, name) + return getattr(self._parent.array, name) def _delegate_method(self, name, *args, **kwargs): if name == 'from_coo': @@ -2064,17 +2080,12 @@ def to_dense(self): name=self._parent.name) -class SparseFrameAccessor(PandasDelegate): - - def __init__(self, data=None): - # Store the Series since we need that for to_coo - self._parent = data - self._validate(data) +class SparseFrameAccessor(BaseAccessor, PandasDelegate): def _validate(self, data): dtypes = data.dtypes if not all(isinstance(t, SparseDtype) for t in dtypes): - raise AttributeError(_validation_msg) + raise AttributeError(self._validation_msg) @classmethod def from_spmatrix(cls, data, index=None, columns=None): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 14113538e2e95..574d6fce7d5d6 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -19,12 +19,11 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.sparse import SparseArray, SparseFrameAccessor import pandas.core.common as com from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index -import pandas.core.indexes.base as ibase from pandas.core.internals import ( BlockManager, create_block_manager_from_arrays) from pandas.core.internals.construction import extract_index, prep_ndarray @@ -198,7 +197,7 @@ def _init_matrix(self, data, index, columns, dtype=None): Init self from ndarray or list of lists. """ data = prep_ndarray(data, copy=False) - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) @@ -207,7 +206,7 @@ def _init_spmatrix(self, data, index, columns, dtype=None, """ Init self from scipy.sparse matrix. """ - index, columns = self._prep_index(data, index, columns) + index, columns = SparseFrameAccessor._prep_index(data, index, columns) data = data.tocoo() N = len(index) @@ -234,21 +233,6 @@ def _init_spmatrix(self, data, index, columns, dtype=None, return self._init_dict(sdict, index, columns, dtype) - def _prep_index(self, data, index, columns): - N, K = data.shape - if index is None: - index = ibase.default_index(N) - if columns is None: - columns = ibase.default_index(K) - - if len(columns) != K: - raise ValueError('Column length mismatch: {columns} vs. {K}' - .format(columns=len(columns), K=K)) - if len(index) != N: - raise ValueError('Index length mismatch: {index} vs. {N}' - .format(index=len(index), N=N)) - return index, columns - def to_coo(self): """ Return the contents of the frame as a sparse SciPy COO matrix. @@ -271,7 +255,6 @@ def to_coo(self): float32. By numpy.find_common_type convention, mixing int64 and and uint64 will result in a float64 dtype. """ - from pandas.core.arrays.sparse import SparseFrameAccessor return SparseFrameAccessor(self).to_coo() def __array_wrap__(self, result): diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 2a5cb0116a6fd..a7ed796da3d3d 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -5,10 +5,11 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td class TestSeriesAccessor(object): - # TODO: collect other accessor tests + # TODO: collect other Series accessor tests def test_to_dense(self): s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') result = s.sparse.to_dense() @@ -17,15 +18,22 @@ def test_to_dense(self): class TestFrameAccessor(object): + + def test_accessor_raises(self): + df = pd.DataFrame({"A": [0, 1]}) + with pytest.raises(AttributeError, match='sparse'): + df.sparse + @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) @pytest.mark.parametrize("labels", [ None, list(string.ascii_letters[:10]), ]) @pytest.mark.parametrize('dtype', ['float64', 'int64']) + @td.skip_if_no_scipy def test_from_spmatrix(self, format, labels, dtype): - pytest.importorskip("scipy") import scipy.sparse + sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) @@ -39,8 +47,8 @@ def test_from_spmatrix(self, format, labels, dtype): ).astype(sp_dtype) tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy def test_to_coo(self): - pytest.importorskip("scipy") import scipy.sparse df = pd.DataFrame({ diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 11b5bcf702e75..0a08dcfe07746 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -172,6 +172,24 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value + @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) + def test_from_spmatrix(self, format): + pytest.importorskip('scipy') + import scipy.sparse + + mat = scipy.sparse.random(10, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + np.testing.assert_array_equal(mat.data, result.sp_values) + + def test_from_spmatrix_raises(self): + pytest.importorskip('scipy') + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format='csc') + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + @pytest.mark.parametrize('scalar,dtype', [ (False, SparseDtype(bool, False)), (0.0, SparseDtype('float64', 0)), From f433be85712e7e0753beaa99c074d56a16552359 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Mar 2019 20:54:07 -0500 Subject: [PATCH 04/13] lint --- pandas/tests/arrays/sparse/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 0a08dcfe07746..1050ef8146037 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -179,7 +179,7 @@ def test_from_spmatrix(self, format): mat = scipy.sparse.random(10, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) - np.testing.assert_array_equal(mat.data, result.sp_values) + tm.assert_numpy_array_equal(mat.data, result.sp_values) def test_from_spmatrix_raises(self): pytest.importorskip('scipy') From 0922296d5852956cc4bbe6f3995507daadb32257 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Mar 2019 21:35:51 -0500 Subject: [PATCH 05/13] updates --- pandas/core/arrays/sparse.py | 78 +++++++++++++++++++++++- pandas/core/sparse/frame.py | 33 +--------- pandas/tests/arrays/sparse/test_array.py | 10 ++- 3 files changed, 86 insertions(+), 35 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index d1f3f88ca0b5f..e2fb4a67f2d35 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -683,6 +683,8 @@ def from_spmatrix(cls, data): """ Create a SparseArray from a scipy.sparse matrix. + .. versionadded:: 0.25.0 + Parameters ---------- data : scipy.sparse.sp_matrix @@ -711,9 +713,14 @@ def from_spmatrix(cls, data): "'data' must have a single column, not '{}'".format(ncol) ) + # our sparse index classes require that the positions be strictly + # increasing. So we need to sort loc, and arr accordingly. arr = data.data idx, _ = data.nonzero() + loc = np.argsort(idx) + arr = arr.take(loc) idx.sort() + zero = np.array(0, dtype=arr.dtype).item() dtype = SparseDtype(arr.dtype, zero) index = IntIndex(length, idx) @@ -2074,6 +2081,31 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): return A, rows, columns def to_dense(self): + """ + Convert a Series from sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series: + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ from pandas import Series return Series(self._parent.array.to_dense(), index=self._parent.index, @@ -2081,6 +2113,11 @@ def to_dense(self): class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + .. versionadded :: 0.25.0 + """ def _validate(self, data): dtypes = data.dtypes @@ -2092,6 +2129,8 @@ def from_spmatrix(cls, data, index=None, columns=None): """ Create a new DataFrame from a scipy sparse matrix. + .. versionadded:: 0.25.0 + Parameters ---------- data : scipy.sparse.spmatrix @@ -2103,6 +2142,8 @@ def from_spmatrix(cls, data, index=None, columns=None): Returns ------- DataFrame + Each column of the DataFrame is stored as a + :class:`SparseArray`. Examples -------- @@ -2127,11 +2168,23 @@ def from_spmatrix(cls, data, index=None, columns=None): def to_dense(self): """ - Convert to dense DataFrame + Convert a DataFrame with sparse values to dense. + + .. versionadded:: 0.25.0 Returns ------- - df : DataFrame + DataFrame + A DataFrame with the same values stored as dense arrays. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 """ from pandas import DataFrame @@ -2142,6 +2195,27 @@ def to_dense(self): columns=self._parent.columns) def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.20.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + and uint64 will result in a float64 dtype. + """ try: from scipy.sparse import coo_matrix except ImportError: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 574d6fce7d5d6..9860f87e3c06d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -233,28 +233,8 @@ def _init_spmatrix(self, data, index, columns, dtype=None, return self._init_dict(sdict, index, columns, dtype) + @Appender(SparseFrameAccessor.to_coo.__doc__) def to_coo(self): - """ - Return the contents of the frame as a sparse SciPy COO matrix. - - .. versionadded:: 0.20.0 - - Returns - ------- - coo_matrix : scipy.sparse.spmatrix - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - Notes - ----- - The dtype will be the lowest-common-denominator type (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. By numpy.find_common_type convention, mixing int64 and - and uint64 will result in a float64 dtype. - """ return SparseFrameAccessor(self).to_coo() def __array_wrap__(self, result): @@ -296,16 +276,9 @@ def _unpickle_sparse_frame_compat(self, state): self._default_fill_value = fv self._default_kind = kind + @Appender(SparseFrameAccessor.to_dense.__doc__) def to_dense(self): - """ - Convert to dense DataFrame - - Returns - ------- - df : DataFrame - """ - data = {k: v.to_dense() for k, v in compat.iteritems(self)} - return DataFrame(data, index=self.index, columns=self.columns) + return SparseFrameAccessor(self).to_dense() def _apply_columns(self, func): """ diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 1050ef8146037..510ab621b9396 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -173,13 +173,17 @@ def test_constructor_inferred_fill_value(self, data, fill_value): assert result == fill_value @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) - def test_from_spmatrix(self, format): + @pytest.mark.parametrize('size', [0, 10]) + def test_from_spmatrix(self, size, format): pytest.importorskip('scipy') import scipy.sparse - mat = scipy.sparse.random(10, 1, density=0.5, format=format) + mat = scipy.sparse.random(size, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) - tm.assert_numpy_array_equal(mat.data, result.sp_values) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) def test_from_spmatrix_raises(self): pytest.importorskip('scipy') From 3005aed6b9877217a3a45785aa246c2a8b67fd22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Mar 2019 06:26:32 -0500 Subject: [PATCH 06/13] isort? --- pandas/tests/arrays/sparse/test_accessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index a7ed796da3d3d..d43addc45082c 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -3,9 +3,10 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas.util.testing as tm -import pandas.util._test_decorators as td class TestSeriesAccessor(object): From 9cbcccd5d40b45bed877da884e57435d10c67e42 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Apr 2019 14:07:11 -0500 Subject: [PATCH 07/13] compat --- pandas/core/arrays/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 7a4b8a6fa27da..f1080cff80197 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2196,7 +2196,7 @@ def to_dense(self): from pandas import DataFrame data = {k: v.array.to_dense() - for k, v in compat.iteritems(self._parent)} + for k, v in self._parent.iteritems()} return DataFrame(data, index=self._parent.index, columns=self._parent.columns) From 663a87e74782c59cb25a1b63b6351d9873fb5843 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Apr 2019 05:33:28 -0500 Subject: [PATCH 08/13] compat --- pandas/tests/arrays/sparse/test_array.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index eef7bf78c54b2..4738b5cfe98a7 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -6,6 +6,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import _np_version_under1p16 import pandas.util._test_decorators as td import pandas as pd @@ -172,7 +173,12 @@ def test_constructor_inferred_fill_value(self, data, fill_value): assert result == fill_value @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) - @pytest.mark.parametrize('size', [0, 10]) + @pytest.mark.parametrize('size', [ + pytest.param(0, + marks=pytest.mark.skipif(_np_version_under1p16, + reason='NumPy-11383')), + 10 + ]) def test_from_spmatrix(self, size, format): pytest.importorskip('scipy') import scipy.sparse From 3f6a5aa93f30c3bc163e833457349b82bc39d08e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Apr 2019 14:17:53 -0500 Subject: [PATCH 09/13] lint --- pandas/core/arrays/sparse.py | 2 +- pandas/tests/arrays/sparse/test_accessor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index f1080cff80197..0edee8b0e2942 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1948,7 +1948,7 @@ def _make_index(length, indices, kind): # Accessor -class BaseAccessor(object): +class BaseAccessor: _validation_msg = "Can only use the '.sparse' accessor with Sparse data." def __init__(self, data=None): diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index d43addc45082c..4305761706376 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm -class TestSeriesAccessor(object): +class TestSeriesAccessor: # TODO: collect other Series accessor tests def test_to_dense(self): s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') @@ -18,7 +18,7 @@ def test_to_dense(self): tm.assert_series_equal(result, expected) -class TestFrameAccessor(object): +class TestFrameAccessor: def test_accessor_raises(self): df = pd.DataFrame({"A": [0, 1]}) From 8a46ef44c02ae3d90235a5568403603697fb87c0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Apr 2019 10:28:28 -0500 Subject: [PATCH 10/13] special columns --- pandas/core/arrays/sparse.py | 6 ++++-- pandas/tests/arrays/sparse/test_accessor.py | 16 ++++++++++++++++ pandas/tests/arrays/sparse/test_array.py | 11 ++++------- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index be5574fe0a89a..df50d63539fa7 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2178,8 +2178,10 @@ def from_spmatrix(cls, data, index=None, columns=None): SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1]) ] - data = dict(zip(columns, sparrays)) - return DataFrame(data, index=index) + data = dict(enumerate(sparrays)) + result = DataFrame(data, index=index) + result.columns = columns + return result def to_dense(self): """ diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 4305761706376..c6e29770a9fd4 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -48,6 +48,22 @@ def test_from_spmatrix(self, format, labels, dtype): ).astype(sp_dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("columns", [ + ['a', 'b'], + pd.MultiIndex.from_product([['A'], ['a', 'b']]), + ['a', 'a'], + ]) + def test_from_spmatrix_columns(self, columns): + sparse = pytest.importorskip('scipy.sparse') + dtype = pd.SparseDtype('float64', 0.0) + + mat = sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) + expected = pd.DataFrame( + mat.toarray(), columns=columns + ).astype(dtype) + tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy def test_to_coo(self): import scipy.sparse diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 1ef45dc068a36..5d6a516f2441d 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -180,10 +180,9 @@ def test_constructor_inferred_fill_value(self, data, fill_value): 10 ]) def test_from_spmatrix(self, size, format): - pytest.importorskip('scipy') - import scipy.sparse + sparse = pytest.importorskip('scipy.sparse') - mat = scipy.sparse.random(size, 1, density=0.5, format=format) + mat = sparse.random(size, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) result = np.asarray(result) @@ -191,10 +190,8 @@ def test_from_spmatrix(self, size, format): tm.assert_numpy_array_equal(result, expected) def test_from_spmatrix_raises(self): - pytest.importorskip('scipy') - import scipy.sparse - - mat = scipy.sparse.eye(5, 4, format='csc') + sparse = pytest.importorskip('scipy.sparse') + mat = sparse.eye(5, 4, format='csc') with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) From 727625e8b549cbd30e720628a4fdbe17c0e944fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Apr 2019 10:30:41 -0500 Subject: [PATCH 11/13] fixup --- pandas/tests/arrays/sparse/test_accessor.py | 5 +++-- pandas/tests/arrays/sparse/test_array.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index c6e29770a9fd4..8ef66ea4bdeb7 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -54,10 +54,11 @@ def test_from_spmatrix(self, format, labels, dtype): ['a', 'a'], ]) def test_from_spmatrix_columns(self, columns): - sparse = pytest.importorskip('scipy.sparse') + import scipy.sparse + dtype = pd.SparseDtype('float64', 0.0) - mat = sparse.random(10, 2, density=0.5) + mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) expected = pd.DataFrame( mat.toarray(), columns=columns diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 5d6a516f2441d..9611b43c15728 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -189,9 +189,11 @@ def test_from_spmatrix(self, size, format): expected = mat.toarray().ravel() tm.assert_numpy_array_equal(result, expected) + @td.skip_if_no_scipy def test_from_spmatrix_raises(self): - sparse = pytest.importorskip('scipy.sparse') - mat = sparse.eye(5, 4, format='csc') + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format='csc') with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) @@ -1109,27 +1111,29 @@ def test_get_attributes(self, attr): expected = getattr(arr, attr) assert result == expected + @td.skip_if_no_scipy def test_from_coo(self): - sparse = pytest.importorskip("scipy.sparse") + import scipy.sparse row = [0, 3, 1, 0] col = [0, 3, 1, 2] data = [4, 5, 7, 9] - sp_array = sparse.coo_matrix((data, (row, col))) + sp_array = scipy.sparse.coo_matrix((data, (row, col))) result = pd.Series.sparse.from_coo(sp_array) index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]') tm.assert_series_equal(result, expected) + @td.skip_if_no_scipy def test_to_coo(self): - sparse = pytest.importorskip("scipy.sparse") + import scipy.sparse ser = pd.Series([1, 2, 3], index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=['a', 'b']), dtype='Sparse[int]') A, _, _ = ser.sparse.to_coo() - assert isinstance(A, sparse.coo.coo_matrix) + assert isinstance(A, scipy.sparse.coo.coo_matrix) def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3]) From 5890c2854b24e79a6009b76b0ddb1da163142bf2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Apr 2019 13:32:41 -0500 Subject: [PATCH 12/13] fixup --- pandas/tests/arrays/sparse/test_accessor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 8ef66ea4bdeb7..676f578dd2acc 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -53,6 +53,7 @@ def test_from_spmatrix(self, format, labels, dtype): pd.MultiIndex.from_product([['A'], ['a', 'b']]), ['a', 'a'], ]) + @td.skip_if_no_scipy def test_from_spmatrix_columns(self, columns): import scipy.sparse From f23fa521e9ffb21dcbb2b2fa016d8a32e86867c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 May 2019 21:27:38 -0500 Subject: [PATCH 13/13] fixups --- pandas/core/arrays/sparse.py | 8 ++++---- pandas/tests/arrays/sparse/test_array.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index fc0eb4fb7c5b3..66ccd6848864d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -2206,7 +2206,7 @@ def to_dense(self): from pandas import DataFrame data = {k: v.array.to_dense() - for k, v in self._parent.iteritems()} + for k, v in self._parent.items()} return DataFrame(data, index=self._parent.index, columns=self._parent.columns) @@ -2215,7 +2215,7 @@ def to_coo(self): """ Return the contents of the frame as a sparse SciPy COO matrix. - .. versionadded:: 0.20.0 + .. versionadded:: 0.25.0 Returns ------- @@ -2256,13 +2256,13 @@ def to_coo(self): return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) @property - def density(self): + def density(self) -> float: """ Ratio of non-sparse points to total (dense) data points represented in the DataFrame. """ return np.mean([column.array.density - for _, column in self._parent.iteritems()]) + for _, column in self._parent.items()]) @staticmethod def _prep_index(data, index, columns): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 9611b43c15728..e09f4e2ccc59c 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -179,10 +179,11 @@ def test_constructor_inferred_fill_value(self, data, fill_value): reason='NumPy-11383')), 10 ]) + @td.skip_if_no_scipy def test_from_spmatrix(self, size, format): - sparse = pytest.importorskip('scipy.sparse') + import scipy.sparse - mat = sparse.random(size, 1, density=0.5, format=format) + mat = scipy.sparse.random(size, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) result = np.asarray(result)