From e72e59405b51f663ff412017386dfc789267ca19 Mon Sep 17 00:00:00 2001
From: Kernc
Date: Fri, 24 Feb 2017 11:09:30 +0100
Subject: [PATCH 1/2] ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame

---
 doc/source/api.rst                   |   7 ++
 doc/source/sparse.rst                |  27 ++++++-
 doc/source/whatsnew/v0.20.0.txt      |  24 ++++++
 pandas/sparse/array.py               |   9 ++-
 pandas/sparse/frame.py               | 107 ++++++++++++++++++++++-----
 pandas/tests/sparse/common.py        |  10 +++
 pandas/tests/sparse/test_frame.py    |  62 ++++++++++++++++
 pandas/tests/types/test_inference.py |   9 +++
 pandas/types/common.py               |  14 ++++
 pandas/util/testing.py               |   5 ++
 10 files changed, 254 insertions(+), 20 deletions(-)
 create mode 100644 pandas/tests/sparse/common.py

diff --git a/doc/source/api.rst b/doc/source/api.rst
index fbce64df84859..137ea520d82e5 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -1031,6 +1031,13 @@ Serialization / IO / Conversion
    DataFrame.to_string
    DataFrame.to_clipboard
 
+Sparse methods
+~~~~~~~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   SparseDataFrame.to_coo
+
 .. _api.panel:
 
 Panel
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
index 2bc5d3f6dd0f5..fcd1f72819f61 100644
--- a/doc/source/sparse.rst
+++ b/doc/source/sparse.rst
@@ -186,9 +186,32 @@ the correct dense result.
 Interaction with scipy.sparse
 -----------------------------
 
-Experimental api to transform between sparse pandas and scipy.sparse structures.
+.. versionadded:: 0.20.0
 
-A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
+Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
+
+.. ipython:: python
+
+    from scipy.sparse import csr_matrix
+
+    arr = np.random.random(size=(1000, 5))
+    arr[arr < .9] = 0
+
+    sp_arr = csr_matrix(arr)
+    sp_arr
+
+    sdf = pd.SparseDataFrame(sp_arr)
+    sdf
+
+All sparse formats are supported, but matrices that aren't in :mod:`COOrdinate <scipy.sparse>` format will be converted to it, copying the data as needed. To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
+
+.. ipython:: python
+
+    sdf.to_coo()
+
+.. versionadded:: 0.16.0
+
+Additionally, a :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
 
 The method requires a ``MultiIndex`` with two or more levels.
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ad7571662b8f4..569620305a2ec 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -184,6 +184,30 @@ You must enable this by setting the ``display.html.table_schema`` option to True
 .. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
 .. _nteract: http://nteract.io/
 
+.. _whatsnew_0200.enhancements.scipy_sparse:
+
+SciPy sparse matrix from/to SparseDataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. See the :ref:`documentation ` for more information. (:issue:`4343`)
+
+All sparse formats are supported, but matrices that aren't in :mod:`COOrdinate <scipy.sparse>` format will be converted to it, copying the data as needed.
+
+..
ipython:: python + + from scipy.sparse import csr_matrix + arr = np.random.random(size=(1000, 5)) + arr[arr < .9] = 0 + sp_arr = csr_matrix(arr) + sp_arr + sdf = pd.SparseDataFrame(sp_arr) + sdf + +To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you can use: + +.. ipython:: python + + sdf.to_coo() + .. _whatsnew_0200.enhancements.other: Other enhancements diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 762b6d869eae0..5f4c07971d37e 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -20,6 +20,7 @@ is_integer_dtype, is_bool_dtype, is_list_like, + is_string_dtype, is_scalar, is_dtype_equal) from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, _astype_nansafe, _find_common_type) @@ -769,6 +770,12 @@ def make_sparse(arr, kind='block', fill_value=None): if isnull(fill_value): mask = notnull(arr) else: + # For str arrays in NumPy 1.12.0, operator!= below isn't + # element-wise but just returns False if fill_value is not str, + # so cast to object comparison to be safe + if is_string_dtype(arr): + arr = arr.astype(object) + mask = arr != fill_value length = len(arr) @@ -776,7 +783,7 @@ def make_sparse(arr, kind='block', fill_value=None): # the arr is a SparseArray indices = mask.sp_index.indices else: - indices = np.arange(length, dtype=np.int32)[mask] + indices = mask.nonzero()[0].astype(np.int32) index = _make_index(length, indices, kind) sparsified_values = arr[mask] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 61b8434b0ea09..a21f64f524a0a 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -11,8 +11,8 @@ import numpy as np from pandas.types.missing import isnull, notnull -from pandas.types.cast import _maybe_upcast -from pandas.types.common import _ensure_platform_int +from pandas.types.cast import _maybe_upcast, _find_common_type +from pandas.types.common import _ensure_platform_int, is_scipy_sparse from pandas.core.common import _try_sort from pandas.compat.numpy import function as nv @@ -25,6 +25,7 @@ create_block_manager_from_arrays) import pandas.core.generic as generic from pandas.sparse.series import SparseSeries, SparseArray +from pandas.sparse.libsparse import BlockIndex, get_blocks from pandas.util.decorators import Appender import pandas.core.ops as ops @@ -39,15 +40,15 @@ class SparseDataFrame(DataFrame): Parameters ---------- - data : same types as can be passed to DataFrame + data : same types as can be passed to DataFrame or scipy.sparse.spmatrix index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' Default sparse kind for converting Series to SparseSeries. Will not override SparseSeries passed into constructor default_fill_value : float - Default fill_value for converting Series to SparseSeries. Will not - override SparseSeries passed in + Default fill_value for converting Series to SparseSeries + (default: nan). Will not override SparseSeries passed in. 
""" _constructor_sliced = SparseSeries _subtyp = 'sparse_frame' @@ -84,22 +85,19 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, self._default_kind = default_kind self._default_fill_value = default_fill_value - if isinstance(data, dict): - mgr = self._init_dict(data, index, columns) - if dtype is not None: - mgr = mgr.astype(dtype) + if is_scipy_sparse(data): + mgr = self._init_spmatrix(data, index, columns, dtype=dtype, + fill_value=default_fill_value) + elif isinstance(data, dict): + mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): - mgr = self._init_matrix(data, index, columns) - if dtype is not None: - mgr = mgr.astype(dtype) + mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): mgr = self._init_mgr(data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, DataFrame): - mgr = self._init_dict(data, data.index, data.columns) - if dtype is not None: - mgr = mgr.astype(dtype) + mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, BlockManager): mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) @@ -174,7 +172,43 @@ def _init_dict(self, data, index, columns, dtype=None): return to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): + """ Init self from ndarray or list of lists """ data = _prep_ndarray(data, copy=False) + index, columns = self._prep_index(data, index, columns) + data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) + return self._init_dict(data, index, columns, dtype) + + def _init_spmatrix(self, data, index, columns, dtype=None, + fill_value=None): + """ Init self from scipy.sparse matrix """ + index, columns = self._prep_index(data, index, columns) + data = data.tocoo() + N = len(index) + + # Construct a dict of SparseSeries + sdict = {} + values = Series(data.data, index=data.row, copy=False) + for col, rowvals in values.groupby(data.col): + # get_blocks expects int32 row indices in sorted order + rows = rowvals.index.values.astype(np.int32) + rows.sort() + blocs, blens = get_blocks(rows) + + sdict[columns[col]] = SparseSeries( + rowvals.values, index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, blocs, blens)) + + # Add any columns that were empty and thus not grouped on above + sdict.update({column: SparseSeries(index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, [], [])) + for column in columns + if column not in sdict}) + + return self._init_dict(sdict, index, columns, dtype) + + def _prep_index(self, data, index, columns): N, K = data.shape if index is None: index = _default_index(N) @@ -187,9 +221,48 @@ def _init_matrix(self, data, index, columns, dtype=None): if len(index) != N: raise ValueError('Index length mismatch: %d vs. %d' % (len(index), N)) + return index, columns - data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) - return self._init_dict(data, index, columns, dtype) + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.20.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. 
+
+        Notes
+        -----
+        The dtype will be the lowest-common-denominator type (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32. By numpy.find_common_type convention, mixing int64 and
+        uint64 will result in a float64 dtype.
+        """
+        try:
+            from scipy.sparse import coo_matrix
+        except ImportError:
+            raise ImportError('Scipy is not installed')
+
+        dtype = _find_common_type(self.dtypes)
+        cols, rows, datas = [], [], []
+        for col, name in enumerate(self):
+            s = self[name]
+            row = s.sp_index.to_int_index().indices
+            cols.append(np.repeat(col, len(row)))
+            rows.append(row)
+            datas.append(s.sp_values.astype(dtype, copy=False))
+
+        cols = np.concatenate(cols)
+        rows = np.concatenate(rows)
+        datas = np.concatenate(datas)
+        return coo_matrix((datas, (rows, cols)), shape=self.shape)
 
     def __array_wrap__(self, result):
         return self._constructor(
diff --git a/pandas/tests/sparse/common.py b/pandas/tests/sparse/common.py
new file mode 100644
index 0000000000000..3aeef8d436e1a
--- /dev/null
+++ b/pandas/tests/sparse/common.py
@@ -0,0 +1,10 @@
+import pytest
+
+import pandas.util.testing as tm
+
+
+@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
+def spmatrix(request):
+    tm._skip_if_no_scipy()
+    from scipy import sparse
+    return getattr(sparse, request.param + '_matrix')
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
index a7dd7f2e81033..4cd5a643ce4be 100644
--- a/pandas/tests/sparse/test_frame.py
+++ b/pandas/tests/sparse/test_frame.py
@@ -2,11 +2,17 @@
 
 import operator
 
+import pytest
+
 from numpy import nan
 import numpy as np
 import pandas as pd
 
 from pandas import Series, DataFrame, bdate_range, Panel
+from pandas.types.common import (is_bool_dtype,
+                                 is_float_dtype,
+                                 is_object_dtype,
+                                 is_float)
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.offsets import BDay
 import pandas.util.testing as tm
@@ -18,6 +24,8 @@
 from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
 from pandas.tests.frame.test_misc_api import SharedWithSparse
 
+from pandas.tests.sparse.common import spmatrix  # noqa: F401
+
 
 class TestSparseDataFrame(tm.TestCase, SharedWithSparse):
 
@@ -1118,6 +1126,60 @@ def test_isnotnull(self):
         tm.assert_frame_equal(res.to_dense(), exp)
 
 
+@pytest.mark.parametrize('index', [None, list('ab')])  # noqa: F811
+@pytest.mark.parametrize('columns', [None, list('cd')])
+@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
+@pytest.mark.parametrize('dtype', [object, bool, int, float, np.uint16])
+def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
+    # GH 4343
+    tm._skip_if_no_scipy()
+
+    # Make one ndarray and from it one sparse matrix, both to be used for
+    # constructing frames and comparing results
+    arr = np.eye(2, dtype=dtype)
+    try:
+        spm = spmatrix(arr)
+        assert spm.dtype == arr.dtype
+    except (TypeError, AssertionError):
+        # If conversion to sparse fails for this spmatrix type and arr.dtype,
+        # then the combination is not currently supported in NumPy, so we
+        # can just skip testing it thoroughly
+        return
+
+    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
+                             default_fill_value=fill_value)
+
+    # Expected result construction is kind of tricky for all
+    # dtype-fill_value combinations; easiest to cast to something generic
+    # and except later on
+    rarr = arr.astype(object)
+    rarr[arr == 0] = np.nan
+    expected =
pd.SparseDataFrame(rarr, index=index, columns=columns).fillna( + fill_value if fill_value is not None else np.nan) + + # Assert frame is as expected + sdf_obj = sdf.astype(object) + tm.assert_sp_frame_equal(sdf_obj, expected) + tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) + + # Assert spmatrices equal + tm.assert_equal(dict(sdf.to_coo().todok()), dict(spm.todok())) + + # Ensure dtype is preserved if possible + was_upcast = ((fill_value is None or is_float(fill_value)) and + not is_object_dtype(dtype) and + not is_float_dtype(dtype)) + res_dtype = (bool if is_bool_dtype(dtype) else + float if was_upcast else + dtype) + tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)}) + tm.assert_equal(sdf.to_coo().dtype, res_dtype) + + # However, adding a str column results in an upcast to object + sdf['strings'] = np.arange(len(sdf)).astype(str) + tm.assert_equal(sdf.to_coo().dtype, np.object_) + + class TestSparseDataFrameArithmetic(tm.TestCase): def test_numeric_op_scalar(self): diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index a36a77a70f9ad..b41df0da45234 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -30,11 +30,14 @@ is_float, is_bool, is_scalar, + is_scipy_sparse, _ensure_int32, _ensure_categorical) from pandas.types.missing import isnull from pandas.util import testing as tm +from pandas.tests.sparse.test_frame import spmatrix # noqa: F401 + def test_is_sequence(): is_seq = inference.is_sequence @@ -946,6 +949,12 @@ def test_nan_to_nat_conversions(): assert (s[8].value == np.datetime64('NaT').astype(np.int64)) +def test_is_scipy_sparse(spmatrix): # noqa: F811 + tm._skip_if_no_scipy() + assert is_scipy_sparse(spmatrix([[0, 1]])) + assert not is_scipy_sparse(np.array([1])) + + def test_ensure_int32(): values = np.arange(10, dtype=np.int32) result = _ensure_int32(values) diff --git a/pandas/types/common.py b/pandas/types/common.py index 1be5b5f6f1368..a1f03e59a5e6e 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -23,6 +23,9 @@ _TD_DTYPE = np.dtype('m8[ns]') _INT64_DTYPE = np.dtype(np.int64) +# oh the troubles to reduce import time +_is_scipy_sparse = None + _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 @@ -59,6 +62,17 @@ def is_sparse(array): return isinstance(array, (ABCSparseArray, ABCSparseSeries)) +def is_scipy_sparse(array): + """ return if we are a scipy.sparse.spmatrix """ + global _is_scipy_sparse + if _is_scipy_sparse is None: + try: + from scipy.sparse import issparse as _is_scipy_sparse + except ImportError: + _is_scipy_sparse = lambda _: False + return _is_scipy_sparse(array) + + def is_categorical(array): """ return if we are a categorical possibility """ return isinstance(array, ABCCategorical) or is_categorical_dtype(array) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b68bf55a347b2..ec30a9376a9da 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -297,6 +297,11 @@ def _skip_if_no_scipy(): except ImportError: import pytest pytest.skip('scipy.interpolate missing') + try: + import scipy.sparse # noqa + except ImportError: + import pytest + pytest.skip('scipy.sparse missing') def _skip_if_scipy_0_17(): From a0f22085325e72bb67fe3b98f366ba2b8f47f6a6 Mon Sep 17 00:00:00 2001 From: Kernc Date: Wed, 8 Mar 2017 23:06:23 +0100 Subject: [PATCH 2/2] DOC: Fix some whatsnew/v0.20.0.txt sphinx warnings --- doc/source/whatsnew/v0.20.0.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 569620305a2ec..8e9bfc8483f37 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -150,7 +150,7 @@ New Behavior: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() -.. _whatsnew_0200.enhancements.table_schema +.. _whatsnew_0200.enhancements.table_schema: Table Schema Output ^^^^^^^^^^^^^^^^^^^ @@ -308,7 +308,7 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi df.iloc[[0, 2], df.columns.get_loc('A')] -.. _whatsnew.api_breaking.io_compat +.. _whatsnew.api_breaking.io_compat: Possible incompat for HDF5 formats for pandas < 0.13.0 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -729,7 +729,7 @@ Bug Fixes - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) -- Bug in ``.asfreq()``, where frequency was not set for empty ``Series` (:issue:`14320`) +- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
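For quick reference, the round trip that the documentation snippets in this patch describe can be exercised end to end as below. This is a minimal sketch rather than part of the patch itself: it assumes SciPy is installed and a pandas build that already includes these changes, and the variable names are purely illustrative.

    import numpy as np
    import pandas as pd
    from scipy.sparse import csr_matrix

    # Build a mostly-zero array and its CSR representation
    arr = np.random.random(size=(1000, 5))
    arr[arr < .9] = 0
    sp_arr = csr_matrix(arr)

    # New constructor path: a SparseDataFrame built directly from the matrix
    # (non-COO input such as CSR is converted via .tocoo() internally)
    sdf = pd.SparseDataFrame(sp_arr)

    # New SparseDataFrame.to_coo(): back to a scipy.sparse.coo_matrix
    coo = sdf.to_coo()

    # Only explicitly stored values are emitted, so the round trip preserves
    # the original matrix contents
    assert np.allclose(coo.toarray(), sp_arr.toarray())

Note that ``to_coo()`` upcasts mixed column dtypes to the lowest common denominator (per ``numpy.find_common_type``), so a frame holding, say, ``float16`` and ``float32`` columns comes back as a ``float32`` matrix.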