From eb241028936aa72e21d664c00a802a357c65d0fc Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 12 May 2016 18:17:50 +0900 Subject: [PATCH] BUG: Misc fixes for SparseSeries indexing with MI --- doc/source/whatsnew/v0.18.2.txt | 3 + pandas/indexes/multi.py | 4 +- pandas/sparse/series.py | 20 ++-- pandas/sparse/tests/test_format.py | 60 +++++++++++ pandas/sparse/tests/test_indexing.py | 142 ++++++++++++++++++++++++--- pandas/sparse/tests/test_series.py | 9 ++ pandas/tests/formats/test_format.py | 19 ---- 7 files changed, 214 insertions(+), 43 deletions(-) create mode 100644 pandas/sparse/tests/test_format.py diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 85209c0dfa03d..bdaa654ba8520 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -107,6 +107,9 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 3effc9b1315e6..db2f80ae78446 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None): def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series # Label-based s = _values_from_object(series) @@ -604,7 +603,8 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return series._constructor(new_values, index=new_index, + name=series.name).__finalize__(self) try: return self._engine.get_value(s, k) diff --git a/pandas/sparse/series.py 
b/pandas/sparse/series.py index a783a7c596955..519068b97a010 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -5,14 +5,13 @@ # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import warnings import operator from pandas.compat.numpy import function as nv from pandas.core.common import isnull, _values_from_object, _maybe_match_name -from pandas.core.index import Index, _ensure_index +from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.internals import SingleBlockManager @@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if is_sparse_array: fill_value = data.fill_value else: - fill_value = nan + fill_value = np.nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: @@ -393,8 +392,10 @@ def _get_val_at(self, loc): def __getitem__(self, key): try: - return self._get_val_at(self.index.get_loc(key)) + return self.index.get_value(self, key) + except InvalidIndexError: + pass except KeyError: if isinstance(key, (int, np.integer)): return self._get_val_at(key) @@ -406,13 +407,12 @@ def __getitem__(self, key): # Could not hash item, must be array-like? pass - # is there a case where this would NOT be an ndarray? 
- # need to find an example, I took out the case for now - key = _values_from_object(key) - dataSlice = self.values[key] - new_index = Index(self.index.view(ndarray)[key]) - return self._constructor(dataSlice, index=new_index).__finalize__(self) + if self.index.nlevels > 1 and isinstance(key, tuple): + # to handle MultiIndex labels + key = self.index.get_loc(key) + return self._constructor(self.values[key], + index=self.index[key]).__finalize__(self) def _get_values(self, indexer): try: diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py new file mode 100644 index 0000000000000..2981e0f4af0bf --- /dev/null +++ b/pandas/sparse/tests/test_format.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np +import pandas as pd + +import pandas.util.testing as tm +from pandas.compat import (is_platform_windows, + is_platform_32bit) +from pandas.core.config import option_context + + +use_32bit_repr = is_platform_windows() or is_platform_32bit() + + +class TestSeriesFormatting(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_sparse_max_row(self): + s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 10560 + result = repr(s) + exp = ("0 1.0\n ... 
\n4 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_mi_max_row(self): + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1), ('C', 2)]) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], + index=idx).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 13144 + result = repr(s) + exp = ("A 0 1.0\n ... \nC 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index ca2996941aef7..1f88d22bd8f93 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase): _multiprocess_can_split_ = True + def setUp(self): + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + self.sparse = self.orig.to_sparse() + def test_getitem(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse[0], 1) self.assertTrue(np.isnan(sparse[1])) @@ -33,8 +37,9 @@ def test_getitem(self): tm.assert_sp_series_equal(result, exp) def test_getitem_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse()) tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse()) 
tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) @@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self): orig[-5:].to_sparse(fill_value=0)) def test_loc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.loc[0], 1) self.assertTrue(np.isnan(sparse.loc[1])) @@ -154,10 +159,17 @@ def test_loc_index_fill_value(self): tm.assert_sp_series_equal(result, exp) def test_loc_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + def test_loc_slice_index_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + tm.assert_sp_series_equal(sparse.loc['C':], + orig.loc['C':].to_sparse(fill_value=0)) + def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) @@ -165,8 +177,8 @@ def test_loc_slice_fill_value(self): orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.iloc[3], 3) self.assertTrue(np.isnan(sparse.iloc[2])) @@ -234,8 +246,9 @@ def test_at_fill_value(self): self.assertEqual(sparse.at['e'], orig.at['e']) def test_iat(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + self.assertEqual(sparse.iat[0], orig.iat[0]) self.assertTrue(np.isnan(sparse.iat[1])) self.assertTrue(np.isnan(sparse.iat[2])) @@ -356,6 +369,111 @@ def test_reindex_fill_value(self): tm.assert_sp_series_equal(res, exp) +class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): + + _multiprocess_can_split_ = True + + def setUp(self): + # Mi with duplicated values + idx = 
pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1)]) + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) + self.sparse = self.orig.to_sparse() + + def test_getitem_multi(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse[0], orig[0]) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[3], orig[3]) + + tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse()) + tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse()) + + result = sparse[[1, 3, 4]] + exp = orig[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse[orig % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse[sparse % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_getitem_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse['C', 0], orig['C', 0]) + self.assertTrue(np.isnan(sparse['A', 1])) + self.assertTrue(np.isnan(sparse['B', 0])) + + def test_getitems_slice_multi(self): + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + def test_loc(self): + # needs to be overridden to use different labels + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse.loc['A'], + orig.loc['A'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B'], + orig.loc['B'].to_sparse()) + + result = sparse.loc[[1, 3, 4]] + exp = orig.loc[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # exceeds the 
bounds + result = sparse.loc[[1, 3, 4, 5]] + exp = orig.loc[[1, 3, 4, 5]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse.loc[orig % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse.loc[sparse % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_loc_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0]) + self.assertTrue(np.isnan(sparse.loc['A', 1])) + self.assertTrue(np.isnan(sparse.loc['B', 0])) + + def test_loc_slice(self): + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + class TestSparseDataFrameIndexing(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 44bc51077ef3e..5cbc509b836db 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self): check = check.dropna().to_sparse() tm.assert_sp_series_equal(ss, check) + def test_from_coo_long_repr(self): + # GH 13114 + # test it doesn't raise error. 
Formatting is tested in test_format + tm._skip_if_no_scipy() + import scipy.sparse + + sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18)) + repr(sparse) + def _run_test(self, ss, kwargs, check): results = ss.to_coo(**kwargs) self._check_results_to_coo(results, check) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 96770a86ff383..7a806280916f1 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3758,25 +3758,6 @@ def test_to_string_header(self): exp = '0 0\n ..\n9 9' self.assertEqual(res, exp) - def test_sparse_max_row(self): - s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() - result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" - "4 NaN\ndtype: float64\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) - self.assertEqual(result, exp) - - with option_context("display.max_rows", 3): - # GH 10560 - result = repr(s) - exp = ("0 1.0\n ... \n4 NaN\n" - "dtype: float64\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) - self.assertEqual(result, exp) - class TestEngFormatter(tm.TestCase): _multiprocess_can_split_ = True