Skip to content

Commit 00d4ec3

Browse files
sinhrks authored and jreback committed
BUG: Misc fixes for SparseSeries indexing with MI
closes #13144 Author: sinhrks <[email protected]> Closes #13163 from sinhrks/sparse_multi and squashes the following commits: eb24102 [sinhrks] BUG: Misc fixes for SparseSeries indexing with MI
1 parent e5c18b4 commit 00d4ec3

File tree

7 files changed

+214
-43
lines changed

7 files changed

+214
-43
lines changed

doc/source/whatsnew/v0.18.2.txt

+3
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ Performance Improvements
106106

107107
Bug Fixes
108108
~~~~~~~~~
109+
110+
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
111+
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
109112
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
110113
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
111114

pandas/indexes/multi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None):
592592
def get_value(self, series, key):
593593
# somewhat broken encapsulation
594594
from pandas.core.indexing import maybe_droplevels
595-
from pandas.core.series import Series
596595

597596
# Label-based
598597
s = _values_from_object(series)
@@ -604,7 +603,8 @@ def _try_mi(k):
604603
new_values = series._values[loc]
605604
new_index = self[loc]
606605
new_index = maybe_droplevels(new_index, k)
607-
return Series(new_values, index=new_index, name=series.name)
606+
return series._constructor(new_values, index=new_index,
607+
name=series.name).__finalize__(self)
608608

609609
try:
610610
return self._engine.get_value(s, k)

pandas/sparse/series.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,13 @@
55

66
# pylint: disable=E1101,E1103,W0231
77

8-
from numpy import nan, ndarray
98
import numpy as np
109
import warnings
1110
import operator
1211

1312
from pandas.compat.numpy import function as nv
1413
from pandas.core.common import isnull, _values_from_object, _maybe_match_name
15-
from pandas.core.index import Index, _ensure_index
14+
from pandas.core.index import Index, _ensure_index, InvalidIndexError
1615
from pandas.core.series import Series
1716
from pandas.core.frame import DataFrame
1817
from pandas.core.internals import SingleBlockManager
@@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
135134
if is_sparse_array:
136135
fill_value = data.fill_value
137136
else:
138-
fill_value = nan
137+
fill_value = np.nan
139138

140139
if is_sparse_array:
141140
if isinstance(data, SparseSeries) and index is None:
@@ -393,8 +392,10 @@ def _get_val_at(self, loc):
393392

394393
def __getitem__(self, key):
395394
try:
396-
return self._get_val_at(self.index.get_loc(key))
395+
return self.index.get_value(self, key)
397396

397+
except InvalidIndexError:
398+
pass
398399
except KeyError:
399400
if isinstance(key, (int, np.integer)):
400401
return self._get_val_at(key)
@@ -406,13 +407,12 @@ def __getitem__(self, key):
406407
# Could not hash item, must be array-like?
407408
pass
408409

409-
# is there a case where this would NOT be an ndarray?
410-
# need to find an example, I took out the case for now
411-
412410
key = _values_from_object(key)
413-
dataSlice = self.values[key]
414-
new_index = Index(self.index.view(ndarray)[key])
415-
return self._constructor(dataSlice, index=new_index).__finalize__(self)
411+
if self.index.nlevels > 1 and isinstance(key, tuple):
412+
# to handle MultiIndex labels
413+
key = self.index.get_loc(key)
414+
return self._constructor(self.values[key],
415+
index=self.index[key]).__finalize__(self)
416416

417417
def _get_values(self, indexer):
418418
try:

pandas/sparse/tests/test_format.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import print_function
3+
4+
import numpy as np
5+
import pandas as pd
6+
7+
import pandas.util.testing as tm
8+
from pandas.compat import (is_platform_windows,
9+
is_platform_32bit)
10+
from pandas.core.config import option_context
11+
12+
13+
use_32bit_repr = is_platform_windows() or is_platform_32bit()
14+
15+
16+
class TestSeriesFormatting(tm.TestCase):
    """Check the ``repr`` of sparse Series, in full and row-truncated."""

    _multiprocess_can_split_ = True

    def test_sparse_max_row(self):
        """Flat-indexed sparse Series: full repr, then ``max_rows=3``."""
        s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
        result = repr(s)
        # The expected array reprs carry no dtype suffix when
        # use_32bit_repr is set (Windows or 32-bit platforms).
        dtype = '' if use_32bit_repr else ', dtype=int32'
        exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
               "4 NaN\ndtype: float64\nBlockIndex\n"
               "Block locations: array([0, 3]{0})\n"
               "Block lengths: array([1, 1]{0})".format(dtype))
        self.assertEqual(result, exp)

        with option_context("display.max_rows", 3):
            # GH 10560
            result = repr(s)
            exp = ("0 1.0\n ... \n4 NaN\n"
                   "dtype: float64\nBlockIndex\n"
                   "Block locations: array([0, 3]{0})\n"
                   "Block lengths: array([1, 1]{0})".format(dtype))
            self.assertEqual(result, exp)

    def test_sparse_mi_max_row(self):
        """MultiIndexed sparse Series: full repr, then ``max_rows=3``."""
        idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
                                         ('C', 0), ('C', 1), ('C', 2)])
        s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan],
                      index=idx).to_sparse()
        result = repr(s)
        dtype = '' if use_32bit_repr else ', dtype=int32'
        # Both BlockIndex arrays use the platform-dependent suffix; the
        # "Block locations" line previously hard-coded ", dtype=int32",
        # which disagreed with the "Block lengths" line whenever
        # use_32bit_repr was set.
        exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n"
               "C 0 3.0\n 1 NaN\n 2 NaN\n"
               "dtype: float64\nBlockIndex\n"
               "Block locations: array([0, 3]{0})\n"
               "Block lengths: array([1, 1]{0})".format(dtype))
        self.assertEqual(result, exp)

        with option_context("display.max_rows", 3):
            # GH 13144
            result = repr(s)
            exp = ("A 0 1.0\n ... \nC 2 NaN\n"
                   "dtype: float64\nBlockIndex\n"
                   "Block locations: array([0, 3]{0})\n"
                   "Block lengths: array([1, 1]{0})".format(dtype))
            self.assertEqual(result, exp)

pandas/sparse/tests/test_indexing.py

+130-12
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase):
1010

1111
_multiprocess_can_split_ = True
1212

13+
def setUp(self):
    """Create the dense Series and its sparse twin shared by the tests."""
    dense = pd.Series([1, np.nan, np.nan, 3, np.nan])
    self.orig = dense
    self.sparse = dense.to_sparse()
16+
1317
def test_getitem(self):
14-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
15-
sparse = orig.to_sparse()
18+
orig = self.orig
19+
sparse = self.sparse
1620

1721
self.assertEqual(sparse[0], 1)
1822
self.assertTrue(np.isnan(sparse[1]))
@@ -33,8 +37,9 @@ def test_getitem(self):
3337
tm.assert_sp_series_equal(result, exp)
3438

3539
def test_getitem_slice(self):
36-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
37-
sparse = orig.to_sparse()
40+
orig = self.orig
41+
sparse = self.sparse
42+
3843
tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse())
3944
tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse())
4045
tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse())
@@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self):
8489
orig[-5:].to_sparse(fill_value=0))
8590

8691
def test_loc(self):
87-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
88-
sparse = orig.to_sparse()
92+
orig = self.orig
93+
sparse = self.sparse
8994

9095
self.assertEqual(sparse.loc[0], 1)
9196
self.assertTrue(np.isnan(sparse.loc[1]))
@@ -154,19 +159,26 @@ def test_loc_index_fill_value(self):
154159
tm.assert_sp_series_equal(result, exp)
155160

156161
def test_loc_slice(self):
157-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
158-
sparse = orig.to_sparse()
162+
orig = self.orig
163+
sparse = self.sparse
159164
tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())
160165

166+
def test_loc_slice_index_fill_value(self):
    """Label slice on a string index must match the dense result."""
    dense = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
    sp = dense.to_sparse(fill_value=0)

    from_c = slice('C', None)
    expected = dense.loc[from_c].to_sparse(fill_value=0)
    tm.assert_sp_series_equal(sp.loc[from_c], expected)
172+
161173
def test_loc_slice_fill_value(self):
    """Label slice with ``fill_value=0`` must match the dense result."""
    dense = pd.Series([1, np.nan, 0, 3, 0])
    sp = dense.to_sparse(fill_value=0)

    from_two = slice(2, None)
    expected = dense.loc[from_two].to_sparse(fill_value=0)
    tm.assert_sp_series_equal(sp.loc[from_two], expected)
166178

167179
def test_iloc(self):
168-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
169-
sparse = orig.to_sparse()
180+
orig = self.orig
181+
sparse = self.sparse
170182

171183
self.assertEqual(sparse.iloc[3], 3)
172184
self.assertTrue(np.isnan(sparse.iloc[2]))
@@ -234,8 +246,9 @@ def test_at_fill_value(self):
234246
self.assertEqual(sparse.at['e'], orig.at['e'])
235247

236248
def test_iat(self):
237-
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
238-
sparse = orig.to_sparse()
249+
orig = self.orig
250+
sparse = self.sparse
251+
239252
self.assertEqual(sparse.iat[0], orig.iat[0])
240253
self.assertTrue(np.isnan(sparse.iat[1]))
241254
self.assertTrue(np.isnan(sparse.iat[2]))
@@ -356,6 +369,111 @@ def test_reindex_fill_value(self):
356369
tm.assert_sp_series_equal(res, exp)
357370

358371

372+
class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):
    """Run sparse Series indexing checks against a MultiIndex fixture.

    Tests not overridden here are taken from ``TestSparseSeriesIndexing``
    and use the fixture built in ``setUp`` below.
    """

    _multiprocess_can_split_ = True

    def setUp(self):
        # MultiIndex with duplicated values in the first level
        tuples = [('A', 0), ('A', 1), ('B', 0), ('C', 0), ('C', 1)]
        mi = pd.MultiIndex.from_tuples(tuples)
        dense = pd.Series([1, np.nan, np.nan, 3, np.nan], index=mi)
        self.orig = dense
        self.sparse = dense.to_sparse()

    def test_getitem_multi(self):
        dense, sp = self.orig, self.sparse

        # integer keys
        self.assertEqual(sp[0], dense[0])
        self.assertTrue(np.isnan(sp[1]))
        self.assertEqual(sp[3], dense[3])

        # first-level labels
        for label in ('A', 'B'):
            tm.assert_sp_series_equal(sp[label], dense[label].to_sparse())

        # list of integers
        picked = sp[[1, 3, 4]]
        expected = dense[[1, 3, 4]].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

        # dense array
        picked = sp[dense % 2 == 1]
        expected = dense[dense % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

        # sparse array (actually it coerces to normal Series)
        picked = sp[sp % 2 == 1]
        expected = dense[dense % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

    def test_getitem_multi_tuple(self):
        dense, sp = self.orig, self.sparse

        self.assertEqual(sp['C', 0], dense['C', 0])
        for key in (('A', 1), ('B', 0)):
            self.assertTrue(np.isnan(sp[key]))

    def test_getitems_slice_multi(self):
        dense, sp = self.orig, self.sparse

        tm.assert_sp_series_equal(sp[2:], dense[2:].to_sparse())

        label_slices = (slice('B', None), slice('C', None),
                        slice('A', 'B'), slice(None, 'B'))
        for sl in label_slices:
            tm.assert_sp_series_equal(sp.loc[sl], dense.loc[sl].to_sparse())

    def test_loc(self):
        # needs to be overridden to use different labels
        dense, sp = self.orig, self.sparse

        for label in ('A', 'B'):
            tm.assert_sp_series_equal(sp.loc[label],
                                      dense.loc[label].to_sparse())

        picked = sp.loc[[1, 3, 4]]
        expected = dense.loc[[1, 3, 4]].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

        # exceeds the bounds
        picked = sp.loc[[1, 3, 4, 5]]
        expected = dense.loc[[1, 3, 4, 5]].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

        # dense array
        picked = sp.loc[dense % 2 == 1]
        expected = dense.loc[dense % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

        # sparse array (actually it coerces to normal Series)
        picked = sp.loc[sp % 2 == 1]
        expected = dense.loc[dense % 2 == 1].to_sparse()
        tm.assert_sp_series_equal(picked, expected)

    def test_loc_multi_tuple(self):
        dense, sp = self.orig, self.sparse

        self.assertEqual(sp.loc['C', 0], dense.loc['C', 0])
        for key in (('A', 1), ('B', 0)):
            self.assertTrue(np.isnan(sp.loc[key]))

    def test_loc_slice(self):
        dense, sp = self.orig, self.sparse

        label_slices = (slice('A', None), slice('B', None),
                        slice('C', None), slice('A', 'B'),
                        slice(None, 'B'))
        for sl in label_slices:
            tm.assert_sp_series_equal(sp.loc[sl], dense.loc[sl].to_sparse())
475+
476+
359477
class TestSparseDataFrameIndexing(tm.TestCase):
360478

361479
_multiprocess_can_split_ = True

pandas/sparse/tests/test_series.py

+9
Original file line numberDiff line numberDiff line change
@@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self):
10191019
check = check.dropna().to_sparse()
10201020
tm.assert_sp_series_equal(ss, check)
10211021

1022+
def test_from_coo_long_repr(self):
    """Smoke test: ``repr`` of a SparseSeries built from a COO matrix.

    Only checks that no error is raised; the formatting itself is
    tested in test_format.
    """
    # GH 13114
    # NOTE(review): the commit message and whatsnew entries cite
    # GH 13144 -- confirm which issue number is intended here.
    tm._skip_if_no_scipy()
    import scipy.sparse

    coo = scipy.sparse.rand(350, 18)
    from_coo = SparseSeries.from_coo(coo)
    repr(from_coo)
1030+
10221031
def _run_test(self, ss, kwargs, check):
10231032
results = ss.to_coo(**kwargs)
10241033
self._check_results_to_coo(results, check)

pandas/tests/formats/test_format.py

-19
Original file line numberDiff line numberDiff line change
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
37583758
exp = '0 0\n ..\n9 9'
37593759
self.assertEqual(res, exp)
37603760

3761-
def test_sparse_max_row(self):
3762-
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
3763-
result = repr(s)
3764-
dtype = '' if use_32bit_repr else ', dtype=int32'
3765-
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
3766-
"4 NaN\ndtype: float64\nBlockIndex\n"
3767-
"Block locations: array([0, 3]{0})\n"
3768-
"Block lengths: array([1, 1]{0})".format(dtype))
3769-
self.assertEqual(result, exp)
3770-
3771-
with option_context("display.max_rows", 3):
3772-
# GH 10560
3773-
result = repr(s)
3774-
exp = ("0 1.0\n ... \n4 NaN\n"
3775-
"dtype: float64\nBlockIndex\n"
3776-
"Block locations: array([0, 3]{0})\n"
3777-
"Block lengths: array([1, 1]{0})".format(dtype))
3778-
self.assertEqual(result, exp)
3779-
37803761

37813762
class TestEngFormatter(tm.TestCase):
37823763
_multiprocess_can_split_ = True

0 commit comments

Comments
 (0)