diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 25233d970b3a6..e5c58bf817a36 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -282,7 +282,7 @@ Selection By Label See :ref:`Returning a View versus Copy ` pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. -**ALL** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. +**At least 1** of the labels for which you ask must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. The ``.loc`` attribute is the primary access method. The following are valid inputs: diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 322bcba9664d9..0cbcff9670379 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -172,6 +172,50 @@ API changes as the ``left`` argument. (:issue:`7737`) - Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs`. +- Consistency when indexing with ``.loc`` and a list-like indexer when no values are found. + + .. ipython:: python + + df = DataFrame([['a'],['b']],index=[1,2]) + df + + In prior versions there was a difference in these two constructs: + + - ``df.loc[[3]]`` would (prior to 0.15.0) return a frame reindexed by 3 (with all ``np.nan`` values) + - ``df.loc[[3],:]`` would raise ``KeyError``. + + Both will now raise a ``KeyError``. 
The rule is that *at least 1* indexer must be found when using a list-like and ``.loc`` (:issue:`7999`) + + There was also a difference between ``df.loc[[1,3]]`` (returns a frame reindexed by ``[1, 3]``) and ``df.loc[[1, 3],:]`` (would raise ``KeyError`` prior to 0.15.0). Both will now return a reindexed frame. + + .. ipython:: python + + df.loc[[1,3]] + df.loc[[1,3],:] + + This can also be seen in multi-axis indexing with a ``Panel``. + + .. ipython:: python + + p = Panel(np.arange(2*3*4).reshape(2,3,4), + items=['ItemA','ItemB'],major_axis=[1,2,3],minor_axis=['A','B','C','D']) + p + + The following would raise ``KeyError`` prior to 0.15.0: + + .. ipython:: python + + p.loc[['ItemA','ItemD'],:,'D'] + + Furthermore, ``.loc`` will raise if no values are found in a multi-index with a list-like indexer: + + .. ipython:: python + :okexcept: + + s = Series(np.arange(3,dtype='int64'),index=MultiIndex.from_product([['A'],['foo','bar','baz']], + names=['one','two'])).sortlevel() + s + s.loc[['D']] .. 
_whatsnew_0150.dt: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6ee03eab4bab8..dfc552e8df0d7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -132,6 +132,16 @@ def _has_valid_tuple(self, key): raise ValueError("Location based indexing can only have [%s] " "types" % self._valid_types) + def _should_validate_iterable(self, axis=0): + """ return a boolean whether this axes needs validation for a passed iterable """ + ax = self.obj._get_axis(axis) + if isinstance(ax, MultiIndex): + return False + elif ax.is_floating(): + return False + + return True + def _is_nested_tuple_indexer(self, tup): if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): return any([ _is_nested_tuple(tup,ax) for ax in self.obj.axes ]) @@ -762,7 +772,7 @@ def _getitem_lowerdim(self, tup): # we can directly get the axis result since the axis is specified if self.axis is not None: axis = self.obj._get_axis_number(self.axis) - return self._getitem_axis(tup, axis=axis, validate_iterable=True) + return self._getitem_axis(tup, axis=axis) # we may have a nested tuples indexer here if self._is_nested_tuple_indexer(tup): @@ -825,7 +835,7 @@ def _getitem_nested_tuple(self, tup): return result # this is a series with a multi-index specified a tuple of selectors - return self._getitem_axis(tup, axis=0, validate_iterable=True) + return self._getitem_axis(tup, axis=0) # handle the multi-axis by taking sections and reducing # this is iterative @@ -838,7 +848,7 @@ def _getitem_nested_tuple(self, tup): continue current_ndim = obj.ndim - obj = getattr(obj, self.name)._getitem_axis(key, axis=axis, validate_iterable=True) + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) axis += 1 # if we have a scalar, we are done @@ -859,9 +869,11 @@ def _getitem_nested_tuple(self, tup): return obj - def _getitem_axis(self, key, axis=0, validate_iterable=False): + def _getitem_axis(self, key, axis=0): + + if self._should_validate_iterable(axis): + 
self._has_valid_type(key, axis) - self._has_valid_type(key, axis) labels = self.obj._get_axis(axis) if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) @@ -888,17 +900,29 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False): return self._get_label(key, axis=axis) def _getitem_iterable(self, key, axis=0): + if self._should_validate_iterable(axis): + self._has_valid_type(key, axis) + labels = self.obj._get_axis(axis) def _reindex(keys, level=None): + try: - return self.obj.reindex_axis(keys, axis=axis, level=level) + result = self.obj.reindex_axis(keys, axis=axis, level=level) except AttributeError: # Series if axis != 0: raise AssertionError('axis must be 0') return self.obj.reindex(keys, level=level) + # this is an error as we are trying to find + # keys in a multi-index that don't exist + if isinstance(labels, MultiIndex) and level is not None: + if hasattr(result,'ndim') and not np.prod(result.shape) and len(keys): + raise KeyError("cannot index a multi-index axis with these keys") + + return result + if com._is_bool_indexer(key): key = _check_bool_indexer(labels, key) inds, = key.nonzero() @@ -1149,7 +1173,7 @@ def __getitem__(self, key): else: return self._getitem_axis(key, axis=0) - def _getitem_axis(self, key, axis=0, validate_iterable=False): + def _getitem_axis(self, key, axis=0): raise NotImplementedError() def _getbool_axis(self, key, axis=0): @@ -1223,11 +1247,11 @@ def _has_valid_type(self, key, axis): if isinstance(key, tuple) and isinstance(ax, MultiIndex): return True - # require all elements in the index + # require at least 1 element in the index idx = _ensure_index(key) - if not idx.isin(ax).all(): + if len(idx) and not idx.isin(ax).any(): - raise KeyError("[%s] are not in ALL in the [%s]" % + raise KeyError("None of [%s] are in the [%s]" % (key, self.obj._get_axis_name(axis))) return True @@ -1256,7 +1280,7 @@ def error(): return True - def _getitem_axis(self, key, axis=0, validate_iterable=False): + def 
_getitem_axis(self, key, axis=0): labels = self.obj._get_axis(axis) if isinstance(key, slice): @@ -1280,9 +1304,6 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False): if hasattr(key, 'ndim') and key.ndim > 1: raise ValueError('Cannot index with multidimensional key') - if validate_iterable: - self._has_valid_type(key, axis) - return self._getitem_iterable(key, axis=axis) # nested tuple slicing @@ -1389,7 +1410,7 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(slice_obj, axis=axis, convert=False) - def _getitem_axis(self, key, axis=0, validate_iterable=False): + def _getitem_axis(self, key, axis=0): if isinstance(key, slice): self._has_valid_type(key, axis) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 3552c75900745..e7bb716de60f3 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -743,11 +743,14 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0) - # fails self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError) self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError) - self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError) - self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError) + self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,7], typs = ['ints'], axes=1, fails = KeyError) + self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,10], typs = ['ints'], axes=2, fails = KeyError) + + # fails + self.check_result('list lbl', 'loc', [20,30,40], 'ix', [20,30,40], typs = ['ints'], axes=1, fails = KeyError) + self.check_result('list lbl', 'loc', [20,30,40], 'ix', [20,30,40], typs = ['ints'], axes=2, fails = KeyError) # 
array like self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0) @@ -815,14 +818,9 @@ def test_loc_to_fail(self): s.loc['a'] = 2 self.assertRaises(KeyError, lambda : s.loc[-1]) + self.assertRaises(KeyError, lambda : s.loc[[-1, -2]]) - result = s.loc[[-1, -2]] - expected = Series(np.nan,index=[-1,-2]) - assert_series_equal(result, expected) - - result = s.loc[['4']] - expected = Series(np.nan,index=['4']) - assert_series_equal(result, expected) + self.assertRaises(KeyError, lambda : s.loc[['4']]) s.loc[-1] = 3 result = s.loc[[-1,-2]] @@ -830,15 +828,25 @@ def test_loc_to_fail(self): assert_series_equal(result, expected) s['a'] = 2 - result = s.loc[[-2]] - expected = Series([np.nan],index=[-2]) - assert_series_equal(result, expected) + self.assertRaises(KeyError, lambda : s.loc[[-2]]) del s['a'] def f(): s.loc[[-2]] = 0 self.assertRaises(KeyError, f) + # inconsistency between .loc[values] and .loc[values,:] + # GH 7999 + df = DataFrame([['a'],['b']],index=[1,2],columns=['value']) + + def f(): + df.loc[[3],:] + self.assertRaises(KeyError, f) + + def f(): + df.loc[[3]] + self.assertRaises(KeyError, f) + def test_loc_getitem_label_slice(self): # label slices (with ints) @@ -1575,11 +1583,13 @@ def f(): self.assertRaises(ValueError, f) # ambiguous cases - # these can be multiply interpreted - # but we can catch this in some cases - def f(): - df.loc[(slice(None),[1])] - self.assertRaises(KeyError, f) + # these can be multiply interpreted (e.g. 
in this case + # as df.loc[slice(None),[1]] as well) + self.assertRaises(KeyError, lambda : df.loc[slice(None),[1]]) + + result = df.loc[(slice(None),[1]),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) # not lexsorted self.assertEqual(df.index.lexsort_depth,2) @@ -1960,9 +1970,12 @@ def f(): result = s.loc[['A','D']] assert_series_equal(result,expected) - # empty series - result = s.loc[['D']] - expected = s.loc[[]] + # not any values found + self.assertRaises(KeyError, lambda : s.loc[['D']]) + + # empty ok + result = s.loc[[]] + expected = s.iloc[[]] assert_series_equal(result,expected) idx = pd.IndexSlice @@ -2788,9 +2801,8 @@ def test_series_partial_set(self): result = ser.loc[[3, 2, 3]] assert_series_equal(result, expected) - expected = Series([np.nan, np.nan, np.nan], index=[3, 3, 3]) - result = ser.loc[[3, 3, 3]] - assert_series_equal(result, expected) + # raises as nothing is in the index + self.assertRaises(KeyError, lambda : ser.loc[[3, 3, 3]]) expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) result = ser.loc[[2, 2, 3]]