From 372e77f7d31a8e18cc1ec4748356f67f48883e3e Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 28 Jan 2014 01:09:47 +0200 Subject: [PATCH 1/9] ENH: support per-axis, per-level indexing with loc[] CLN: add comments in indexing code CLN: comment out possibly stale kludge fix and wait for explosion CLN: Mark if clause for handling of per-axis tuple indexing with loc PERF: vectorize _spec_to_array_indices, for 3-4x speedup PERF: remove no longer needed list conversion. 1.4x speedup --- pandas/core/indexing.py | 210 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 200 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 029055d80b1af..d7994a015d801 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -61,8 +61,7 @@ def _get_label(self, label, axis=0): return self.obj[label] elif (isinstance(label, tuple) and isinstance(label[axis], slice)): - - raise IndexingError('no slices here') + raise IndexingError('no slices here, handle elsewhere') try: return self.obj._xs(label, axis=axis, copy=False) @@ -700,24 +699,32 @@ def _getitem_lowerdim(self, tup): # a bit kludgy if isinstance(ax0, MultiIndex): try: + # fast path for series or for tup devoid of slices return self._get_label(tup, axis=0) except TypeError: # slices are unhashable pass except Exception as e1: if isinstance(tup[0], (slice, Index)): - raise IndexingError + raise IndexingError("Handle elsewhere") # raise the error if we are not sorted if not ax0.is_lexsorted_for_tuple(tup): raise e1 - try: - loc = ax0.get_loc(tup[0]) - except KeyError: - raise e1 + + # GH911 introduced this clause, but the regression test + # added for it now passes even without it. Let's rock the boat. + # 2014/01/27 + + # # should we abort, or keep going? + # try: + # loc = ax0.get_loc(tup[0]) + # except KeyError: + # raise e1 + if len(tup) > self.obj.ndim: - raise IndexingError + raise IndexingError("Too many indexers. 
handle elsewhere") # to avoid wasted computation # df.ix[d1:d2, 0] -> columns first (True) @@ -730,9 +737,9 @@ def _getitem_lowerdim(self, tup): if not _is_list_like(section): return section - # might have been a MultiIndex elif section.ndim == self.ndim: - + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS new_key = tup[:i] + (_NS,) + tup[i + 1:] else: @@ -748,6 +755,7 @@ def _getitem_lowerdim(self, tup): if len(new_key) == 1: new_key, = new_key + # This is an elided recursive call to iloc/loc/etc' return getattr(section, self.name)[new_key] raise IndexingError('not applicable') @@ -1171,6 +1179,14 @@ def _getitem_axis(self, key, axis=0): raise ValueError('Cannot index with multidimensional key') return self._getitem_iterable(key, axis=axis) + elif isinstance(key, tuple) and isinstance(labels, MultiIndex) and \ + any([isinstance(x,slice) for x in key]): + # handle per-axis tuple containting label criteria for + # each level (or a prefix of levels), may contain + # (None) slices, list of labels or labels + specs = _tuple_to_mi_locs(labels,key) + g = _spec_to_array_indices(labels, specs) + return self.obj.iloc[g] else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis) @@ -1554,3 +1570,177 @@ def _maybe_droplevels(index, key): pass return index + +def _tuple_to_mi_locs(ix,tup): + """Convert a tuple of slices/label lists/labels to a level-wise spec + + Parameters + ---------- + ix: a sufficiently lexsorted, unique/non-dupe MultIindex. + tup: a tuple of slices, labels or lists of labels. 
+ slice(None) is acceptable, and the case of len(tup)>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) + [(0, 2), [0, 1]] + + read as: + - All labels in position [0,1) in first level + - for each of those, all labels at positions 0 or 1. + + The same effective result can be achieved by specifying the None Slice, + or omitting it completely. Note the tuple (0,2) has replaced the list [0 1], + but the outcome is the same. + + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),slice(None))) + [(0, 2), (0,2)] + + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),)) + [(0, 2), (0,2)] + + """ + + + ranges = [] + + # ix must be lexsorted to at least as many levels + # as there are elements in `tup` + assert ix.is_lexsorted_for_tuple(tup) + assert ix.is_unique + assert isinstance(ix,MultiIndex) + + for i,k in enumerate(tup): + level = ix.levels[i] + + if _is_list_like(k): + # a collection of labels to include from this level + ranges.append([level.get_loc(x) for x in k]) + continue + if k == slice(None): + start = 0 + stop = len(level) + elif isinstance(k,slice): + start = level.get_loc(k.start) + stop = len(level) + if k.stop: + stop = level.get_loc(k.stop) + else: + # a single label + start = level.get_loc(k) + stop = start + + ranges.append((start,stop)) + + for i in range(i+1,len(ix.levels)): + # omitting trailing dims + # means include all values + level = ix.levels[i] + start = 0 + stop = len(level) + ranges.append((start,stop)) + + return ranges + +def _spec_to_array_indices(ix, specs): + """Convert a tuple of slices/label lists/labels to a level-wise spec + + Parameters + ---------- + ix: a sufficiently lexsorted, unique/non-dupe MultIindex. + specs: a list of 2-tuples/list of label positions. Specifically, The + output of _tuple_to_mi_locs. 
+ len(specs) must matc ix.nlevels. + + Returns + ------- + a generator of row positions relative to ix, corresponding to specs. + Suitable for usage with `iloc`. + + Example (This is *not* a doctest): + >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + + >>> specs = _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) + >>> list(_spec_to_array_indices(mi, specs)) + [0, 1, 2, 3] + + Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 + and 'B0' or 'B1' at level = 0 + + """ + assert ix.is_lexsorted_for_tuple(specs) + assert len(specs) == ix.nlevels + assert ix.is_unique + assert isinstance(ix,MultiIndex) + + # step size/increment for iteration at each level + giant_steps = np.cumprod(ix.levshape[::-1])[::-1] + giant_steps[:-1] = giant_steps[1:] + giant_steps[-1] = 1 + + def _iter_vectorize(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indicies to include + valrange = spec + + if len(specs)-1 == i: + return np.array(valrange) + else: + tmpl = np.array([v for v in _iter_vectorize(specs,i+1)]) + res=np.tile(tmpl,(len(valrange),1)) + steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) + return (res+steps).flatten() + + + def _iter_generator(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indicies to include + valrange = spec + + if len(specs)-1 == i: + # base case + for v in valrange: + yield v + else: + for base in valrange: + base *= step_size + for v in _iter_generator(specs,i+1): + yield base + v 
+ # validate + + return _iter_vectorize(specs) From 30eb6dbbbe75cbbad36bd0d18bbb9563139bc009 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 4 Feb 2014 10:56:13 -0500 Subject: [PATCH 2/9] CLN: move indexing loc changes to index.py --- pandas/core/index.py | 173 ++++++++++++++++++++++++++++++++++++++ pandas/core/indexing.py | 180 +--------------------------------------- 2 files changed, 175 insertions(+), 178 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 5a02c0445c006..82491ab99b312 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3249,6 +3249,179 @@ def _get_level_indexer(self, key, level=0): j = labels.searchsorted(loc, side='right') return slice(i, j) + def get_locs(self, tup): + """Convert a tuple of slices/label lists/labels to a level-wise spec + + Parameters + ---------- + self: a sufficiently lexsorted, unique/non-dupe MultIindex. + tup: a tuple of slices, labels or lists of labels. + slice(None) is acceptable, and the case of len(tup)>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + >>> mi.get_specs((slice('A0','A2'),['B0', 'B1'])) + [(0, 2), [0, 1]] + + read as: + - All labels in position [0,1) in first level + - for each of those, all labels at positions 0 or 1. + + The same effective result can be achieved by specifying the None Slice, + or omitting it completely. Note the tuple (0,2) has replaced the list [0 1], + but the outcome is the same. 
+ + >>> mi.get_locs((slice('A0','A2'),slice(None))) + [(0, 2), (0,2)] + + >>> mi.get_locs((slice('A0','A2'),)) + [(0, 2), (0,2)] + + """ + + ranges = [] + + # self must be lexsorted to at least as many levels + # as there are elements in `tup` + assert self.is_lexsorted_for_tuple(tup) + assert self.is_unique + assert isinstance(self,MultiIndex) + + for i,k in enumerate(tup): + level = self.levels[i] + + if com.is_list_like(k): + # a collection of labels to include from this level + ranges.append([level.get_loc(x) for x in k]) + continue + if k == slice(None): + start = 0 + stop = len(level) + elif isinstance(k,slice): + start = level.get_loc(k.start) + stop = len(level) + if k.stop: + stop = level.get_loc(k.stop) + else: + # a single label + start = level.get_loc(k) + stop = start + + ranges.append((start,stop)) + + for i in range(i+1,len(self.levels)): + # omitting trailing dims + # means include all values + level = self.levels[i] + start = 0 + stop = len(level) + ranges.append((start,stop)) + + return ranges + + def locs_to_indexer(self, specs): + """ Take a location specification to an indexer + + Parameters + ---------- + self: a sufficiently lexsorted, unique/non-dupe MultIindex. + specs: a list of 2-tuples/list of label positions. Specifically, The + output of _tuple_to_mi_locs. + len(specs) must matc ix.nlevels. + + Returns + ------- + a generator of row positions relative to ix, corresponding to specs. + Suitable for usage with `iloc`. 
+ + Example (This is *not* a doctest): + >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + + >>> locs = mi.get_locs((slice('A0','A2'),['B0', 'B1'])) + >>> list(mi.locs_to_indexer(locs)) + [0, 1, 2, 3] + + Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 + and 'B0' or 'B1' at level = 0 + + """ + assert self.is_lexsorted_for_tuple(specs) + assert len(specs) == self.nlevels + assert self.is_unique + assert isinstance(self,MultiIndex) + + # step size/increment for iteration at each level + giant_steps = np.cumprod(self.levshape[::-1])[::-1] + giant_steps[:-1] = giant_steps[1:] + giant_steps[-1] = 1 + + def _iter_vectorize(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indicies to include + valrange = spec + + if len(specs)-1 == i: + return np.array(valrange) + else: + tmpl = np.array([v for v in _iter_vectorize(specs,i+1)]) + res=np.tile(tmpl,(len(valrange),1)) + steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) + return (res+steps).flatten() + + + def _iter_generator(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indicies to include + valrange = spec + + if len(specs)-1 == i: + # base case + for v in valrange: + yield v + else: + for base in valrange: + base *= step_size + for v in _iter_generator(specs,i+1): + yield base + v + # validate + + return _iter_vectorize(specs) + def truncate(self, before=None, after=None): """ Slice index between two labels / tuples, return new MultiIndex diff --git 
a/pandas/core/indexing.py b/pandas/core/indexing.py index d7994a015d801..efe39104553b3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1181,11 +1181,8 @@ def _getitem_axis(self, key, axis=0): return self._getitem_iterable(key, axis=axis) elif isinstance(key, tuple) and isinstance(labels, MultiIndex) and \ any([isinstance(x,slice) for x in key]): - # handle per-axis tuple containting label criteria for - # each level (or a prefix of levels), may contain - # (None) slices, list of labels or labels - specs = _tuple_to_mi_locs(labels,key) - g = _spec_to_array_indices(labels, specs) + locs = labels.get_locs(key) + g = labels.locs_to_indexer(locs) return self.obj.iloc[g] else: self._has_valid_type(key, axis) @@ -1571,176 +1568,3 @@ def _maybe_droplevels(index, key): return index -def _tuple_to_mi_locs(ix,tup): - """Convert a tuple of slices/label lists/labels to a level-wise spec - - Parameters - ---------- - ix: a sufficiently lexsorted, unique/non-dupe MultIindex. - tup: a tuple of slices, labels or lists of labels. - slice(None) is acceptable, and the case of len(tup)>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) - >>> for x in mi.get_values(): print(x) - ('A0', 'B0') - ('A0', 'B1') - ('A1', 'B0') - ('A1', 'B1') - ('A2', 'B0') - ('A2', 'B1') - >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) - [(0, 2), [0, 1]] - - read as: - - All labels in position [0,1) in first level - - for each of those, all labels at positions 0 or 1. - - The same effective result can be achieved by specifying the None Slice, - or omitting it completely. Note the tuple (0,2) has replaced the list [0 1], - but the outcome is the same. 
- - >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),slice(None))) - [(0, 2), (0,2)] - - >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),)) - [(0, 2), (0,2)] - - """ - - - ranges = [] - - # ix must be lexsorted to at least as many levels - # as there are elements in `tup` - assert ix.is_lexsorted_for_tuple(tup) - assert ix.is_unique - assert isinstance(ix,MultiIndex) - - for i,k in enumerate(tup): - level = ix.levels[i] - - if _is_list_like(k): - # a collection of labels to include from this level - ranges.append([level.get_loc(x) for x in k]) - continue - if k == slice(None): - start = 0 - stop = len(level) - elif isinstance(k,slice): - start = level.get_loc(k.start) - stop = len(level) - if k.stop: - stop = level.get_loc(k.stop) - else: - # a single label - start = level.get_loc(k) - stop = start - - ranges.append((start,stop)) - - for i in range(i+1,len(ix.levels)): - # omitting trailing dims - # means include all values - level = ix.levels[i] - start = 0 - stop = len(level) - ranges.append((start,stop)) - - return ranges - -def _spec_to_array_indices(ix, specs): - """Convert a tuple of slices/label lists/labels to a level-wise spec - - Parameters - ---------- - ix: a sufficiently lexsorted, unique/non-dupe MultIindex. - specs: a list of 2-tuples/list of label positions. Specifically, The - output of _tuple_to_mi_locs. - len(specs) must matc ix.nlevels. - - Returns - ------- - a generator of row positions relative to ix, corresponding to specs. - Suitable for usage with `iloc`. 
- - Example (This is *not* a doctest): - >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) - >>> for x in mi.get_values(): print(x) - ('A0', 'B0') - ('A0', 'B1') - ('A1', 'B0') - ('A1', 'B1') - ('A2', 'B0') - ('A2', 'B1') - - >>> specs = _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) - >>> list(_spec_to_array_indices(mi, specs)) - [0, 1, 2, 3] - - Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 - and 'B0' or 'B1' at level = 0 - - """ - assert ix.is_lexsorted_for_tuple(specs) - assert len(specs) == ix.nlevels - assert ix.is_unique - assert isinstance(ix,MultiIndex) - - # step size/increment for iteration at each level - giant_steps = np.cumprod(ix.levshape[::-1])[::-1] - giant_steps[:-1] = giant_steps[1:] - giant_steps[-1] = 1 - - def _iter_vectorize(specs, i=0): - step_size = giant_steps[i] - spec=specs[i] - if isinstance(spec,tuple): - # tuples are 2-tuples of (start,stop) label indices to include - valrange = compat.range(*spec) - elif isinstance(spec,list): - # lists are discrete label indicies to include - valrange = spec - - if len(specs)-1 == i: - return np.array(valrange) - else: - tmpl = np.array([v for v in _iter_vectorize(specs,i+1)]) - res=np.tile(tmpl,(len(valrange),1)) - steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) - return (res+steps).flatten() - - - def _iter_generator(specs, i=0): - step_size = giant_steps[i] - spec=specs[i] - if isinstance(spec,tuple): - # tuples are 2-tuples of (start,stop) label indices to include - valrange = compat.range(*spec) - elif isinstance(spec,list): - # lists are discrete label indicies to include - valrange = spec - - if len(specs)-1 == i: - # base case - for v in valrange: - yield v - else: - for base in valrange: - base *= step_size - for v in _iter_generator(specs,i+1): - yield base + v - # validate - - return _iter_vectorize(specs) From bd2e2a1e519dbd7a150d903144ec424f49ef45cf Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 4 Feb 2014 20:29:06 
-0500 Subject: [PATCH 3/9] TST: tests for per_axis_per_level_getitem ENH: add core/indexing.py/_getitem_nested_tuple to handle the nested_tuple cases for partial multi-indexing --- pandas/core/index.py | 39 +++--------- pandas/core/indexing.py | 116 ++++++++++++++++++++++++---------- pandas/tests/test_indexing.py | 56 ++++++++++++++++ 3 files changed, 148 insertions(+), 63 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 82491ab99b312..a6f0627b50c21 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3249,7 +3249,7 @@ def _get_level_indexer(self, key, level=0): j = labels.searchsorted(loc, side='right') return slice(i, j) - def get_locs(self, tup): + def get_specs(self, tup): """Convert a tuple of slices/label lists/labels to a level-wise spec Parameters @@ -3322,8 +3322,9 @@ def get_locs(self, tup): stop = level.get_loc(k.stop) else: # a single label - start = level.get_loc(k) - stop = start + # make this into a list of a tuple + ranges.append([level.get_loc(k)]) + continue ranges.append((start,stop)) @@ -3337,14 +3338,14 @@ def get_locs(self, tup): return ranges - def locs_to_indexer(self, specs): + def specs_to_indexer(self, specs): """ Take a location specification to an indexer Parameters ---------- self: a sufficiently lexsorted, unique/non-dupe MultIindex. specs: a list of 2-tuples/list of label positions. Specifically, The - output of _tuple_to_mi_locs. + output of get_specs len(specs) must matc ix.nlevels. 
Returns @@ -3362,8 +3363,8 @@ def locs_to_indexer(self, specs): ('A2', 'B0') ('A2', 'B1') - >>> locs = mi.get_locs((slice('A0','A2'),['B0', 'B1'])) - >>> list(mi.locs_to_indexer(locs)) + >>> locs = mi.get_specs((slice('A0','A2'),['B0', 'B1'])) + >>> list(mi.specs_to_indexer(locs)) [0, 1, 2, 3] Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 @@ -3393,33 +3394,11 @@ def _iter_vectorize(specs, i=0): if len(specs)-1 == i: return np.array(valrange) else: - tmpl = np.array([v for v in _iter_vectorize(specs,i+1)]) + tmpl=np.array([v for v in _iter_vectorize(specs,i+1)]) res=np.tile(tmpl,(len(valrange),1)) steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) return (res+steps).flatten() - - def _iter_generator(specs, i=0): - step_size = giant_steps[i] - spec=specs[i] - if isinstance(spec,tuple): - # tuples are 2-tuples of (start,stop) label indices to include - valrange = compat.range(*spec) - elif isinstance(spec,list): - # lists are discrete label indicies to include - valrange = spec - - if len(specs)-1 == i: - # base case - for v in valrange: - yield v - else: - for base in valrange: - base *= step_size - for v in _iter_generator(specs,i+1): - yield base + v - # validate - return _iter_vectorize(specs) def truncate(self, before=None, after=None): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index efe39104553b3..7233e53396e4d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -693,35 +693,39 @@ def _convert_for_reindex(self, key, axis=0): return keyarr - def _getitem_lowerdim(self, tup): + def _handle_lowerdim_multi_index_axis0(self, tup): + # we have an axis0 multi-index, handle or raise - ax0 = self.obj._get_axis(0) - # a bit kludgy - if isinstance(ax0, MultiIndex): - try: - # fast path for series or for tup devoid of slices - return self._get_label(tup, axis=0) - except TypeError: - # slices are unhashable - pass - except Exception as e1: - if isinstance(tup[0], (slice, Index)): - raise 
IndexingError("Handle elsewhere") + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=0) + except TypeError: + # slices are unhashable + pass + except Exception as e1: + if isinstance(tup[0], (slice, Index)): + raise IndexingError("Handle elsewhere") - # raise the error if we are not sorted - if not ax0.is_lexsorted_for_tuple(tup): - raise e1 + # raise the error if we are not sorted + ax0 = self.obj._get_axis(0) + if not ax0.is_lexsorted_for_tuple(tup): + raise e1 - # GH911 introduced this clause, but the regression test - # added for it now passes even without it. Let's rock the boat. - # 2014/01/27 + return None - # # should we abort, or keep going? - # try: - # loc = ax0.get_loc(tup[0]) - # except KeyError: - # raise e1 + def _getitem_lowerdim(self, tup): + # we may have a nested tuples indexer here + if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): + if any([ _is_nested_tuple(tup,ax) for ax in self.obj.axes ]): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here + ax0 = self.obj._get_axis(0) + if isinstance(ax0, MultiIndex): + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result if len(tup) > self.obj.ndim: raise IndexingError("Too many indexers. 
handle elsewhere") @@ -760,7 +764,31 @@ def _getitem_lowerdim(self, tup): raise IndexingError('not applicable') - def _getitem_axis(self, key, axis=0): + def _getitem_nested_tuple(self, tup): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionaility here + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + return self._handle_lowerdim_multi_index_axis0(tup) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + axis = 0 + for key in tup: + + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis, validate_iterable=True) + axis += 1 + + if obj.ndim < self.ndim: + axis -= 1 + + return obj + + def _getitem_axis(self, key, axis=0, validate_iterable=False): self._has_valid_type(key, axis) labels = self.obj._get_axis(axis) @@ -1058,7 +1086,7 @@ def __getitem__(self, key): else: return self._getitem_axis(key, axis=0) - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): raise NotImplementedError() def _getbool_axis(self, key, axis=0): @@ -1135,6 +1163,7 @@ def _has_valid_type(self, key, axis): # require all elements in the index idx = _ensure_index(key) if not idx.isin(ax).all(): + raise KeyError("[%s] are not in ALL in the [%s]" % (key, self.obj._get_axis_name(axis))) @@ -1164,7 +1193,7 @@ def error(): return True - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): labels = self.obj._get_axis(axis) if isinstance(key, slice): @@ -1178,12 +1207,15 @@ def _getitem_axis(self, key, axis=0): if hasattr(key, 'ndim') and key.ndim > 1: raise ValueError('Cannot index with multidimensional key') + if validate_iterable: + self._has_valid_type(key, axis) return self._getitem_iterable(key, axis=axis) - elif isinstance(key, 
tuple) and isinstance(labels, MultiIndex) and \ - any([isinstance(x,slice) for x in key]): - locs = labels.get_locs(key) - g = labels.locs_to_indexer(locs) - return self.obj.iloc[g] + elif _is_nested_tuple(key, labels): + specs = labels.get_specs(key) + g = labels.specs_to_indexer(specs) + indexer = [ slice(None) ] * self.ndim + indexer[axis] = g + return self.obj.iloc[tuple(indexer)] else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis) @@ -1256,7 +1288,7 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(slice_obj, axis=axis, convert=False) - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): if isinstance(key, slice): self._has_valid_type(key, axis) @@ -1515,6 +1547,24 @@ def _maybe_convert_ix(*args): return args +def _is_nested_tuple(tup, labels): + # check for a compatiable nested tuple and multiindexes among the axes + + if not isinstance(tup, tuple): + return False + + # are we nested tuple of: tuple,list,slice + for i, k in enumerate(tup): + + #if i > len(axes): + # raise IndexingError("invalid indxing tuple passed, has too many indexers for this object") + #ax = axes[i] + if isinstance(k, (tuple, list, slice)): + return isinstance(labels, MultiIndex) + + return False + + def _is_null_slice(obj): return (isinstance(obj, slice) and obj.start is None and obj.stop is None and obj.step is None) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 52de461f0281b..149960dd7f1c4 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1062,6 +1062,62 @@ def test_xs_multiindex(self): expected.columns = expected.columns.droplevel('lvl1') assert_frame_equal(result, expected) + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + ix = MultiIndex.from_product([mklbl('A',5),mklbl('B',7),mklbl('C',4),mklbl('D',2)]) + df = 
DataFrame(np.arange(len(ix.get_values())),index=ix) + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if (a == 'A1' or a == 'A2') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','hello'),('b','world')], + names=['lvl0', 'lvl1']) + + df = DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns) + df = df.sortlevel(axis=0).sortlevel(axis=1) + + # identity + result = df.loc[(slice(None),slice(None)),:] + assert_frame_equal(result, df) + result = df.loc[(slice(None),slice(None)),(slice(None),slice(None))] + assert_frame_equal(result, df) + result = df.loc[:,(slice(None),slice(None))] + assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None),[1]),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),1),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + # columns + result = df.loc[:,(slice(None),['world'])] + expected = df.iloc[:,[3]] + assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None),1),(slice(None),['world'])] + expected = df.iloc[[0,3],[3]] + assert_frame_equal(result, expected) + + # ambiguous cases + # these can be multiply interpreted + # but we can catch this in some cases + def f(): + df.loc[(slice(None),[1])] + self.assertRaises(KeyError, f) + def test_getitem_multiindex(self): # GH 5725 From 73202630a211263684ac9ee39f672c1535ebeb04 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Feb 2014 16:12:11 -0500 Subject: [PATCH 4/9] ENH: allow core/index/_get_loc_level to deal with a slice indexer for a particular level ENH: remove get_specs/specs_to_index -> replace with get_locs, to directly compute an indexer 
for a multi-level specification --- pandas/core/index.py | 175 +++++++++------------------------- pandas/core/indexing.py | 9 +- pandas/tests/test_indexing.py | 20 +++- 3 files changed, 64 insertions(+), 140 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index a6f0627b50c21..9e90af3f7f6aa 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,7 +1,7 @@ # pylint: disable=E1101,E1103,W0232 import datetime from functools import partial -from pandas.compat import range, zip, lrange, lzip, u +from pandas.compat import range, zip, lrange, lzip, u, reduce from pandas import compat import numpy as np @@ -3231,6 +3231,13 @@ def partial_selection(key): if key[i] != slice(None, None)] return indexer, _maybe_drop_levels(indexer, ilevels, drop_level) + elif isinstance(key, slice): + # handle a passed slice for this level + start = self._get_level_indexer(key.start,level=level) + stop = self._get_level_indexer(key.stop,level=level) + step = key.step + indexer = slice(start.start,stop.start,step) + return indexer, _maybe_drop_levels(indexer, [level], drop_level) else: indexer = self._get_level_indexer(key, level=level) new_index = _maybe_drop_levels(indexer, [level], drop_level) @@ -3249,157 +3256,61 @@ def _get_level_indexer(self, key, level=0): j = labels.searchsorted(loc, side='right') return slice(i, j) - def get_specs(self, tup): - """Convert a tuple of slices/label lists/labels to a level-wise spec + def get_locs(self, tup): + """ + Given a tuple of slices/lists/labels to a level-wise spec + produce an indexer to extract those locations Parameters ---------- - self: a sufficiently lexsorted, unique/non-dupe MultIindex. - tup: a tuple of slices, labels or lists of labels. 
- slice(None) is acceptable, and the case of len(tup)>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) - >>> for x in mi.get_values(): print(x) - ('A0', 'B0') - ('A0', 'B1') - ('A1', 'B0') - ('A1', 'B1') - ('A2', 'B0') - ('A2', 'B1') - >>> mi.get_specs((slice('A0','A2'),['B0', 'B1'])) - [(0, 2), [0, 1]] - - read as: - - All labels in position [0,1) in first level - - for each of those, all labels at positions 0 or 1. - - The same effective result can be achieved by specifying the None Slice, - or omitting it completely. Note the tuple (0,2) has replaced the list [0 1], - but the outcome is the same. - - >>> mi.get_locs((slice('A0','A2'),slice(None))) - [(0, 2), (0,2)] - - >>> mi.get_locs((slice('A0','A2'),)) - [(0, 2), (0,2)] - + locs : integer list of locations or boolean indexer suitable + for passing to iloc """ - ranges = [] - - # self must be lexsorted to at least as many levels - # as there are elements in `tup` + # must be lexsorted to at least as many levels assert self.is_lexsorted_for_tuple(tup) assert self.is_unique - assert isinstance(self,MultiIndex) + def _convert_indexer(r): + if isinstance(r, slice): + m = np.zeros(len(self),dtype=bool) + m[r] = True + return m + return r + + ranges = [] for i,k in enumerate(tup): - level = self.levels[i] if com.is_list_like(k): - # a collection of labels to include from this level - ranges.append([level.get_loc(x) for x in k]) - continue - if k == slice(None): - start = 0 - stop = len(level) + # a collection of labels to include from this level (these are or'd) + ranges.append(reduce( + np.logical_or,[ _convert_indexer(self._get_level_indexer(x, level=i) + ) for x in k ])) + elif k == slice(None): + # include all from this level + pass elif isinstance(k,slice): - start = level.get_loc(k.start) - stop = len(level) - if k.stop: - stop = level.get_loc(k.stop) + start = self._get_level_indexer(k.start,level=i) + stop = self._get_level_indexer(k.stop,level=i) + step = k.step + 
ranges.append(slice(start.start,stop.start,step)) else: # a single label - # make this into a list of a tuple - ranges.append([level.get_loc(k)]) - continue - - ranges.append((start,stop)) + ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0]) - for i in range(i+1,len(self.levels)): - # omitting trailing dims - # means include all values - level = self.levels[i] - start = 0 - stop = len(level) - ranges.append((start,stop)) + # identity + if len(ranges) == 0: + return slice(0,len(self)) - return ranges - - def specs_to_indexer(self, specs): - """ Take a location specification to an indexer - - Parameters - ---------- - self: a sufficiently lexsorted, unique/non-dupe MultIindex. - specs: a list of 2-tuples/list of label positions. Specifically, The - output of get_specs - len(specs) must matc ix.nlevels. - - Returns - ------- - a generator of row positions relative to ix, corresponding to specs. - Suitable for usage with `iloc`. - - Example (This is *not* a doctest): - >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) - >>> for x in mi.get_values(): print(x) - ('A0', 'B0') - ('A0', 'B1') - ('A1', 'B0') - ('A1', 'B1') - ('A2', 'B0') - ('A2', 'B1') - - >>> locs = mi.get_specs((slice('A0','A2'),['B0', 'B1'])) - >>> list(mi.specs_to_indexer(locs)) - [0, 1, 2, 3] - - Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 - and 'B0' or 'B1' at level = 0 - - """ - assert self.is_lexsorted_for_tuple(specs) - assert len(specs) == self.nlevels - assert self.is_unique - assert isinstance(self,MultiIndex) - - # step size/increment for iteration at each level - giant_steps = np.cumprod(self.levshape[::-1])[::-1] - giant_steps[:-1] = giant_steps[1:] - giant_steps[-1] = 1 - - def _iter_vectorize(specs, i=0): - step_size = giant_steps[i] - spec=specs[i] - if isinstance(spec,tuple): - # tuples are 2-tuples of (start,stop) label indices to include - valrange = compat.range(*spec) - elif isinstance(spec,list): - # lists are discrete 
label indicies to include - valrange = spec - - if len(specs)-1 == i: - return np.array(valrange) - else: - tmpl=np.array([v for v in _iter_vectorize(specs,i+1)]) - res=np.tile(tmpl,(len(valrange),1)) - steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) - return (res+steps).flatten() + elif len(ranges) == 1: + return ranges[0] - return _iter_vectorize(specs) + # construct a boolean indexer if we have a slice or boolean indexer + return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ]) def truncate(self, before=None, after=None): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7233e53396e4d..47ccbe8a184c8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -780,6 +780,10 @@ def _getitem_nested_tuple(self, tup): axis = 0 for key in tup: + if _is_null_slice(key): + axis += 1 + continue + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis, validate_iterable=True) axis += 1 @@ -1211,10 +1215,9 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False): self._has_valid_type(key, axis) return self._getitem_iterable(key, axis=axis) elif _is_nested_tuple(key, labels): - specs = labels.get_specs(key) - g = labels.specs_to_indexer(specs) + locs = labels.get_locs(key) indexer = [ slice(None) ] * self.ndim - indexer[axis] = g + indexer[axis] = locs return self.obj.iloc[tuple(indexer)] else: self._has_valid_type(key, axis) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 149960dd7f1c4..98dcb5600936a 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1078,7 +1078,7 @@ def mklbl(prefix,n): # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], names=['one','two']) - columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','hello'),('b','world')], + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], names=['lvl0', 'lvl1']) df 
= DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns) @@ -1102,13 +1102,23 @@ def mklbl(prefix,n): assert_frame_equal(result, expected) # columns - result = df.loc[:,(slice(None),['world'])] - expected = df.iloc[:,[3]] + result = df.loc[:,(slice(None),['foo'])] + expected = df.iloc[:,[1,3]] assert_frame_equal(result, expected) # both - result = df.loc[(slice(None),1),(slice(None),['world'])] - expected = df.iloc[[0,3],[3]] + result = df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df.iloc[[0,3],[1,3]] + assert_frame_equal(result, expected) + + result = df.loc['A','a'] + expected = DataFrame(dict(bar = [1,5,9], foo = [0,4,8]), + index=Index([1,2,3],name='two'), + columns=Index(['bar','foo'],name='lvl1')) + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),[1,2]),:] + expected = df.iloc[[0,1,3]] assert_frame_equal(result, expected) # ambiguous cases From de84842c11c148ac9c855a07abcb04a9f96a1f3b Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Feb 2014 17:04:06 -0500 Subject: [PATCH 5/9] ENH: allow per_axis, per_level multiindex setting TST: better error messages when levels are not sorted with core/index/get_locs ENH: add boolean indexer support on per_axis/per_level BUG: handle a multi-level indexed series passed like with a nested tuple of selectors e.g. 
something like: s.loc['A1':'A3',:,['C1','C3']] --- pandas/core/index.py | 74 +++++++++++----- pandas/core/indexing.py | 20 ++++- pandas/tests/test_indexing.py | 159 +++++++++++++++++++++++++++++++++- 3 files changed, 224 insertions(+), 29 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 9e90af3f7f6aa..1f4ee5246a04a 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3231,34 +3231,52 @@ def partial_selection(key): if key[i] != slice(None, None)] return indexer, _maybe_drop_levels(indexer, ilevels, drop_level) - elif isinstance(key, slice): - # handle a passed slice for this level - start = self._get_level_indexer(key.start,level=level) - stop = self._get_level_indexer(key.stop,level=level) - step = key.step - indexer = slice(start.start,stop.start,step) - return indexer, _maybe_drop_levels(indexer, [level], drop_level) else: indexer = self._get_level_indexer(key, level=level) - new_index = _maybe_drop_levels(indexer, [level], drop_level) - return indexer, new_index + return indexer, _maybe_drop_levels(indexer, [level], drop_level) def _get_level_indexer(self, key, level=0): + # return a boolean indexer or a slice showing where the key is + # in the totality of values + level_index = self.levels[level] - loc = level_index.get_loc(key) labels = self.labels[level] - if level > 0 or self.lexsort_depth == 0: - return np.array(labels == loc,dtype=bool) + if isinstance(key, slice): + # handle a slice, returning a slice if we can + # otherwise a boolean indexer + + start = level_index.get_loc(key.start) + stop = level_index.get_loc(key.stop) + step = key.step + + if level > 0 or self.lexsort_depth == 0: + # need to have like semantics here to right + # searching as when we are using a slice + # so include the stop+1 (so we include stop) + m = np.zeros(len(labels),dtype=bool) + m[np.in1d(labels,np.arange(start,stop+1,step))] = True + return m + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(start, 
side='left') + j = labels.searchsorted(stop, side='right') + return slice(i, j, step) + else: - # sorted, so can return slice object -> view - i = labels.searchsorted(loc, side='left') - j = labels.searchsorted(loc, side='right') - return slice(i, j) + + loc = level_index.get_loc(key) + if level > 0 or self.lexsort_depth == 0: + return np.array(labels == loc,dtype=bool) + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + return slice(i, j) def get_locs(self, tup): """ - Given a tuple of slices/lists/labels to a level-wise spec + Given a tuple of slices/lists/labels/boolean indexer to a level-wise spec produce an indexer to extract those locations Parameters @@ -3272,8 +3290,11 @@ def get_locs(self, tup): """ # must be lexsorted to at least as many levels - assert self.is_lexsorted_for_tuple(tup) - assert self.is_unique + if not self.is_lexsorted_for_tuple(tup): + raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted' + ' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth)) + if not self.is_unique: + raise ValueError('MultiIndex Slicing requires a unique index') def _convert_indexer(r): if isinstance(r, slice): @@ -3285,7 +3306,14 @@ def _convert_indexer(r): ranges = [] for i,k in enumerate(tup): - if com.is_list_like(k): + if com._is_bool_indexer(k): + # a boolean indexer, must be the same length! 
+ k = np.asarray(k) + if len(k) != len(self): + raise ValueError("cannot index with a boolean indexer that is" + " not the same length as the index") + ranges.append(k) + elif com.is_list_like(k): # a collection of labels to include from this level (these are or'd) ranges.append(reduce( np.logical_or,[ _convert_indexer(self._get_level_indexer(x, level=i) @@ -3294,10 +3322,8 @@ def _convert_indexer(r): # include all from this level pass elif isinstance(k,slice): - start = self._get_level_indexer(k.start,level=i) - stop = self._get_level_indexer(k.stop,level=i) - step = k.step - ranges.append(slice(start.start,stop.start,step)) + # a slice, include BOTH of the labels + ranges.append(self._get_level_indexer(k,level=i)) else: # a single label ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0]) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 47ccbe8a184c8..38c857e24dacf 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -99,6 +99,7 @@ def _slice(self, obj, axis=0, raise_on_error=False, typ=None): typ=typ) def __setitem__(self, key, value): + # kludgetastic ax = self.obj._get_axis(0) if isinstance(ax, MultiIndex): @@ -131,6 +132,11 @@ def _has_valid_tuple(self, key): raise ValueError("Location based indexing can only have [%s] " "types" % self._valid_types) + def _is_nested_tuple_indexer(self, tup): + if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): + return any([ _is_nested_tuple(tup,ax) for ax in self.obj.axes ]) + return False + def _convert_tuple(self, key, is_setter=False): keyidx = [] for i, k in enumerate(key): @@ -716,9 +722,8 @@ def _handle_lowerdim_multi_index_axis0(self, tup): def _getitem_lowerdim(self, tup): # we may have a nested tuples indexer here - if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): - if any([ _is_nested_tuple(tup,ax) for ax in self.obj.axes ]): - return self._getitem_nested_tuple(tup) + if self._is_nested_tuple_indexer(tup): + return 
self._getitem_nested_tuple(tup) # we maybe be using a tuple to represent multiple dimensions here ax0 = self.obj._get_axis(0) @@ -772,7 +777,12 @@ def _getitem_nested_tuple(self, tup): # multi-index dimension, try to see if we have something like # a tuple passed to a series with a multi-index if len(tup) > self.ndim: - return self._handle_lowerdim_multi_index_axis0(tup) + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + # this is a series with a multi-index specified a tuple of selectors + return self._getitem_axis(tup, axis=0, validate_iterable=True) # handle the multi-axis by taking sections and reducing # this is iterative @@ -983,6 +993,8 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): if isinstance(obj, slice): return self._convert_slice_indexer(obj, axis) + elif _is_nested_tuple(obj, labels): + return labels.get_locs(obj) elif _is_list_like(obj): if com._is_bool_indexer(obj): obj = _check_bool_indexer(labels, obj) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 98dcb5600936a..2a8723292b2ec 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1072,7 +1072,13 @@ def mklbl(prefix,n): ix = MultiIndex.from_product([mklbl('A',5),mklbl('B',7),mklbl('C',4),mklbl('D',2)]) df = DataFrame(np.arange(len(ix.get_values())),index=ix) result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] - expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if (a == 'A1' or a == 'A2') and (c == 'C1' or c == 'C3')]] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C2' or c == 'C3')]] + result = df.loc[(slice('A1','A3'),slice(None), slice('C1','C3')),:] 
assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls @@ -1121,6 +1127,22 @@ def mklbl(prefix,n): expected = df.iloc[[0,1,3]] assert_frame_equal(result, expected) + # multi-level series + s = Series(np.arange(len(ix.get_values())),index=ix) + result = s.loc['A1':'A3', :, ['C1','C3']] + expected = s.loc[[ tuple([a,b,c,d]) for a,b,c,d in s.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None),df.loc[:,('a','bar')]>5),:] + expected = df.iloc[[2,3]] + assert_frame_equal(result, expected) + + def f(): + df.loc[(slice(None),np.array([True,False])),:] + self.assertRaises(ValueError, f) + # ambiguous cases # these can be multiply interpreted # but we can catch this in some cases @@ -1128,6 +1150,141 @@ def f(): df.loc[(slice(None),[1])] self.assertRaises(KeyError, f) + def test_per_axis_per_level_getitem_doc_examples(self): + + # from indexing.rst / advanced + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + index = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns) + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + # not sorted + def f(): + df.loc['A1',(slice(None),'foo')] + self.assertRaises(KeyError, f) + 
df = df.sortlevel(axis=1) + + df.loc['A1',(slice(None),'foo')] + df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + + def test_per_axis_per_level_setitem(self): + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + + df_orig = DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns) + df_orig = df_orig.sortlevel(axis=0).sortlevel(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None),slice(None)),:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),slice(None)),(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:,(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None),[1]),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:,(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[:,[1,3]] = 100 + assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc['A','a'] = 100 + expected = df_orig.copy() + expected.iloc[0:3,0:2] = 100 + assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + 
df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100, 100], [100, 100]],dtype='int64') + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100], [100, 100]],dtype='int64') + self.assertRaises(ValueError, f) + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([100, 100, 100, 100],dtype='int64') + self.assertRaises(ValueError, f) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = df.loc[(slice(None),1),(slice(None),['foo'])] * 5 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = expected.iloc[[0,3],[1,3]] * 5 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None),1),(slice(None),['foo'])].copy() + rhs.loc[:,('c','bah')] = 10 + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= rhs + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + def test_getitem_multiindex(self): # GH 5725 From 65a99767097cdef6f0d276e4659a1f9c7145ac58 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Feb 2014 18:23:21 -0500 Subject: [PATCH 6/9] DOC: v0.14.0 and indexing doc updates for mi slicing DOC: release notes and issues for mi_slicing --- doc/source/indexing.rst | 141 ++++++++++++++++++++++++++++++++-------- doc/source/release.rst | 1 + doc/source/v0.14.0.txt | 86 ++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 26 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d65c1519fe869..dff2fb51770b9 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst 
@@ -426,14 +426,14 @@ python/numpy allow slicing past the end of an array without an associated error. values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned) - .. ipython:: python +.. ipython:: python - df = DataFrame(np.random.randn(5,2),columns=list('AB')) - df - df.iloc[[4,5,6]] - df.iloc[4:6] - df.iloc[:,2:3] - df.iloc[:,1:3] + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl + dfl.iloc[[4,5,6]] + dfl.iloc[4:6] + dfl.iloc[:,2:3] + dfl.iloc[:,1:3] .. _indexing.basics.partial_setting: @@ -1684,7 +1684,7 @@ of tuples: Advanced indexing with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Syntactically integrating ``MultiIndex`` in advanced indexing with ``.ix`` is a +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a bit challenging, but we've made every effort to do so. for example the following works as you would expect: @@ -1692,22 +1692,21 @@ following works as you would expect: df = df.T df - df.ix['bar'] - df.ix['bar', 'two'] + df.loc['bar'] + df.loc['bar', 'two'] -"Partial" slicing also works quite nicely for the topmost level: +"Partial" slicing also works quite nicely. .. ipython:: python - df.ix['baz':'foo'] + df.loc['baz':'foo'] -But lower levels cannot be sliced in this way, because the MultiIndex uses -its multiple index dimensions to slice along one dimension of your object: +You can slice with a 'range' of values, by providing a slice of tuples. .. 
ipython:: python - df.ix[('baz', 'two'):('qux', 'one')] - df.ix[('baz', 'two'):'foo'] + df.loc[('baz', 'two'):('qux', 'one')] + df.loc[('baz', 'two'):'foo'] Passing a list of labels or tuples works similar to reindexing: @@ -1715,16 +1714,92 @@ Passing a list of labels or tuples works similar to reindexing: df.ix[[('bar', 'two'), ('qux', 'one')]] -The following does not work, and it's not clear if it should or not: +.. _indexing.mi_slicers: -:: +Multiindexing using slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. Their are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MuliIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....,:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. 
ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + miindex = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) + dfmi + +.. ipython:: python + + dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + dfmi.loc[(slice(None),slice(None), ['C1','C3']),:] - >>> df.ix[['bar', 'qux']] +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. -The code for implementing ``.ix`` makes every attempt to "do the right thing" -but as you use it you may uncover corner cases or unintuitive behavior. If you -do find something like this, do not hesitate to report the issue or ask on the -mailing list. +.. ipython:: python + + dfmi.loc['A1',(slice(None),'foo')] + dfmi.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + dfmi.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc[(slice(None),slice(None), ['C1','C3']),:] = -10 + df2 + +You use a right-hand-side of an alignable object as well. + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc[(slice(None),slice(None), ['C1','C3']),:] = df2*1000 + df2 .. _indexing.xs: @@ -1738,6 +1813,11 @@ selecting data at a particular level of a MultiIndex easier. df.xs('one', level='second') +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[(slice(None),'one'),:] + You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by providing the axis argument @@ -1746,29 +1826,38 @@ providing the axis argument df = df.T df.xs('one', level='second', axis=1) +.. 
ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,(slice(None),'one')] + :meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys .. ipython:: python df.xs(('one', 'bar'), level=('second', 'first'), axis=1) +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,('bar','one')] .. versionadded:: 0.13.0 You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain the level that was selected -.. ipython:: +.. ipython:: python df.xs('one', level='second', axis=1, drop_level=False) versus the result with ``drop_level=True`` (the default value) -.. ipython:: +.. ipython:: python df.xs('one', level='second', axis=1, drop_level=True) -.. ipython:: +.. ipython:: python :suppress: df = df.T diff --git a/doc/source/release.rst b/doc/source/release.rst index 40913e40f485f..1e4b7756af706 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -74,6 +74,7 @@ Improvements to existing features the func (:issue:`6289`) - ``plot(legend='reverse')`` will now reverse the order of legend labels for most plot kinds. (:issue:`6014`) +- Allow multi-index slicers (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ee38fed810af0..2dbfff6943004 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -29,6 +29,92 @@ API changes df.iloc[:,2:3] df.iloc[:,1:3] +MultiIndexing Using Slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. 
+ +As usual, **both sides** of the slicers are included as this is label indexing. + +See :ref:`the docs <indexing.mi_slicers>` +See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MultiIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....,:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + index = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns).sortlevel().sortlevel(axis=1) + df + +.. ipython:: python + + df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + df.loc[(slice(None),slice(None), ['C1','C3']),:] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. ipython:: python + + df.loc['A1',(slice(None),'foo')] + df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + df.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = df.copy() + df2.loc[(slice(None),slice(None), ['C1','C3']),:] = -10 + df2 + +You use a right-hand-side of an alignable object as well. + +.. 
ipython:: python + + df2 = df.copy() + df2.loc[(slice(None),slice(None), ['C1','C3']),:] = df2*1000 + df2 + Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 1068a448574a7315a0deb8c36311d216bd7c568b Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Feb 2014 19:26:21 -0500 Subject: [PATCH 7/9] BUG: Raise a TypeError when trying to assign with a rhs of a multi-index of differeing levels (GH3738) --- doc/source/indexing.rst | 4 ++-- doc/source/release.rst | 1 + doc/source/v0.14.0.txt | 4 ++-- pandas/core/indexing.py | 9 ++++++++- pandas/tests/test_indexing.py | 22 ++++++++++++++++++++++ 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index dff2fb51770b9..2b1b238a0ad1f 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1742,13 +1742,13 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1','A3'),.....,:] + df.loc[(slice('A1','A3'),.....),:] rather than this: .. code-block:: python - df.loc[(slice('A1','A3'),.....] + df.loc[(slice('A1','A3'),.....)] .. warning:: diff --git a/doc/source/release.rst b/doc/source/release.rst index 1e4b7756af706..829a21f8033ca 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -95,6 +95,7 @@ Bug Fixes - Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) - Bug in ``DataFrame.replace()`` when passing a non- ``bool`` ``to_replace`` argument (:issue:`6332`) +- Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`) pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 2dbfff6943004..5aa21710be45f 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -56,13 +56,13 @@ See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :is .. 
code-block:: python - df.loc[(slice('A1','A3'),.....,:] + df.loc[(slice('A1','A3'),.....),:] rather than this: .. code-block:: python - df.loc[(slice('A1','A3'),.....] + df.loc[(slice('A1','A3'),.....)] .. warning:: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 38c857e24dacf..45ec528e6265d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -591,7 +591,14 @@ def _align_frame(self, indexer, df): if df.index.equals(ax): val = df.copy().values else: - val = df.reindex(ax).values + + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if isinstance(ax, MultiIndex) and isinstance( + df.index, MultiIndex) and ax.nlevels != df.index.nlevels: + raise TypeError("cannot align on a multi-index with out specifying the join levels") + + val = df.reindex(index=ax).values return val elif np.isscalar(indexer) and not is_frame: diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 2a8723292b2ec..f5d2adbb00bf6 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1285,6 +1285,28 @@ def f(): expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] assert_frame_equal(df, expected) + def test_multiindex_setitem(self): + + # GH 3738 + # setting with a multi-index right hand side + arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), + np.array(['one', 'two', 'one', 'one', 'two', 'one']), + np.arange(0, 6, 1)] + + df_orig = pd.DataFrame(np.random.randn(6, 3), + index=arrays, + columns=['A', 'B', 'C']).sort_index() + + expected = df_orig.loc[['bar']]*2 + df = df_orig.copy() + df.loc[['bar']] *= 2 + assert_frame_equal(df.loc[['bar']],expected) + + # raise because these have differing levels + def f(): + df.loc['bar'] *= 2 + self.assertRaises(TypeError, f) + def test_getitem_multiindex(self): # GH 5725 From 03284f30085e0c964cfdeb6834465a2fbae844ed Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 11 Feb 2014 20:01:17 -0500 Subject: [PATCH 8/9] API: 
add in IndexSlice indexer shortcut --- doc/source/indexing.rst | 7 +++++++ doc/source/v0.14.0.txt | 7 +++++++ pandas/core/api.py | 1 + pandas/core/indexing.py | 6 +++++- pandas/tests/test_indexing.py | 16 ++++++++++++++++ 5 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 2b1b238a0ad1f..521aebb6f6104 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1785,6 +1785,13 @@ axes at the same time. dfmi.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] dfmi.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + Furthermore you can *set* the values using these methods .. ipython:: python diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 5aa21710be45f..2dfb0eeb6fc08 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -99,6 +99,13 @@ axes at the same time. df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] df.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + Furthermore you can *set* the values using these methods .. 
ipython:: python diff --git a/pandas/core/api.py b/pandas/core/api.py index b36c9f7499df6..4d8d4dcda7589 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -19,6 +19,7 @@ WidePanel = Panel +from pandas.core.indexing import IndexSlice from pandas.tseries.offsets import DateOffset from pandas.tseries.tools import to_datetime from pandas.tseries.index import (DatetimeIndex, Timestamp, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 45ec528e6265d..e4707274eb30c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -12,7 +12,6 @@ import numpy as np - # the supported indexers def get_indexers_list(): @@ -27,6 +26,11 @@ def get_indexers_list(): # "null slice" _NS = slice(None, None) +# the public IndexSlicerMaker +class _IndexSlice(object): + def __getitem__(self, arg): + return arg +IndexSlice = _IndexSlice() class IndexingError(Exception): pass diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index f5d2adbb00bf6..e099e24e44153 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1152,6 +1152,9 @@ def f(): def test_per_axis_per_level_getitem_doc_examples(self): + # test index maker + idx = pd.IndexSlice + # from indexing.rst / advanced def mklbl(prefix,n): return ["%s%s" % (prefix,i) for i in range(n)] @@ -1170,11 +1173,15 @@ def mklbl(prefix,n): expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] assert_frame_equal(result, expected) + result = df.loc[idx['A1':'A3',:,['C1','C3']],:] + assert_frame_equal(result, expected) result = df.loc[(slice(None),slice(None), ['C1','C3']),:] expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( c == 'C1' or c == 'C3')]] assert_frame_equal(result, expected) + result = df.loc[idx[:,:,['C1','C3']],:] + assert_frame_equal(result, expected) # not sorted def f(): @@ -1187,6 +1194,9 @@ def f(): def 
test_per_axis_per_level_setitem(self): + # test index maker + idx = pd.IndexSlice + # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], names=['one','two']) @@ -1242,6 +1252,12 @@ def test_per_axis_per_level_setitem(self): expected.iloc[[0,3],[1,3]] = 100 assert_frame_equal(df, expected) + df = df_orig.copy() + df.loc[idx[:,1],idx[:,['foo']]] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + df = df_orig.copy() df.loc['A','a'] = 100 expected = df_orig.copy() From 7d707101c1987dc68d52dc18b31f0553ee861e48 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Feb 2014 19:40:03 -0500 Subject: [PATCH 9/9] ENH: make it possible to pass keyword argument to .loc ENH: allow the axis keyword to short-circuit indexing --- doc/source/indexing.rst | 32 +++++++++---- doc/source/v0.14.0.txt | 32 +++++++++---- pandas/core/indexing.py | 64 ++++++++++++++++++-------- pandas/tests/test_indexing.py | 85 ++++++++++++++++++++++++++++++----- 4 files changed, 165 insertions(+), 48 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 521aebb6f6104..afeb3fcc7764c 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1771,10 +1771,18 @@ As usual, **both sides** of the slicers are included as this is label indexing. columns=micolumns).sortlevel().sortlevel(axis=1) dfmi +Basic multi-index slicing using slices, lists, and labels. + .. ipython:: python dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] - dfmi.loc[(slice(None),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] It is possible to perform quite complicated selections using this method on multiple axes at the same time. @@ -1782,30 +1790,36 @@ axes at the same time. .. 
ipython:: python dfmi.loc['A1',(slice(None),'foo')] - dfmi.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] - dfmi.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] -You can use a ``pd.IndexSlice`` to shortcut the creation of these slices +Using a boolean indexer you can provide selection related to the *values*. .. ipython:: python - idx = pd.IndexSlice - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + mask = dfmi[('a','foo')]>200 + dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + dfmi.loc(axis=0)[:,:,['C1','C3']] Furthermore you can *set* the values using these methods .. ipython:: python df2 = dfmi.copy() - df2.loc[(slice(None),slice(None), ['C1','C3']),:] = -10 + df2.loc(axis=0)[:,:,['C1','C3']] = -10 df2 -You use a right-hand-side of an alignable object as well. +You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[(slice(None),slice(None), ['C1','C3']),:] = df2*1000 + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 df2 .. _indexing.xs: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 2dfb0eeb6fc08..7bdc101c37709 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -85,10 +85,18 @@ See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :is columns=columns).sortlevel().sortlevel(axis=1) df +Basic multi-index slicing using slices, lists, and labels. + .. ipython:: python df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] - df.loc[(slice(None),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] It is possible to perform quite complicated selections using this method on multiple axes at the same time. 
@@ -96,30 +104,36 @@ axes at the same time. .. ipython:: python df.loc['A1',(slice(None),'foo')] - df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] - df.loc[df[('a','foo')]>200,slice(None), ['C1','C3']),(slice(None),'foo')] + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] -You can use a ``pd.IndexSlice`` to shortcut the creation of these slices +Using a boolean indexer you can provide selection related to the *values*. .. ipython:: python - idx = pd.IndexSlice - df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + mask = df[('a','foo')]>200 + df.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + df.loc(axis=0)[:,:,['C1','C3']] Furthermore you can *set* the values using these methods .. ipython:: python df2 = df.copy() - df2.loc[(slice(None),slice(None), ['C1','C3']),:] = -10 + df2.loc(axis=0)[:,:,['C1','C3']] = -10 df2 -You use a right-hand-side of an alignable object as well. +You can use a right-hand-side of an alignable object as well. .. 
ipython:: python df2 = df.copy() - df2.loc[(slice(None),slice(None), ['C1','C3']),:] = df2*1000 + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 df2 Prior Version Deprecations/Changes diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e4707274eb30c..f8ce855e6bfdc 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -44,6 +44,16 @@ def __init__(self, obj, name): self.obj = obj self.ndim = obj.ndim self.name = name + self.axis = None + + def __call__(self, *args, **kwargs): + # we need to return a copy of ourselves + self = self.__class__(self.obj, self.name) + + # set the passed in values + for k, v in compat.iteritems(kwargs): + setattr(self,k,v) + return self def __iter__(self): raise NotImplementedError('ix is not iterable') @@ -104,23 +114,28 @@ def _slice(self, obj, axis=0, raise_on_error=False, typ=None): def __setitem__(self, key, value): - # kludgetastic - ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex): - try: - indexer = ax.get_loc(key) - self._setitem_with_indexer(indexer, value) - return - except Exception: - pass - - if isinstance(key, tuple): - if len(key) > self.ndim: - raise IndexingError('only tuples of length <= %d supported' % - self.ndim) + if self.axis is not None: indexer = self._convert_tuple(key, is_setter=True) + else: - indexer = self._convert_to_indexer(key, is_setter=True) + + # kludgetastic + ax = self.obj._get_axis(0) + if isinstance(ax, MultiIndex): + try: + indexer = ax.get_loc(key) + self._setitem_with_indexer(indexer, value) + return + except Exception: + pass + + if isinstance(key, tuple): + if len(key) > self.ndim: + raise IndexingError('only tuples of length <= %d supported' % + self.ndim) + indexer = self._convert_tuple(key, is_setter=True) + else: + indexer = self._convert_to_indexer(key, is_setter=True) self._setitem_with_indexer(indexer, value) @@ -143,9 +158,17 @@ def _is_nested_tuple_indexer(self, tup): def _convert_tuple(self, key, is_setter=False): keyidx = [] - for i, k in 
enumerate(key): - idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) - keyidx.append(idx) + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + for i in range(self.ndim): + if i == axis: + keyidx.append(self._convert_to_indexer(key, axis=axis, is_setter=is_setter)) + else: + keyidx.append(slice(None)) + else: + for i, k in enumerate(key): + idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) + keyidx.append(idx) return tuple(keyidx) def _convert_scalar_indexer(self, key, axis): @@ -732,6 +755,11 @@ def _handle_lowerdim_multi_index_axis0(self, tup): def _getitem_lowerdim(self, tup): + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis, validate_iterable=True) + # we may have a nested tuples indexer here if self._is_nested_tuple_indexer(tup): return self._getitem_nested_tuple(tup) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index e099e24e44153..41b28172d0d42 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -83,6 +83,9 @@ def _axify(obj, key, axis): return k +def _mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + class TestIndexing(tm.TestCase): _multiprocess_can_split_ = True @@ -1066,11 +1069,9 @@ def test_per_axis_per_level_getitem(self): # GH6134 # example test case - def mklbl(prefix,n): - return ["%s%s" % (prefix,i) for i in range(n)] - - ix = MultiIndex.from_product([mklbl('A',5),mklbl('B',7),mklbl('C',4),mklbl('D',2)]) + ix = MultiIndex.from_product([_mklbl('A',5),_mklbl('B',7),_mklbl('C',4),_mklbl('D',2)]) df = DataFrame(np.arange(len(ix.get_values())),index=ix) + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] @@ -1150,19 +1151,16 @@ def f(): 
df.loc[(slice(None),[1])] self.assertRaises(KeyError, f) - def test_per_axis_per_level_getitem_doc_examples(self): + def test_per_axis_per_level_doc_examples(self): # test index maker idx = pd.IndexSlice # from indexing.rst / advanced - def mklbl(prefix,n): - return ["%s%s" % (prefix,i) for i in range(n)] - - index = MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), ('b','foo'),('b','bah')], names=['lvl0', 'lvl1']) @@ -1189,9 +1187,60 @@ def f(): self.assertRaises(KeyError, f) df = df.sortlevel(axis=1) + # slicing df.loc['A1',(slice(None),'foo')] df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + # setitem + df.loc(axis=0)[:,:,['C1','C3']] = -10 + + def test_loc_arguments(self): + + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns).sortlevel().sortlevel(axis=1) + + + # axis 0 + result = df.loc(axis=0)['A1':'A3',:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + result = df.loc(axis='index')[:,:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + result = df.loc(axis='columns')[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + # invalid axis + 
def f(): + df.loc(axis=-1)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis=2)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis='foo')[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + def test_per_axis_per_level_setitem(self): # test index maker @@ -1213,6 +1262,12 @@ def test_per_axis_per_level_setitem(self): expected.iloc[:,:] = 100 assert_frame_equal(df, expected) + df = df_orig.copy() + df.loc(axis=0)[:,:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + df = df_orig.copy() df.loc[(slice(None),slice(None)),(slice(None),slice(None))] = 100 expected = df_orig.copy() @@ -1238,6 +1293,12 @@ def test_per_axis_per_level_setitem(self): expected.iloc[[0,3]] = 100 assert_frame_equal(df, expected) + df = df_orig.copy() + df.loc(axis=0)[:,1] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + # columns df = df_orig.copy() df.loc[:,(slice(None),['foo'])] = 100