diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d65c1519fe869..afeb3fcc7764c 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -426,14 +426,14 @@ python/numpy allow slicing past the end of an array without an associated error. values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned) - .. ipython:: python +.. ipython:: python - df = DataFrame(np.random.randn(5,2),columns=list('AB')) - df - df.iloc[[4,5,6]] - df.iloc[4:6] - df.iloc[:,2:3] - df.iloc[:,1:3] + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl + dfl.iloc[[4,5,6]] + dfl.iloc[4:6] + dfl.iloc[:,2:3] + dfl.iloc[:,1:3] .. _indexing.basics.partial_setting: @@ -1684,7 +1684,7 @@ of tuples: Advanced indexing with hierarchical index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Syntactically integrating ``MultiIndex`` in advanced indexing with ``.ix`` is a +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a bit challenging, but we've made every effort to do so. for example the following works as you would expect: @@ -1692,22 +1692,21 @@ following works as you would expect: df = df.T df - df.ix['bar'] - df.ix['bar', 'two'] + df.loc['bar'] + df.loc['bar', 'two'] -"Partial" slicing also works quite nicely for the topmost level: +"Partial" slicing also works quite nicely. .. ipython:: python - df.ix['baz':'foo'] + df.loc['baz':'foo'] -But lower levels cannot be sliced in this way, because the MultiIndex uses -its multiple index dimensions to slice along one dimension of your object: +You can slice with a 'range' of values, by providing a slice of tuples. .. 
ipython:: python - df.ix[('baz', 'two'):('qux', 'one')] - df.ix[('baz', 'two'):'foo'] + df.loc[('baz', 'two'):('qux', 'one')] + df.loc[('baz', 'two'):'foo'] Passing a list of labels or tuples works similar to reindexing: @@ -1715,16 +1714,113 @@ Passing a list of labels or tuples works similar to reindexing: df.ix[[('bar', 'two'), ('qux', 'one')]] -The following does not work, and it's not clear if it should or not: +.. _indexing.mi_slicers: -:: +Multiindexing using slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label <indexing.label>`, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +.. warning:: - >>> df.ix[['bar', 'qux']] + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MultiIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....),:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....)] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. 
ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + miindex = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) + dfmi + +Basic multi-index slicing using slices, lists, and labels. + +.. ipython:: python + + dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. ipython:: python -The code for implementing ``.ix`` makes every attempt to "do the right thing" -but as you use it you may uncover corner cases or unintuitive behavior. If you -do find something like this, do not hesitate to report the issue or ask on the -mailing list. + dfmi.loc['A1',(slice(None),'foo')] + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +Using a boolean indexer you can provide selection related to the *values*. + +.. ipython:: python + + mask = dfmi[('a','foo')]>200 + dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + dfmi.loc(axis=0)[:,:,['C1','C3']] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2 + +You can use a right-hand-side of an alignable object as well. + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2 .. 
_indexing.xs: @@ -1738,6 +1834,11 @@ selecting data at a particular level of a MultiIndex easier. df.xs('one', level='second') +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[(slice(None),'one'),:] + You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by providing the axis argument @@ -1746,29 +1847,38 @@ providing the axis argument df = df.T df.xs('one', level='second', axis=1) +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,(slice(None),'one')] + :meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys .. ipython:: python df.xs(('one', 'bar'), level=('second', 'first'), axis=1) +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,('bar','one')] .. versionadded:: 0.13.0 You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain the level that was selected -.. ipython:: +.. ipython:: python df.xs('one', level='second', axis=1, drop_level=False) versus the result with ``drop_level=True`` (the default value) -.. ipython:: +.. ipython:: python df.xs('one', level='second', axis=1, drop_level=True) -.. ipython:: +.. ipython:: python :suppress: df = df.T diff --git a/doc/source/release.rst b/doc/source/release.rst index 40913e40f485f..829a21f8033ca 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -74,6 +74,7 @@ Improvements to existing features the func (:issue:`6289`) - ``plot(legend='reverse')`` will now reverse the order of legend labels for most plot kinds. (:issue:`6014`) +- Allow multi-index slicers (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) .. 
_release.bug_fixes-0.14.0: @@ -94,6 +95,7 @@ Bug Fixes - Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) - Bug in ``DataFrame.replace()`` when passing a non- ``bool`` ``to_replace`` argument (:issue:`6332`) +- Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`) pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ee38fed810af0..7bdc101c37709 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -29,6 +29,113 @@ API changes df.iloc[:,2:3] df.iloc[:,1:3] +MultiIndexing Using Slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label <indexing.label>`, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +See :ref:`the docs <indexing.mi_slicers>` +See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MultiIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....),:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....)] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. 
ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + index = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns).sortlevel().sortlevel(axis=1) + df + +Basic multi-index slicing using slices, lists, and labels. + +.. ipython:: python + + df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. ipython:: python + + df.loc['A1',(slice(None),'foo')] + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +Using a boolean indexer you can provide selection related to the *values*. + +.. ipython:: python + + mask = df[('a','foo')]>200 + df.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + df.loc(axis=0)[:,:,['C1','C3']] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = df.copy() + df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2 + +You can use a right-hand-side of an alignable object as well. + +.. 
ipython:: python + + df2 = df.copy() + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2 + Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/api.py b/pandas/core/api.py index b36c9f7499df6..4d8d4dcda7589 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -19,6 +19,7 @@ WidePanel = Panel +from pandas.core.indexing import IndexSlice from pandas.tseries.offsets import DateOffset from pandas.tseries.tools import to_datetime from pandas.tseries.index import (DatetimeIndex, Timestamp, diff --git a/pandas/core/index.py b/pandas/core/index.py index 5a02c0445c006..1f4ee5246a04a 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,7 +1,7 @@ # pylint: disable=E1101,E1103,W0232 import datetime from functools import partial -from pandas.compat import range, zip, lrange, lzip, u +from pandas.compat import range, zip, lrange, lzip, u, reduce from pandas import compat import numpy as np @@ -3233,21 +3233,110 @@ def partial_selection(key): drop_level) else: indexer = self._get_level_indexer(key, level=level) - new_index = _maybe_drop_levels(indexer, [level], drop_level) - return indexer, new_index + return indexer, _maybe_drop_levels(indexer, [level], drop_level) def _get_level_indexer(self, key, level=0): + # return a boolean indexer or a slice showing where the key is + # in the totality of values + level_index = self.levels[level] - loc = level_index.get_loc(key) labels = self.labels[level] - if level > 0 or self.lexsort_depth == 0: - return np.array(labels == loc,dtype=bool) + if isinstance(key, slice): + # handle a slice, returnig a slice if we can + # otherwise a boolean indexer + + start = level_index.get_loc(key.start) + stop = level_index.get_loc(key.stop) + step = key.step + + if level > 0 or self.lexsort_depth == 0: + # need to have like semantics here to right + # searching as when we are using a slice + # so include the stop+1 (so we include stop) + m = np.zeros(len(labels),dtype=bool) + 
m[np.in1d(labels,np.arange(start,stop+1,step))] = True + return m + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(start, side='left') + j = labels.searchsorted(stop, side='right') + return slice(i, j, step) + else: - # sorted, so can return slice object -> view - i = labels.searchsorted(loc, side='left') - j = labels.searchsorted(loc, side='right') - return slice(i, j) + + loc = level_index.get_loc(key) + if level > 0 or self.lexsort_depth == 0: + return np.array(labels == loc,dtype=bool) + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + return slice(i, j) + + def get_locs(self, tup): + """ + Given a tuple of slices/lists/labels/boolean indexer to a level-wise spec + produce an indexer to extract those locations + + Parameters + ---------- + key : tuple of (slices/list/labels) + + Returns + ------- + locs : integer list of locations or boolean indexer suitable + for passing to iloc + """ + + # must be lexsorted to at least as many levels + if not self.is_lexsorted_for_tuple(tup): + raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted' + ' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth)) + if not self.is_unique: + raise ValueError('MultiIndex Slicing requires a unique index') + + def _convert_indexer(r): + if isinstance(r, slice): + m = np.zeros(len(self),dtype=bool) + m[r] = True + return m + return r + + ranges = [] + for i,k in enumerate(tup): + + if com._is_bool_indexer(k): + # a boolean indexer, must be the same length! 
+ k = np.asarray(k) + if len(k) != len(self): + raise ValueError("cannot index with a boolean indexer that is" + " not the same length as the index") + ranges.append(k) + elif com.is_list_like(k): + # a collection of labels to include from this level (these are or'd) + ranges.append(reduce( + np.logical_or,[ _convert_indexer(self._get_level_indexer(x, level=i) + ) for x in k ])) + elif k == slice(None): + # include all from this level + pass + elif isinstance(k,slice): + # a slice, include BOTH of the labels + ranges.append(self._get_level_indexer(k,level=i)) + else: + # a single label + ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0]) + + # identity + if len(ranges) == 0: + return slice(0,len(self)) + + elif len(ranges) == 1: + return ranges[0] + + # construct a boolean indexer if we have a slice or boolean indexer + return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ]) def truncate(self, before=None, after=None): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 029055d80b1af..f8ce855e6bfdc 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -12,7 +12,6 @@ import numpy as np - # the supported indexers def get_indexers_list(): @@ -27,6 +26,11 @@ def get_indexers_list(): # "null slice" _NS = slice(None, None) +# the public IndexSlicerMaker +class _IndexSlice(object): + def __getitem__(self, arg): + return arg +IndexSlice = _IndexSlice() class IndexingError(Exception): pass @@ -40,6 +44,16 @@ def __init__(self, obj, name): self.obj = obj self.ndim = obj.ndim self.name = name + self.axis = None + + def __call__(self, *args, **kwargs): + # we need to return a copy of ourselves + self = self.__class__(self.obj, self.name) + + # set the passed in values + for k, v in compat.iteritems(kwargs): + setattr(self,k,v) + return self def __iter__(self): raise NotImplementedError('ix is not iterable') @@ -61,8 +75,7 @@ def _get_label(self, label, axis=0): return self.obj[label] elif (isinstance(label, 
tuple) and isinstance(label[axis], slice)): - - raise IndexingError('no slices here') + raise IndexingError('no slices here, handle elsewhere') try: return self.obj._xs(label, axis=axis, copy=False) @@ -100,23 +113,29 @@ def _slice(self, obj, axis=0, raise_on_error=False, typ=None): typ=typ) def __setitem__(self, key, value): - # kludgetastic - ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex): - try: - indexer = ax.get_loc(key) - self._setitem_with_indexer(indexer, value) - return - except Exception: - pass - if isinstance(key, tuple): - if len(key) > self.ndim: - raise IndexingError('only tuples of length <= %d supported' % - self.ndim) + if self.axis is not None: indexer = self._convert_tuple(key, is_setter=True) + else: - indexer = self._convert_to_indexer(key, is_setter=True) + + # kludgetastic + ax = self.obj._get_axis(0) + if isinstance(ax, MultiIndex): + try: + indexer = ax.get_loc(key) + self._setitem_with_indexer(indexer, value) + return + except Exception: + pass + + if isinstance(key, tuple): + if len(key) > self.ndim: + raise IndexingError('only tuples of length <= %d supported' % + self.ndim) + indexer = self._convert_tuple(key, is_setter=True) + else: + indexer = self._convert_to_indexer(key, is_setter=True) self._setitem_with_indexer(indexer, value) @@ -132,11 +151,24 @@ def _has_valid_tuple(self, key): raise ValueError("Location based indexing can only have [%s] " "types" % self._valid_types) + def _is_nested_tuple_indexer(self, tup): + if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): + return any([ _is_nested_tuple(tup,ax) for ax in self.obj.axes ]) + return False + def _convert_tuple(self, key, is_setter=False): keyidx = [] - for i, k in enumerate(key): - idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) - keyidx.append(idx) + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + for i in range(self.ndim): + if i == axis: + keyidx.append(self._convert_to_indexer(key, axis=axis, 
is_setter=is_setter)) + else: + keyidx.append(slice(None)) + else: + for i, k in enumerate(key): + idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) + keyidx.append(idx) return tuple(keyidx) def _convert_scalar_indexer(self, key, axis): @@ -586,7 +618,14 @@ def _align_frame(self, indexer, df): if df.index.equals(ax): val = df.copy().values else: - val = df.reindex(ax).values + + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if isinstance(ax, MultiIndex) and isinstance( + df.index, MultiIndex) and ax.nlevels != df.index.nlevels: + raise TypeError("cannot align on a multi-index with out specifying the join levels") + + val = df.reindex(index=ax).values return val elif np.isscalar(indexer) and not is_frame: @@ -694,30 +733,46 @@ def _convert_for_reindex(self, key, axis=0): return keyarr + def _handle_lowerdim_multi_index_axis0(self, tup): + # we have an axis0 multi-index, handle or raise + + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=0) + except TypeError: + # slices are unhashable + pass + except Exception as e1: + if isinstance(tup[0], (slice, Index)): + raise IndexingError("Handle elsewhere") + + # raise the error if we are not sorted + ax0 = self.obj._get_axis(0) + if not ax0.is_lexsorted_for_tuple(tup): + raise e1 + + return None + def _getitem_lowerdim(self, tup): + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis, validate_iterable=True) + + # we may have a nested tuples indexer here + if self._is_nested_tuple_indexer(tup): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here ax0 = self.obj._get_axis(0) - # a bit kludgy if isinstance(ax0, MultiIndex): - try: - return self._get_label(tup, axis=0) - except TypeError: - # slices are unhashable - pass - except Exception as 
e1: - if isinstance(tup[0], (slice, Index)): - raise IndexingError - - # raise the error if we are not sorted - if not ax0.is_lexsorted_for_tuple(tup): - raise e1 - try: - loc = ax0.get_loc(tup[0]) - except KeyError: - raise e1 + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result if len(tup) > self.obj.ndim: - raise IndexingError + raise IndexingError("Too many indexers. handle elsewhere") # to avoid wasted computation # df.ix[d1:d2, 0] -> columns first (True) @@ -730,9 +785,9 @@ def _getitem_lowerdim(self, tup): if not _is_list_like(section): return section - # might have been a MultiIndex elif section.ndim == self.ndim: - + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS new_key = tup[:i] + (_NS,) + tup[i + 1:] else: @@ -748,11 +803,45 @@ def _getitem_lowerdim(self, tup): if len(new_key) == 1: new_key, = new_key + # This is an elided recursive call to iloc/loc/etc' return getattr(section, self.name)[new_key] raise IndexingError('not applicable') - def _getitem_axis(self, key, axis=0): + def _getitem_nested_tuple(self, tup): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionaility here + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + # this is a series with a multi-index specified a tuple of selectors + return self._getitem_axis(tup, axis=0, validate_iterable=True) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + axis = 0 + for key in tup: + + if _is_null_slice(key): + axis += 1 + continue + + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis, validate_iterable=True) + axis += 1 + + if 
obj.ndim < self.ndim: + axis -= 1 + + return obj + + def _getitem_axis(self, key, axis=0, validate_iterable=False): self._has_valid_type(key, axis) labels = self.obj._get_axis(axis) @@ -943,6 +1032,8 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): if isinstance(obj, slice): return self._convert_slice_indexer(obj, axis) + elif _is_nested_tuple(obj, labels): + return labels.get_locs(obj) elif _is_list_like(obj): if com._is_bool_indexer(obj): obj = _check_bool_indexer(labels, obj) @@ -1050,7 +1141,7 @@ def __getitem__(self, key): else: return self._getitem_axis(key, axis=0) - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): raise NotImplementedError() def _getbool_axis(self, key, axis=0): @@ -1127,6 +1218,7 @@ def _has_valid_type(self, key, axis): # require all elements in the index idx = _ensure_index(key) if not idx.isin(ax).all(): + raise KeyError("[%s] are not in ALL in the [%s]" % (key, self.obj._get_axis_name(axis))) @@ -1156,7 +1248,7 @@ def error(): return True - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): labels = self.obj._get_axis(axis) if isinstance(key, slice): @@ -1170,7 +1262,14 @@ def _getitem_axis(self, key, axis=0): if hasattr(key, 'ndim') and key.ndim > 1: raise ValueError('Cannot index with multidimensional key') + if validate_iterable: + self._has_valid_type(key, axis) return self._getitem_iterable(key, axis=axis) + elif _is_nested_tuple(key, labels): + locs = labels.get_locs(key) + indexer = [ slice(None) ] * self.ndim + indexer[axis] = locs + return self.obj.iloc[tuple(indexer)] else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis) @@ -1243,7 +1342,7 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(slice_obj, axis=axis, convert=False) - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=0, validate_iterable=False): if isinstance(key, slice): 
self._has_valid_type(key, axis) @@ -1502,6 +1601,24 @@ def _maybe_convert_ix(*args): return args +def _is_nested_tuple(tup, labels): + # check for a compatiable nested tuple and multiindexes among the axes + + if not isinstance(tup, tuple): + return False + + # are we nested tuple of: tuple,list,slice + for i, k in enumerate(tup): + + #if i > len(axes): + # raise IndexingError("invalid indxing tuple passed, has too many indexers for this object") + #ax = axes[i] + if isinstance(k, (tuple, list, slice)): + return isinstance(labels, MultiIndex) + + return False + + def _is_null_slice(obj): return (isinstance(obj, slice) and obj.start is None and obj.stop is None and obj.step is None) @@ -1554,3 +1671,4 @@ def _maybe_droplevels(index, key): pass return index + diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 52de461f0281b..41b28172d0d42 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -83,6 +83,9 @@ def _axify(obj, key, axis): return k +def _mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + class TestIndexing(tm.TestCase): _multiprocess_can_split_ = True @@ -1062,6 +1065,325 @@ def test_xs_multiindex(self): expected.columns = expected.columns.droplevel('lvl1') assert_frame_equal(result, expected) + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + ix = MultiIndex.from_product([_mklbl('A',5),_mklbl('B',7),_mklbl('C',4),_mklbl('D',2)]) + df = DataFrame(np.arange(len(ix.get_values())),index=ix) + + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C2' or c == 'C3')]] + result = df.loc[(slice('A1','A3'),slice(None), 
slice('C1','C3')),:] + assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + + df = DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns) + df = df.sortlevel(axis=0).sortlevel(axis=1) + + # identity + result = df.loc[(slice(None),slice(None)),:] + assert_frame_equal(result, df) + result = df.loc[(slice(None),slice(None)),(slice(None),slice(None))] + assert_frame_equal(result, df) + result = df.loc[:,(slice(None),slice(None))] + assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None),[1]),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),1),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + # columns + result = df.loc[:,(slice(None),['foo'])] + expected = df.iloc[:,[1,3]] + assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df.iloc[[0,3],[1,3]] + assert_frame_equal(result, expected) + + result = df.loc['A','a'] + expected = DataFrame(dict(bar = [1,5,9], foo = [0,4,8]), + index=Index([1,2,3],name='two'), + columns=Index(['bar','foo'],name='lvl1')) + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),[1,2]),:] + expected = df.iloc[[0,1,3]] + assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.get_values())),index=ix) + result = s.loc['A1':'A3', :, ['C1','C3']] + expected = s.loc[[ tuple([a,b,c,d]) for a,b,c,d in s.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None),df.loc[:,('a','bar')]>5),:] + expected = df.iloc[[2,3]] + assert_frame_equal(result, expected) 
+ + def f(): + df.loc[(slice(None),np.array([True,False])),:] + self.assertRaises(ValueError, f) + + # ambiguous cases + # these can be multiply interpreted + # but we can catch this in some cases + def f(): + df.loc[(slice(None),[1])] + self.assertRaises(KeyError, f) + + def test_per_axis_per_level_doc_examples(self): + + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns) + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + result = df.loc[idx['A1':'A3',:,['C1','C3']],:] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + result = df.loc[idx[:,:,['C1','C3']],:] + assert_frame_equal(result, expected) + + # not sorted + def f(): + df.loc['A1',(slice(None),'foo')] + self.assertRaises(KeyError, f) + df = df.sortlevel(axis=1) + + # slicing + df.loc['A1',(slice(None),'foo')] + df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + + # setitem + df.loc(axis=0)[:,:,['C1','C3']] = -10 + + def test_loc_arguments(self): + + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), 
+ index=index, + columns=columns).sortlevel().sortlevel(axis=1) + + + # axis 0 + result = df.loc(axis=0)['A1':'A3',:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + result = df.loc(axis='index')[:,:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + result = df.loc(axis='columns')[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + # invalid axis + def f(): + df.loc(axis=-1)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis=2)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis='foo')[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def test_per_axis_per_level_setitem(self): + + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + + df_orig = DataFrame(np.arange(16).reshape(4, 4), index=index, columns=columns) + df_orig = df_orig.sortlevel(axis=0).sortlevel(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None),slice(None)),:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:,:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),slice(None)),(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + 
assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:,(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None),[1]),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:,1] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:,(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[:,[1,3]] = 100 + assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:,1],idx[:,['foo']]] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc['A','a'] = 100 + expected = df_orig.copy() + expected.iloc[0:3,0:2] = 100 + assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100, 100], [100, 100]],dtype='int64') + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100], [100, 100]],dtype='int64') + self.assertRaises(ValueError, f) + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([100, 100, 100, 100],dtype='int64') + self.assertRaises(ValueError, f) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = 
df.loc[(slice(None),1),(slice(None),['foo'])] * 5 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = expected.iloc[[0,3],[1,3]] * 5 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None),1),(slice(None),['foo'])].copy() + rhs.loc[:,('c','bah')] = 10 + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= rhs + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + + def test_multiindex_setitem(self): + + # GH 3738 + # setting with a multi-index right hand side + arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), + np.array(['one', 'two', 'one', 'one', 'two', 'one']), + np.arange(0, 6, 1)] + + df_orig = pd.DataFrame(np.random.randn(6, 3), + index=arrays, + columns=['A', 'B', 'C']).sort_index() + + expected = df_orig.loc[['bar']]*2 + df = df_orig.copy() + df.loc[['bar']] *= 2 + assert_frame_equal(df.loc[['bar']],expected) + + # raise because these have differing levels + def f(): + df.loc['bar'] *= 2 + self.assertRaises(TypeError, f) + def test_getitem_multiindex(self): # GH 5725