Merge branch 'mi-key-loc' of https://github.com/behzadnouri/pandas into behzadnouri-mi-key-loc

jreback · jreback · commit 6301f06c77cc · 2014-11-21T15:41:21.000-05:00
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -19,6 +19,26 @@ users upgrade to this version.
 
 API changes
 ~~~~~~~~~~~
+- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
+  a lexically sorted index will perform better. (:issue:`2646`)
+
+  .. ipython:: python
+
+    df = pd.DataFrame({'jim':[0, 0, 1, 1],
+                       'joe':['x', 'x', 'z', 'y'],
+                       'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+    df
+    df.index.lexsort_depth
+
+    # in prior versions this would raise a KeyError;
+    # it now succeeds but emits a PerformanceWarning
+    df.loc[(1, 'z')]
+
+    # lexically sorting
+    df2 = df.sortlevel()
+    df2
+    df2.index.lexsort_depth
+    df2.loc[(1,'z')]
 
 - Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)
 
@@ -129,3 +149,5 @@ Bug Fixes
 
 - Bugs when trying to stack multiple columns, when some (or all)
   of the level names are numbers (:issue:`8584`).
+- Bug in ``MultiIndex`` where ``__contains__`` returns the wrong result if the
+  index is not lexically sorted or not unique (:issue:`7724`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -21,6 +21,7 @@
 from pandas.core.common import (_values_from_object, is_float, is_integer,
                                 ABCSeries, _ensure_object, _ensure_int64)
 from pandas.core.config import get_option
+from pandas.io.common import PerformanceWarning
 
 # simplify
 default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'),
@@ -4027,30 +4028,83 @@ def _partial_tup_index(self, tup, side='left'):
 
     def get_loc(self, key):
         """
-        Get integer location slice for requested label or tuple
+        Get integer location, slice or boolean mask for requested label or tuple.
+        If the key is past the lexsort depth, the return value may be a boolean
+        mask array; otherwise it is always a slice or int.
 
         Parameters
         ----------
         key : label or tuple
 
         Returns
         -------
-        loc : int or slice object
-        """
-        if isinstance(key, tuple):
-            if len(key) == self.nlevels:
-                if self.is_unique:
-                    return self._engine.get_loc(_values_from_object(key))
-                else:
-                    return slice(*self.slice_locs(key, key))
-            else:
-                # partial selection
-                result = slice(*self.slice_locs(key, key))
-                if result.start == result.stop:
-                    raise KeyError(key)
-                return result
-        else:
-            return self._get_level_indexer(key, level=0)
+        loc : int, slice object or boolean mask
+        """
+        def _maybe_to_slice(loc):
+            '''convert integer indexer to boolean mask or slice if possible'''
+            if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
+                return loc
+
+            loc = lib.maybe_indices_to_slice(loc)
+            if isinstance(loc, slice):
+                return loc
+
+            mask = np.empty(len(self), dtype='bool')
+            mask.fill(False)
+            mask[loc] = True
+            return mask
+
+        if not isinstance(key, tuple):
+            loc = self._get_level_indexer(key, level=0)
+            return _maybe_to_slice(loc)
+
+        keylen = len(key)
+        if self.nlevels < keylen:
+            raise KeyError('Key length ({0}) exceeds index depth ({1})'
+                    ''.format(keylen, self.nlevels))
+
+        if keylen == self.nlevels and self.is_unique:
+            def _maybe_str_to_time_stamp(key, lev):
+                if lev.is_all_dates and not isinstance(key, Timestamp):
+                    try:
+                        return Timestamp(key, tz=getattr(lev, 'tz', None))
+                    except Exception:
+                        pass
+                return key
+            key = _values_from_object(key)
+            key = tuple(map(_maybe_str_to_time_stamp, key, self.levels))
+            return self._engine.get_loc(key)
+
+        # -- partial selection or non-unique index
+        # break the key into 2 parts based on the lexsort_depth of the index;
+        # the first part returns a continuous slice of the index; the 2nd part
+        # needs linear search within the slice
+        i = self.lexsort_depth
+        lead_key, follow_key = key[:i], key[i:]
+        start, stop = self.slice_locs(lead_key, lead_key) \
+                if lead_key else (0, len(self))
+
+        if start == stop:
+            raise KeyError(key)
+
+        if not follow_key:
+            return slice(start, stop)
+
+        warnings.warn('indexing past lexsort depth may impact performance.',
+                PerformanceWarning)
+
+        loc = np.arange(start, stop, dtype='int64')
+
+        for i, k in enumerate(follow_key, len(lead_key)):
+            mask = self.labels[i][loc] == self.levels[i].get_loc(k)
+            if not mask.all():
+                loc = loc[mask]
+            if not len(loc):
+                raise KeyError(key)
+
+        return _maybe_to_slice(loc) \
+                if len(loc) != stop - start \
+                else slice(start, stop)
 
     def get_loc_level(self, key, level=0, drop_level=True):
         """
@@ -4115,10 +4169,10 @@ def _maybe_drop_levels(indexer, levels, drop_level):
             if not any(isinstance(k, slice) for k in key):
 
                 # partial selection
-                def partial_selection(key):
-                    indexer = slice(*self.slice_locs(key, key))
-                    if indexer.start == indexer.stop:
-                        raise KeyError(key)
+                # optionally get indexer to avoid re-calculation
+                def partial_selection(key, indexer=None):
+                    if indexer is None:
+                        indexer = self.get_loc(key)
                     ilevels = [i for i in range(len(key))
                                if key[i] != slice(None, None)]
                     return indexer, _maybe_drop_levels(indexer, ilevels,
@@ -4139,11 +4193,12 @@ def partial_selection(key):
                         if any([
                             l.is_all_dates for k, l in zip(key, self.levels)
                         ]) and not can_index_exactly:
-                            indexer = slice(*self.slice_locs(key, key))
+                            indexer = self.get_loc(key)
 
                             # we have a multiple selection here
-                            if not indexer.stop - indexer.start == 1:
-                                return partial_selection(key)
+                            if not isinstance(indexer, slice) \
+                                    or indexer.stop - indexer.start != 1:
+                                return partial_selection(key, indexer)
 
                             key = tuple(self[indexer].tolist()[0])
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -3257,7 +3257,9 @@ def take(self, indexer, axis=1, verify=True, convert=True):
         Take items along any axis.
         """
         self._consolidate_inplace()
-        indexer = np.asanyarray(indexer, dtype=np.int_)
+        indexer = np.arange(indexer.start, indexer.stop, indexer.step,
+                            dtype='int64') if isinstance(indexer, slice) \
+                                    else np.asanyarray(indexer, dtype='int64')
 
         n = self.shape[axis]
         if convert:
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -1488,6 +1488,86 @@ def test_loc_multiindex(self):
         result = s.loc[2:4:2, 'a':'c']
         assert_series_equal(result, expected)
 
+    def test_multiindex_perf_warn(self):
+        import sys
+        from pandas.io.common import PerformanceWarning
+
+        if sys.version_info < (2, 7):
+            raise nose.SkipTest('python version < 2.7')
+
+        df = DataFrame({'jim':[0, 0, 1, 1],
+                        'joe':['x', 'x', 'z', 'y'],
+                        'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+
+        with tm.assert_produces_warning(PerformanceWarning):
+            _ = df.loc[(1, 'z')]
+
+        df = df.iloc[[2,1,3,0]]
+        with tm.assert_produces_warning(PerformanceWarning):
+            _ = df.loc[(0,)]
+
+    def test_multiindex_get_loc(self):  # GH7724, GH2646
+        # test indexing into a multi-index before & past the lexsort depth
+        from numpy.random import randint, choice, randn
+        cols = ['jim', 'joe', 'jolie', 'joline', 'jolia']
+
+        def validate(mi, df, key):
+            mask = np.ones(len(df)).astype('bool')
+
+            # test for all partials of this key
+            for i, k in enumerate(key):
+                mask &= df.iloc[:, i] == k
+
+                if not mask.any():
+                    self.assertNotIn(key[:i+1], mi.index)
+                    continue
+
+                self.assertIn(key[:i+1], mi.index)
+                right = df[mask].copy()
+
+                if i + 1 != len(key):  # partial key
+                    right.drop(cols[:i+1], axis=1, inplace=True)
+                    right.set_index(cols[i+1:-1], inplace=True)
+                    assert_frame_equal(mi.loc[key[:i+1]], right)
+
+                else:  # full key
+                    right.set_index(cols[:-1], inplace=True)
+                    if len(right) == 1:  # single hit
+                        right = Series(right['jolia'].values,
+                                name=right.index[0], index=['jolia'])
+                        assert_series_equal(mi.loc[key[:i+1]], right)
+                    else:  # multi hit
+                        assert_frame_equal(mi.loc[key[:i+1]], right)
+
+        def loop(mi, df, keys):
+            for key in keys:
+                validate(mi, df, key)
+
+        n, m = 1000, 50
+
+        vals = [randint(0, 10, n), choice(list('abcdefghij'), n),
+                choice(pd.date_range('20141009', periods=10).tolist(), n),
+                choice(list('ZYXWVUTSRQ'), n), randn(n)]
+        vals = list(map(tuple, zip(*vals)))
+
+        # bunch of keys for testing
+        keys = [randint(0, 11, m), choice(list('abcdefghijk'), m),
+                choice(pd.date_range('20141009', periods=11).tolist(), m),
+                choice(list('ZYXWVUTSRQP'), m)]
+        keys = list(map(tuple, zip(*keys)))
+        keys += list(map(lambda t: t[:-1], vals[::n//m]))
+
+        # covers both unique index and non-unique index
+        df = pd.DataFrame(vals, columns=cols)
+        a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1])
+
+        for frame in a, b:
+            for i in range(5):  # lexsort depth
+                df = frame.copy() if i == 0 else frame.sort(columns=cols[:i])
+                mi = df.set_index(cols[:-1])
+                assert not mi.index.lexsort_depth < i
+                loop(mi, df, keys)
+
     def test_series_getitem_multiindex(self):
 
         # GH 6018
@@ -1541,10 +1621,7 @@ def test_ix_general(self):
                 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
         df = DataFrame(data).set_index(keys=['col', 'year'])
         key = 4.0, 2012
-
-        # this should raise correct error
-        with tm.assertRaises(KeyError):
-            df.ix[key]
+        tm.assert_frame_equal(df.ix[key], df.iloc[2:])
 
         # this is ok
         df.sortlevel(inplace=True)