From 85f45ffca246ff86c58fef25e1a9d0faacac6500 Mon Sep 17 00:00:00 2001
From: Pietro Battiston <me@pietrobattiston.it>
Date: Tue, 30 Jan 2018 06:40:39 +0100
Subject: [PATCH 1/3] PERF: MultiIndex._engine.get_loc() handles non-unique
 fine

---
 doc/source/whatsnew/v0.24.0.txt                 |  1 +
 pandas/core/indexes/multi.py                    | 17 ++++++++++++++---
 .../frame/test_sort_values_level_as_str.py      | 10 +---------
 pandas/tests/indexing/test_ix.py                |  5 +----
 pandas/tests/indexing/test_multiindex.py        |  4 ----
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 5c15c7b6a742f..aadbfe0f3c4cc 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -501,6 +501,7 @@ Performance Improvements
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
+- Improved performance of :func:`MultiIndex.get_loc` when looking up a full key in a non-unique index; such lookups no longer emit a ``PerformanceWarning`` (:issue:`19464`)
 - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
   (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
   is likewise much faster (:issue:`21369`, :issue:`21508`)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 2a97c37449e12..db97439664d9a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -16,6 +16,7 @@
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
+    is_integer,
     is_categorical_dtype,
     is_object_dtype,
     is_hashable,
@@ -2197,10 +2198,16 @@ def _maybe_to_slice(loc):
             raise KeyError('Key length ({0}) exceeds index depth ({1})'
                            ''.format(keylen, self.nlevels))
 
-        if keylen == self.nlevels and self.is_unique:
-            return self._engine.get_loc(key)
+        # If the index is monotonic, the code for partial selection or
+        # non-unique index (below) is more efficient than the following:
+        if keylen == self.nlevels and not self.is_monotonic:
+            loc = self._engine.get_loc(key)
+            if not self.is_unique and is_integer(loc):
+                # Indexers expect a slice from indexing a non-unique index
+                loc = slice(loc, loc + 1)
+            return loc
 
-        # -- partial selection or non-unique index
+        # -- partial selection or non-unique index or monotonic index
         # break the key into 2 parts based on the lexsort_depth of the index;
         # the first part returns a continuous slice of the index; the 2nd part
         # needs linear search within the slice
@@ -2213,6 +2220,10 @@ def _maybe_to_slice(loc):
             raise KeyError(key)
 
         if not follow_key:
+            # Indexers expect an integer from indexing a key in a unique index
+            if self.is_unique:
+                # Breaks if we pass a np.int64. TODO: investigate why
+                return int(start)
             return slice(start, stop)
 
         warnings.warn('indexing past lexsort depth may impact performance.',
diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py
index 3b4eadfce81cd..266c0d336d898 100644
--- a/pandas/tests/frame/test_sort_values_level_as_str.py
+++ b/pandas/tests/frame/test_sort_values_level_as_str.py
@@ -2,7 +2,6 @@
 import pytest
 
 from pandas import DataFrame, Index
-from pandas.errors import PerformanceWarning
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal
 
@@ -85,14 +84,7 @@ def test_sort_column_level_and_index_label(
                                   ascending=ascending,
                                   axis=1)
 
-    if len(levels) > 1:
-        # Accessing multi-level columns that are not lexsorted raises a
-        # performance warning
-        with tm.assert_produces_warning(PerformanceWarning,
-                                        check_stacklevel=False):
-            assert_frame_equal(result, expected)
-    else:
-        assert_frame_equal(result, expected)
+    assert_frame_equal(result, expected)
 
 
 def test_sort_values_column_index_level_precedence():
diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py
index c84576c984525..a8fd82ce3098e 100644
--- a/pandas/tests/indexing/test_ix.py
+++ b/pandas/tests/indexing/test_ix.py
@@ -11,7 +11,6 @@
 from pandas.compat import lrange
 from pandas import Series, DataFrame, option_context, MultiIndex
 from pandas.util import testing as tm
-from pandas.errors import PerformanceWarning
 
 
 class TestIX(object):
@@ -187,9 +186,7 @@ def test_ix_general(self):
         df = DataFrame(data).set_index(keys=['col', 'year'])
         key = 4.0, 2012
 
-        # emits a PerformanceWarning, ok
-        with tm.assert_produces_warning(PerformanceWarning):
-            tm.assert_frame_equal(df.loc[key], df.iloc[2:])
+        tm.assert_frame_equal(df.loc[key], df.iloc[2:])
 
         # this is ok
         df.sort_index(inplace=True)
diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py
index d2c4c8f5e149b..581aee0dcf971 100644
--- a/pandas/tests/indexing/test_multiindex.py
+++ b/pandas/tests/indexing/test_multiindex.py
@@ -366,10 +366,6 @@ def test_multiindex_perf_warn(self):
                         'joe': ['x', 'x', 'z', 'y'],
                         'jolie': np.random.rand(4)}).set_index(['jim', 'joe'])
 
-        with tm.assert_produces_warning(PerformanceWarning,
-                                        clear=[pd.core.index]):
-            df.loc[(1, 'z')]
-
         df = df.iloc[[2, 1, 3, 0]]
         with tm.assert_produces_warning(PerformanceWarning):
             df.loc[(0, )]

From ddd29ae46eac20830e844967d3c5bbab05dd9aaf Mon Sep 17 00:00:00 2001
From: Pietro Battiston <me@pietrobattiston.it>
Date: Tue, 30 Jan 2018 06:51:32 +0100
Subject: [PATCH 2/3] DOC: sorting isn't (and wasn't) a problem for single key
 indexing

---
 doc/source/advanced.rst                    | 15 ++++++++-------
 pandas/tests/indexes/multi/test_sorting.py | 11 ++++++-----
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index e530ece2e12c5..de372eceb5aaa 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -535,16 +535,17 @@ they have a ``MultiIndex``:
 
    df.T.sort_index(level=1, axis=1)
 
-Indexing will work even if the data are not sorted, but will be rather
-inefficient (and show a ``PerformanceWarning``). It will also
+Indexing will work even if the data are not sorted, but partial indexing will
+be rather inefficient (and show a ``PerformanceWarning``). It will also
 return a copy of the data rather than a view:
 
 .. ipython:: python
 
    dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
                        'joe': ['x', 'x', 'z', 'y'],
-                       'jolie': np.random.rand(4)})
-   dfm = dfm.set_index(['jim', 'joe'])
+                       'jolie': list('abcd'),
+                       'values': np.random.rand(4)})
+   dfm = dfm.set_index(['jim', 'joe', 'jolie'])
    dfm
 
 .. code-block:: ipython
@@ -553,9 +554,9 @@ return a copy of the data rather than a view:
    PerformanceWarning: indexing past lexsort depth may impact performance.
 
    Out[4]:
-              jolie
-   jim joe
-   1   z    0.64094
+            values
+   jolie
+   c      0.879189
 
 .. _advanced.unsorted:
 
diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py
index ee29ea1be8aea..4a9831f1d5b80 100644
--- a/pandas/tests/indexes/multi/test_sorting.py
+++ b/pandas/tests/indexes/multi/test_sorting.py
@@ -114,11 +114,12 @@ def test_unsortedindex():
 
 def test_unsortedindex_doc_examples():
     # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex  # noqa
-    dfm = DataFrame({'jim': [0, 0, 1, 1],
-                     'joe': ['x', 'x', 'z', 'y'],
-                     'jolie': np.random.rand(4)})
+    dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
+                        'joe': ['x', 'x', 'z', 'y'],
+                        'jolie': list('abcd'),
+                        'values': np.random.rand(4)})
 
-    dfm = dfm.set_index(['jim', 'joe'])
+    dfm = dfm.set_index(['jim', 'joe', 'jolie'])
     with tm.assert_produces_warning(PerformanceWarning):
         dfm.loc[(1, 'z')]
 
@@ -134,7 +135,7 @@ def test_unsortedindex_doc_examples():
     dfm.loc[(0, 'y'):(1, 'z')]
 
     assert dfm.index.is_lexsorted()
-    assert dfm.index.lexsort_depth == 2
+    assert dfm.index.lexsort_depth == 3
 
 
 def test_reconstruct_sort():

From b72f9c5885ba751f3a90127a859edf452faead1d Mon Sep 17 00:00:00 2001
From: Pietro Battiston <me@pietrobattiston.it>
Date: Wed, 31 Jan 2018 08:47:52 +0100
Subject: [PATCH 3/3] TST: asv tests for indexing in non-unique MultiIndex

---
 asv_bench/benchmarks/multiindex_object.py | 35 +++++++++++++++++++++--
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index 0c92214795557..eb4c4e1c5380e 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -49,16 +49,30 @@ class Duplicates(object):
     goal_time = 0.2
 
     def setup(self):
-        size = 65536
+        size = 6553
         arrays = [np.random.randint(0, 8192, size),
                   np.random.randint(0, 1024, size)]
-        mask = np.random.rand(size) < 0.1
         self.mi_unused_levels = MultiIndex.from_arrays(arrays)
-        self.mi_unused_levels = self.mi_unused_levels[mask]
+        self.mi = self.mi_unused_levels.remove_unused_levels()
+        self.sorted = self.mi.sort_values()
+        self.key = self.mi[len(self.mi) // 2]
+        self.partial_key = (self.key[0],)
 
     def time_remove_unused_levels(self):
         self.mi_unused_levels.remove_unused_levels()
 
+    def time_duplicates_loc(self):
+        self.mi.get_loc(self.key)
+
+    def time_duplicates_partial_loc(self):
+        self.mi.get_loc(self.partial_key)
+
+    def time_duplicates_sorted_loc(self):
+        self.sorted.get_loc(self.key)
+
+    def time_duplicates_sorted_partial_loc(self):
+        self.sorted.get_loc(self.partial_key)
+
 
 class Integer(object):
 
@@ -91,10 +105,25 @@ def setup(self):
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
+        self.sorted = self.mi.sort_values()
+        self.key = self.mi[len(self.mi) // 2]
+        self.partial_key = (self.key[0], self.key[1])
 
     def time_duplicated(self):
         self.mi.duplicated()
 
+    def time_duplicated_loc(self):
+        self.mi.get_loc(self.key)
+
+    def time_duplicated_partial_loc(self):
+        self.mi.get_loc(self.partial_key)
+
+    def time_duplicates_sorted_loc(self):
+        self.sorted.get_loc(self.key)
+
+    def time_duplicates_sorted_partial_loc(self):
+        self.sorted.get_loc(self.partial_key)
+
 
 class Sortlevel(object):