From 16177af6589f476cfd26bc272e7d04b14c6aa53d Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Tue, 4 Sep 2018 20:54:17 -0700
Subject: [PATCH 1/3] PERF: improve get_indexer_non_unique on sorted,
 non-unique indexes

Use binary search instead of re-indexing if the iterable key length
is small enough
---
 doc/source/whatsnew/v0.24.0.txt |  2 ++
 pandas/_libs/index.pyx          | 25 +++++++++++++++++--------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 6c91b6374b8af..50afa28b2606e 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -604,6 +604,8 @@ Performance Improvements
   :meth:`~HDFStore.keys`.  (i.e. ``x in store`` checks are much faster)
   (:issue:`21372`)
 - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
+- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
+
 
 .. _whatsnew_0240.docs:
 
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 562c1ba218141..3f76915655f58 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -294,14 +294,23 @@ cdef class IndexEngine:
         result = np.empty(n_alloc, dtype=np.int64)
         missing = np.empty(n_t, dtype=np.int64)
 
-        # form the set of the results (like ismember)
-        members = np.empty(n, dtype=np.uint8)
-        for i in range(n):
-            val = values[i]
-            if val in stargets:
-                if val not in d:
-                    d[val] = []
-                d[val].append(i)
+        # map each starget to its position in the index
+        if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
+            # if there are few enough stargets and the index is monotonically
+            # increasing, then use binary search for each starget
+            for starget in stargets:
+                start = values.searchsorted(starget, side='left')
+                end = values.searchsorted(starget, side='right')
+                if start != end:
+                    d[starget] = list(range(start, end))
+        else:
+            # otherwise, map by iterating through all items in the index
+            for i in range(n):
+                val = values[i]
+                if val in stargets:
+                    if val not in d:
+                        d[val] = []
+                    d[val].append(i)
 
         for i in range(n_t):
             val = targets[i]

From 8b704c5bb8899bea1dcff20ffae78eea6477bbed Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Tue, 4 Sep 2018 20:59:13 -0700
Subject: [PATCH 2/3] PERF: parameterize index structure for asv indexing
 benchmarks

Adds benchmarks for non-unique, sorted indices in NumericSeriesIndexing
and NonNumericSeriesIndexing classes
---
 asv_bench/benchmarks/indexing.py | 75 +++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 739ad6a3d278b..8c6b0cccdff7b 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -11,95 +11,110 @@
 class NumericSeriesIndexing(object):
 
     goal_time = 0.2
-    params = [Int64Index, Float64Index]
-    param = ['index']
+    params = [
+        (Int64Index, Float64Index),
+        ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+    ]
+    param_names = ['index dtype', 'index structure']
 
-    def setup(self, index):
+    def setup(self, index, index_structure):
         N = 10**6
-        idx = index(range(N))
-        self.data = Series(np.random.rand(N), index=idx)
+        indices = {
+            'unique_monotonic_inc': index(range(N)),
+            'nonunique_monotonic_inc': index(
+                list(range(55)) + [54] + list(range(55, N - 1))),
+        }
+        self.data = Series(np.random.rand(N), index=indices[index_structure])
         self.array = np.arange(10000)
         self.array_list = self.array.tolist()
 
-    def time_getitem_scalar(self, index):
+    def time_getitem_scalar(self, index, index_structure):
         self.data[800000]
 
-    def time_getitem_slice(self, index):
+    def time_getitem_slice(self, index, index_structure):
         self.data[:800000]
 
-    def time_getitem_list_like(self, index):
+    def time_getitem_list_like(self, index, index_structure):
         self.data[[800000]]
 
-    def time_getitem_array(self, index):
+    def time_getitem_array(self, index, index_structure):
         self.data[self.array]
 
-    def time_getitem_lists(self, index):
+    def time_getitem_lists(self, index, index_structure):
         self.data[self.array_list]
 
-    def time_iloc_array(self, index):
+    def time_iloc_array(self, index, index_structure):
         self.data.iloc[self.array]
 
-    def time_iloc_list_like(self, index):
+    def time_iloc_list_like(self, index, index_structure):
         self.data.iloc[[800000]]
 
-    def time_iloc_scalar(self, index):
+    def time_iloc_scalar(self, index, index_structure):
         self.data.iloc[800000]
 
-    def time_iloc_slice(self, index):
+    def time_iloc_slice(self, index, index_structure):
         self.data.iloc[:800000]
 
-    def time_ix_array(self, index):
+    def time_ix_array(self, index, index_structure):
         self.data.ix[self.array]
 
-    def time_ix_list_like(self, index):
+    def time_ix_list_like(self, index, index_structure):
         self.data.ix[[800000]]
 
-    def time_ix_scalar(self, index):
+    def time_ix_scalar(self, index, index_structure):
         self.data.ix[800000]
 
-    def time_ix_slice(self, index):
+    def time_ix_slice(self, index, index_structure):
         self.data.ix[:800000]
 
-    def time_loc_array(self, index):
+    def time_loc_array(self, index, index_structure):
         self.data.loc[self.array]
 
-    def time_loc_list_like(self, index):
+    def time_loc_list_like(self, index, index_structure):
         self.data.loc[[800000]]
 
-    def time_loc_scalar(self, index):
+    def time_loc_scalar(self, index, index_structure):
         self.data.loc[800000]
 
-    def time_loc_slice(self, index):
+    def time_loc_slice(self, index, index_structure):
         self.data.loc[:800000]
 
 
 class NonNumericSeriesIndexing(object):
 
     goal_time = 0.2
-    params = ['string', 'datetime']
-    param_names = ['index']
+    params = [
+        ('string', 'datetime'),
+        ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+    ]
+    param_names = ['index dtype', 'index structure']
 
-    def setup(self, index):
-        N = 10**5
+    def setup(self, index, index_structure):
+        N = 10**6
         indexes = {'string': tm.makeStringIndex(N),
                    'datetime': date_range('1900', periods=N, freq='s')}
         index = indexes[index]
+        if index_structure == 'nonunique_monotonic_inc':
+            index = index.insert(item=index[2], loc=2)[:-1]
         self.s = Series(np.random.rand(N), index=index)
         self.lbl = index[80000]
 
-    def time_getitem_label_slice(self, index):
+    def time_getitem_label_slice(self, index, index_structure):
         self.s[:self.lbl]
 
-    def time_getitem_pos_slice(self, index):
+    def time_getitem_pos_slice(self, index, index_structure):
         self.s[:80000]
 
-    def time_get_value(self, index):
+    def time_get_value(self, index, index_structure):
         with warnings.catch_warnings(record=True):
             self.s.get_value(self.lbl)
 
-    def time_getitem_scalar(self, index):
+    def time_getitem_scalar(self, index, index_structure):
         self.s[self.lbl]
 
+    def time_getitem_list_like(self, index, index_structure):
+        self.s[[self.lbl]]
+
 
 class DataFrameStringIndexing(object):
 

From 4ad3006b7fa6b41a0c4d32c05272bb8cde786597 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Wed, 26 Sep 2018 11:17:47 -0700
Subject: [PATCH 3/3] CLN: use underscore in asv param names

---
 asv_bench/benchmarks/indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 8c6b0cccdff7b..c5b147b152aa6 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -15,7 +15,7 @@ class NumericSeriesIndexing(object):
         (Int64Index, Float64Index),
         ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
     ]
-    param_names = ['index dtype', 'index structure']
+    param_names = ['index_dtype', 'index_structure']
 
     def setup(self, index, index_structure):
         N = 10**6
@@ -87,7 +87,7 @@ class NonNumericSeriesIndexing(object):
         ('string', 'datetime'),
         ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
     ]
-    param_names = ['index dtype', 'index structure']
+    param_names = ['index_dtype', 'index_structure']
 
     def setup(self, index, index_structure):
         N = 10**6