From d6bcef84227636ada9e67a10e424f2cd3dc6c744 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Tue, 21 Feb 2017 10:12:59 +0000
Subject: [PATCH 1/4] fast non-unique indexing

---
 pandas/index.pyx       | 121 +++++++++++++++++++++++++++++++++++++++++
 pandas/indexes/base.py |  32 ++++++++++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 37fe7d90bebe0..3045fa94d5497 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -44,6 +44,117 @@ PyDateTime_IMPORT
 
 cdef extern from "Python.h":
     int PySlice_Check(object)
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+            int64_t[:] mapping_count, int64_t[:] missing_count):
+
+    cdef:
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
+
+    while i < n_v and j < n_t:
+
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
+
+        if val0 == val1:
+
+            while i < n_v and values[idx0[i]] == val1:
+                i += 1
+                mapping_count[idx1[j]] += 1
+
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
+                j += 1
+
+        elif val0 > val1:
+
+            mapping_count[idx1[j]] += 1
+            missing_count[idx1[j]] = 1
+            j += 1
+
+        else:
+            i += 1
+
+    while j < n_t:
+        mapping_count[idx1[j]] += 1
+        missing_count[idx1[j]] = 1
+        j += 1
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+          int64_t[:] start_mapping, int64_t[:] start_missing,
+          int64_t[:] mapping, int64_t[:] missing):
+
+    cdef:
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
+        int64_t c
+
+    while i < n_v and j < n_t:
+
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
+
+        if val0 == val1:
+
+            c = 0
+            while i < n_v and values[idx0[i]] == val1:
+                mapping[start_mapping[idx1[j]] + c] = idx0[i]
+                i += 1
+                c += 1
+
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                for ii in range(c):
+                    mapping[start_mapping[idx1[j]] + ii] = \
+                        mapping[start_mapping[idx1[j-1]] + ii]
+                j += 1
+
+        elif val0 > val1:
+
+            mapping[start_mapping[idx1[j]]] = -1
+            missing[start_missing[idx1[j]]] = idx1[j]
+            j += 1
+
+        else:
+            i += 1
+
+    while j < n_t:
+
+        mapping[start_mapping[idx1[j]]] = -1
+        missing[start_missing[idx1[j]]] = idx1[j]
+        j += 1
+
+def _map_targets_to_values(values, targets, idx0, idx1):
+    mapping_count = np.zeros(len(targets), np.int64)
+    missing_count = np.zeros(len(targets), np.int64)
+
+    _count(values, targets, idx0, idx1, mapping_count, missing_count)
+
+    np.cumsum(mapping_count, out=mapping_count)
+    np.cumsum(missing_count, out=missing_count)
+
+    mapping = np.empty(mapping_count[-1], np.int64)
+    missing = np.empty(missing_count[-1], np.int64)
+
+    mapping_count[1:] = mapping_count[:-1]
+    mapping_count[0] = 0
+    missing_count -= 1
+
+    _map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
+         missing)
+
+    return mapping, missing
 
 cdef inline is_definitely_invalid_key(object val):
     if PyTuple_Check(val):
@@ -371,6 +482,16 @@ cdef class IndexEngine:
 
         return result[0:count], missing[0:count_missing]
 
+    def get_indexer_non_unique_orderable(self, ndarray targets,
+                                         int64_t[:] idx0,
+                                         int64_t[:] idx1):
+
+        cdef:
+            ndarray values
+
+        self._ensure_mapping_populated()
+        values = np.array(self._get_index_values(), copy=False)
+        return _map_targets_to_values(values, targets, idx0, idx1)
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
     cdef:
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index f1f37622b2a74..f77a888d2f0a0 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2530,10 +2530,40 @@ def get_indexer_non_unique(self, target):
         if self.is_all_dates:
             self = Index(self.asi8)
             tgt_values = target.asi8
+            src_values = self.asi8
         else:
             tgt_values = target._values
+            src_values = self._values
+
+        try:
+            src_values[0] < tgt_values[0]
+            src_values[0] > tgt_values[0]
+        except (TypeError, IndexError):
+            orderable = False
+        else:
+            try:
+                if self.is_monotonic_increasing:
+                    idx0 = np.arange(len(src_values), dtype=np.int64)
+                else:
+                    idx0 = np.argsort(src_values, kind='mergesort')
+                    idx0 = np.asarray(idx0, dtype=np.int64)
+
+                if target.is_monotonic_increasing:
+                    idx1 = np.arange(len(tgt_values), dtype=np.int64)
+                else:
+                    idx1 = np.argsort(tgt_values, kind='mergesort')
+                    idx1 = np.asarray(idx1, dtype=np.int64)
+
+            except TypeError:
+                orderable = False
+            else:
+                orderable = True
+
+        if orderable:
+            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
+        else:
+            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
 
-        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
         return Index(indexer), missing
 
     def get_indexer_for(self, target, **kwargs):
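
A rough pure-Python sketch of what the `_count`/`_map` pair above computes may help review. It walks both argsorted sequences and, per target, collects either the matching positions in `values` or a `-1` sentinel plus a `missing` entry. This is illustrative only: the function name and the dict-based bookkeeping are not part of the patch, and the Cython version does the same work in two flat passes over preallocated arrays instead.

.. code-block:: python

    import numpy as np

    def map_targets_to_values_sketch(values, targets):
        # argsort both sides, as get_indexer_non_unique does before calling
        # the engine (identity permutation when already monotonic)
        idx0 = np.argsort(values, kind='mergesort')
        idx1 = np.argsort(targets, kind='mergesort')

        blocks = {}      # target position -> list of positions in `values`
        missing = []
        i = 0
        for j in idx1:                      # targets in sorted order
            val1 = targets[j]
            while i < len(idx0) and values[idx0[i]] < val1:
                i += 1                      # skip smaller `values` entries
            k, block = i, []
            while k < len(idx0) and values[idx0[k]] == val1:
                block.append(idx0[k])       # every position holding val1
                k += 1
            if block:
                blocks[j] = block
            else:
                blocks[j] = [-1]            # sentinel, as in _map
                missing.append(j)

        # flatten the blocks back into original target order; missing target
        # positions are reported in ascending order, as in the Cython code
        mapping = [p for j in range(len(targets)) for p in blocks[j]]
        return (np.array(mapping, dtype=np.int64),
                np.array(sorted(missing), dtype=np.int64))

    values = np.array([5, 4, 5, 3, 3, 5, 1])
    targets = np.array([3, 5, 1, 2, 1])
    print(map_targets_to_values_sketch(values, targets))
    # expected: (array([ 3,  4,  0,  2,  5,  6, -1,  6]), array([3]))
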
From dd14ddf994c809f0ff1a3314be92fe773bae67a0 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Wed, 22 Feb 2017 12:19:31 +0000
Subject: [PATCH 2/4] order them

---
 pandas/indexes/base.py | 73 +++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index f77a888d2f0a0..4f741d94ac219 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2535,34 +2535,17 @@ def get_indexer_non_unique(self, target):
             tgt_values = target._values
             src_values = self._values
 
-        try:
-            src_values[0] < tgt_values[0]
-            src_values[0] > tgt_values[0]
-        except (TypeError, IndexError):
-            orderable = False
-        else:
-            try:
-                if self.is_monotonic_increasing:
-                    idx0 = np.arange(len(src_values), dtype=np.int64)
-                else:
-                    idx0 = np.argsort(src_values, kind='mergesort')
-                    idx0 = np.asarray(idx0, dtype=np.int64)
-
-                if target.is_monotonic_increasing:
-                    idx1 = np.arange(len(tgt_values), dtype=np.int64)
-                else:
-                    idx1 = np.argsort(tgt_values, kind='mergesort')
-                    idx1 = np.asarray(idx1, dtype=np.int64)
-
-            except TypeError:
-                orderable = False
-            else:
-                orderable = True
+        is_mono0 = self.is_monotonic_increasing
+        is_mono1 = target.is_monotonic_increasing
+        (orderable, idx0, idx1) = _order_them(is_mono0, src_values, is_mono1,
+                                              tgt_values)
 
+        e = self._engine
         if orderable:
-            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
+            indexer, missing = e.get_indexer_non_unique_orderable(tgt_values,
+                                                                  idx0, idx1)
         else:
-            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+            indexer, missing = e.get_indexer_non_unique(tgt_values)
 
         return Index(indexer), missing
 
@@ -3905,3 +3888,43 @@ def _trim_front(strings):
 def _validate_join_method(method):
     if method not in ['left', 'right', 'inner', 'outer']:
         raise ValueError('do not recognize join method %s' % method)
+
+
+def _order_it(is_mono, x):
+    """Tries to sort a sequence."""
+    if is_mono:
+        return (True, np.arange(len(x), dtype=np.int64))
+    try:
+        indices = np.argsort(x, kind='mergesort')
+    except TypeError:
+        return (False, None)
+    return (True, indices)
+
+
+def _order_them(x_is_mono, x, y_is_mono, y):
+    """
+    Tries to sort `x` and `y`, checking whether they are jointly orderable.
+    """
+    (xorderable, xindices) = _order_it(x_is_mono, x)
+    (yorderable, yindices) = _order_it(y_is_mono, y)
+    ok = xorderable and yorderable and _are_orderable(x, y)
+    if ok:
+        xindices = np.asarray(xindices, dtype=np.int64)
+        yindices = np.asarray(yindices, dtype=np.int64)
+    return (ok, xindices, yindices)
+
+
+def _are_orderable(x, y):
+    """Are `x` and `y` jointly orderable?
+
+    We assume that `x` and `y` are indeed independently orderable. In this
+    case, they are also jointly orderable if their first elements are
+    orderable.
+    """
+    if len(x) > 0 and len(y) > 0:
+        try:
+            x[0] < y[0]
+            x[0] > y[0]
+        except TypeError:
+            return False
+    return True
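
An end-to-end check of the fast path refactored here might look like the following (illustrative only; the printed values are what one would expect for these inputs, and they must match what the existing hashtable-based `get_indexer_non_unique` returns). The index is non-unique and not monotonic, so `_order_them` sorts both sides and the engine's `get_indexer_non_unique_orderable` is used.

.. code-block:: python

    import numpy as np
    import pandas as pd

    idx = pd.Index([5, 4, 5, 3, 3, 5, 1])      # non-unique, not monotonic
    target = pd.Index([3, 5, 1, 2, 1])

    indexer, missing = idx.get_indexer_non_unique(target)
    print(np.asarray(indexer))   # expected: [ 3  4  0  2  5  6 -1  6]
    print(np.asarray(missing))   # expected: [3]
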
From e39482928c9541f04357879ae8cd62b0a54358df Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Fri, 24 Feb 2017 15:44:23 +0000
Subject: [PATCH 3/4] zero length issue

---
 pandas/index.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 3045fa94d5497..63b33d727bb23 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -136,8 +136,13 @@ cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
         j += 1
 
 def _map_targets_to_values(values, targets, idx0, idx1):
-    mapping_count = np.zeros(len(targets), np.int64)
-    missing_count = np.zeros(len(targets), np.int64)
+    n_t = len(targets)
+
+    mapping_count = np.zeros(n_t, np.int64)
+    missing_count = np.zeros(n_t, np.int64)
+
+    if n_t == 0:
+        return np.empty(0, np.int64), np.empty(0, np.int64)
 
     _count(values, targets, idx0, idx1, mapping_count, missing_count)
 
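
The guard added above matters because `mapping_count[-1]` on a zero-length array would raise an IndexError before `mapping` could be allocated. A minimal reproduction one might run while reviewing (illustrative only):

.. code-block:: python

    import pandas as pd

    idx = pd.Index([3, 3, 1])                  # non-unique index
    indexer, missing = idx.get_indexer_non_unique(pd.Index([]))
    print(len(indexer), len(missing))          # expected: 0 0
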
From 62cd783795986633d703cb33adba7ced3b2ad14b Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Fri, 24 Feb 2017 16:21:19 +0000
Subject: [PATCH 4/4] doc

---
 pandas/index.pyx | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 63b33d727bb23..2cb1243e9ca83 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -49,6 +49,36 @@ cdef extern from "Python.h":
 @cython.initializedcheck(False)
 cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
             int64_t[:] mapping_count, int64_t[:] missing_count):
+    """
+    Compute the number of times a `targets` value is in `values`.
+
+    We also want an indication for the case in which a `targets` value is not
+    found in `values`.
+
+    Parameters
+    ----------
+    values : Sequence of comparable values.
+    targets : Sequence of comparable values.
+    idx0 : np.argsort on `values`.
+    idx1 : np.argsort on `targets`.
+    mapping_count : Occurrence count in `values` for each `targets` element.
+    missing_count : Flags (0/1) `targets` elements not found in `values`.
+
+    Examples
+    --------
+    Let
+
+    .. code-block:: python
+
+        >>> values = np.array([5, 4, 5, 3, 3, 5, 1])
+        >>> targets = np.array([3, 5, 1, 2, 1])
+
+    be two integer arrays. The first element of `targets` occurs twice in
+    `values` and the second one three times. The fourth one is not found in
+    `values`, so it gets a count of 1 to reserve a slot for the -1 sentinel.
+    The resulting `mapping_count` will be `np.array([2, 3, 1, 1, 1])` and the
+    resulting `missing_count` will be `np.array([0, 0, 0, 1, 0])`.
+    """
 
     cdef:
         int64_t n_v = values.shape[0]
@@ -92,6 +122,39 @@ cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
 cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
           int64_t[:] start_mapping, int64_t[:] start_missing,
           int64_t[:] mapping, int64_t[:] missing):
+    """
+    Map `targets` values to `values` positions.
+
+    We also want to know the indices of `targets` whose values are not found
+    in `values`.
+
+    Parameters
+    ----------
+    values : Sequence of comparable values.
+    targets : Sequence of comparable values.
+    idx0 : np.argsort on `values`.
+    idx1 : np.argsort on `targets`.
+    start_mapping : Start offset in `mapping` for each `targets` element.
+    start_missing : Start offset in `missing` for each `targets` element.
+    mapping : `values` indices of `targets` values found in `values`.
+    missing : `targets` indices of `targets` values not found in `values`.
+
+    Examples
+    --------
+    Let
+
+    .. code-block:: python
+
+        >>> values = np.array([5, 4, 5, 3, 3, 5, 1])
+        >>> targets = np.array([3, 5, 1, 2, 1])
+
+    be two integer arrays. The first block of `mapping` corresponds to the
+    first element of `targets` and will have size `2`. The second block of
+    `mapping` corresponds to the second element of `targets` and will have
+    size `3`. The resulting map will be
+    `mapping = np.array([3, 4, 0, 2, 5, 6, -1, 6])`. Moreover, the resulting
+    `missing` will be `np.array([3])`.
+    """
 
     cdef:
         int64_t n_v = values.shape[0]
@@ -136,6 +199,11 @@ cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
         j += 1
 
 def _map_targets_to_values(values, targets, idx0, idx1):
+    """
+    Map `targets` values to `values` positions.
+
+    Please refer to function `_map` for a complete description.
+    """
     n_t = len(targets)
 
     mapping_count = np.zeros(n_t, np.int64)
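
The offset arithmetic that `_map_targets_to_values` performs between the two passes can be spelled out with the docstring's example (an illustrative recomputation, not part of the patch): the inclusive cumulative sum of the per-target counts is shifted into an exclusive prefix sum, giving each target's starting slot in `mapping`, while the cumulative sum of the missing flags minus one gives each missing target's write position in `missing`.

.. code-block:: python

    import numpy as np

    # per-target results of _count for values = [5, 4, 5, 3, 3, 5, 1],
    # targets = [3, 5, 1, 2, 1]
    mapping_count = np.array([2, 3, 1, 1, 1], dtype=np.int64)
    missing_count = np.array([0, 0, 0, 1, 0], dtype=np.int64)

    np.cumsum(mapping_count, out=mapping_count)   # [2, 5, 6, 7, 8]
    n_mapping = mapping_count[-1]                 # mapping needs 8 slots
    mapping_count[1:] = mapping_count[:-1]
    mapping_count[0] = 0                          # start offsets [0, 2, 5, 6, 7]

    np.cumsum(missing_count, out=missing_count)
    n_missing = missing_count[-1]                 # missing needs 1 slot
    missing_count -= 1                            # write position per target

    print(n_mapping, mapping_count)               # expected: 8 [0 2 5 6 7]
    print(n_missing, missing_count)               # expected: 1 [-1 -1 -1  0  0]
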