From d6bcef84227636ada9e67a10e424f2cd3dc6c744 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Tue, 21 Feb 2017 10:12:59 +0000
Subject: [PATCH 1/4] fast non-unique indexing

---
 pandas/index.pyx       | 121 +++++++++++++++++++++++++++++++++++++++++
 pandas/indexes/base.py |  32 ++++++++++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 37fe7d90bebe0..3045fa94d5497 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -44,6 +44,117 @@ PyDateTime_IMPORT
 
 cdef extern from "Python.h":
     int PySlice_Check(object)
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+            int64_t[:] mapping_count, int64_t[:] missing_count):
+
+    cdef:
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
+
+    while i < n_v and j < n_t:
+
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
+
+        if val0 == val1:
+
+            while i < n_v and values[idx0[i]] == val1:
+                i += 1
+                mapping_count[idx1[j]] += 1
+
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
+                j += 1
+
+        elif val0 > val1:
+
+            mapping_count[idx1[j]] += 1
+            missing_count[idx1[j]] = 1
+            j += 1
+
+        else:
+            i += 1
+
+    while j < n_t:
+        mapping_count[idx1[j]] += 1
+        missing_count[idx1[j]] = 1
+        j += 1
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
+          int64_t[:] start_mapping, int64_t[:] start_missing,
+          int64_t[:] mapping, int64_t[:] missing):
+
+    cdef:
+        int64_t n_v = values.shape[0]
+        int64_t n_t = targets.shape[0]
+        int64_t i = 0
+        int64_t j = 0
+        int64_t c
+
+    while i < n_v and j < n_t:
+
+        val0 = values[idx0[i]]
+        val1 = targets[idx1[j]]
+
+        if val0 == val1:
+
+            c = 0
+            while i < n_v and values[idx0[i]] == val1:
+                mapping[start_mapping[idx1[j]] + c] = idx0[i]
+                i += 1
+                c += 1
+
+            j += 1
+            while j < n_t and val0 == targets[idx1[j]]:
+                for ii in range(c):
+                    mapping[start_mapping[idx1[j]] + ii] = \
+                        mapping[start_mapping[idx1[j-1]] + ii]
+                j += 1
+
+        elif val0 > val1:
+
+            mapping[start_mapping[idx1[j]]] = -1
+            missing[start_missing[idx1[j]]] = idx1[j]
+            j += 1
+
+        else:
+            i += 1
+
+    while j < n_t:
+
+        mapping[start_mapping[idx1[j]]] = -1
+        missing[start_missing[idx1[j]]] = idx1[j]
+        j += 1
+
+def _map_targets_to_values(values, targets, idx0, idx1):
+    mapping_count = np.zeros(len(targets), np.int64)
+    missing_count = np.zeros(len(targets), np.int64)
+
+    _count(values, targets, idx0, idx1, mapping_count, missing_count)
+
+    np.cumsum(mapping_count, out=mapping_count)
+    np.cumsum(missing_count, out=missing_count)
+
+    mapping = np.empty(mapping_count[-1], np.int64)
+    missing = np.empty(missing_count[-1], np.int64)
+
+    mapping_count[1:] = mapping_count[:-1]
+    mapping_count[0] = 0
+    missing_count -= 1
+
+    _map(values, targets, idx0, idx1, mapping_count, missing_count, mapping,
+         missing)
+
+    return mapping, missing
 
 cdef inline is_definitely_invalid_key(object val):
     if PyTuple_Check(val):
@@ -371,6 +482,16 @@ cdef class IndexEngine:
 
         return result[0:count], missing[0:count_missing]
 
+    def get_indexer_non_unique_orderable(self, ndarray targets,
+                                         int64_t[:] idx0,
+                                         int64_t[:] idx1):
+
+        cdef:
+            ndarray values
+
+        self._ensure_mapping_populated()
+        values = np.array(self._get_index_values(), copy=False)
+        return _map_targets_to_values(values, targets, idx0, idx1)
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
     cdef:
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index f1f37622b2a74..f77a888d2f0a0 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2530,10 +2530,40 @@ def get_indexer_non_unique(self, target):
         if self.is_all_dates:
             self = Index(self.asi8)
             tgt_values = target.asi8
+            src_values = self.asi8
         else:
             tgt_values = target._values
+            src_values = self._values
+
+        try:
+            src_values[0] < tgt_values[0]
+            src_values[0] > tgt_values[0]
+        except (TypeError, IndexError):
+            orderable = False
+        else:
+            try:
+                if self.is_monotonic_increasing:
+                    idx0 = np.arange(len(src_values), dtype=np.int64)
+                else:
+                    idx0 = np.argsort(src_values, kind='mergesort')
+                    idx0 = np.asarray(idx0, dtype=np.int64)
+
+                if target.is_monotonic_increasing:
+                    idx1 = np.arange(len(tgt_values), dtype=np.int64)
+                else:
+                    idx1 = np.argsort(tgt_values, kind='mergesort')
+                    idx1 = np.asarray(idx1, dtype=np.int64)
+
+            except TypeError:
+                orderable = False
+            else:
+                orderable = True
+
+        if orderable:
+            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
+        else:
+            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
 
-        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
         return Index(indexer), missing
 
     def get_indexer_for(self, target, **kwargs):
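
A rough pure-Python sketch of what the `_count`/`_map` pair above computes may help review. It walks both argsorted sequences and, per target, collects either the matching positions in `values` or a `-1` sentinel plus a `missing` entry. This is illustrative only: the function name and the dict-based bookkeeping are not part of the patch, and the Cython version does the same work in two flat passes over preallocated arrays instead.

.. code-block:: python

    import numpy as np

    def map_targets_to_values_sketch(values, targets):
        # argsort both sides, as get_indexer_non_unique does before calling
        # the engine (identity permutation when already monotonic)
        idx0 = np.argsort(values, kind='mergesort')
        idx1 = np.argsort(targets, kind='mergesort')

        blocks = {}      # target position -> list of positions in `values`
        missing = []
        i = 0
        for j in idx1:                      # targets in sorted order
            val1 = targets[j]
            while i < len(idx0) and values[idx0[i]] < val1:
                i += 1                      # skip smaller `values` entries
            k, block = i, []
            while k < len(idx0) and values[idx0[k]] == val1:
                block.append(idx0[k])       # every position holding val1
                k += 1
            if block:
                blocks[j] = block
            else:
                blocks[j] = [-1]            # sentinel, as in _map
                missing.append(j)

        # flatten the blocks back into original target order; missing target
        # positions are reported in ascending order, as in the Cython code
        mapping = [p for j in range(len(targets)) for p in blocks[j]]
        return (np.array(mapping, dtype=np.int64),
                np.array(sorted(missing), dtype=np.int64))

    values = np.array([5, 4, 5, 3, 3, 5, 1])
    targets = np.array([3, 5, 1, 2, 1])
    print(map_targets_to_values_sketch(values, targets))
    # expected: (array([ 3,  4,  0,  2,  5,  6, -1,  6]), array([3]))
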
From dd14ddf994c809f0ff1a3314be92fe773bae67a0 Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Wed, 22 Feb 2017 12:19:31 +0000
Subject: [PATCH 2/4] order them

---
 pandas/indexes/base.py | 73 +++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index f77a888d2f0a0..4f741d94ac219 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -2535,34 +2535,17 @@ def get_indexer_non_unique(self, target):
             tgt_values = target._values
             src_values = self._values
 
-        try:
-            src_values[0] < tgt_values[0]
-            src_values[0] > tgt_values[0]
-        except (TypeError, IndexError):
-            orderable = False
-        else:
-            try:
-                if self.is_monotonic_increasing:
-                    idx0 = np.arange(len(src_values), dtype=np.int64)
-                else:
-                    idx0 = np.argsort(src_values, kind='mergesort')
-                    idx0 = np.asarray(idx0, dtype=np.int64)
-
-                if target.is_monotonic_increasing:
-                    idx1 = np.arange(len(tgt_values), dtype=np.int64)
-                else:
-                    idx1 = np.argsort(tgt_values, kind='mergesort')
-                    idx1 = np.asarray(idx1, dtype=np.int64)
-
-            except TypeError:
-                orderable = False
-            else:
-                orderable = True
+        is_mono0 = self.is_monotonic_increasing
+        is_mono1 = target.is_monotonic_increasing
+        (orderable, idx0, idx1) = _order_them(is_mono0, src_values, is_mono1,
+                                              tgt_values)
 
+        e = self._engine
         if orderable:
-            indexer, missing = self._engine.get_indexer_non_unique_orderable(tgt_values, idx0, idx1)
+            indexer, missing = e.get_indexer_non_unique_orderable(tgt_values,
+                                                                  idx0, idx1)
         else:
-            indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+            indexer, missing = e.get_indexer_non_unique(tgt_values)
 
         return Index(indexer), missing
 
@@ -3905,3 +3888,43 @@ def _trim_front(strings):
 def _validate_join_method(method):
     if method not in ['left', 'right', 'inner', 'outer']:
         raise ValueError('do not recognize join method %s' % method)
+
+
+def _order_it(is_mono, x):
+    """Tries to sort a sequence."""
+    if is_mono:
+        return (True, np.arange(len(x), dtype=np.int64))
+    try:
+        indices = np.argsort(x, kind='mergesort')
+    except TypeError:
+        return (False, None)
+    return (True, indices)
+
+
+def _order_them(x_is_mono, x, y_is_mono, y):
+    """
+    Tries to sort `x` and `y`, checking whether they are jointly orderable.
+    """
+    (xorderable, xindices) = _order_it(x_is_mono, x)
+    (yorderable, yindices) = _order_it(y_is_mono, y)
+    ok = xorderable and yorderable and _are_orderable(x, y)
+    if ok:
+        xindices = np.asarray(xindices, dtype=np.int64)
+        yindices = np.asarray(yindices, dtype=np.int64)
+    return (ok, xindices, yindices)
+
+
+def _are_orderable(x, y):
+    """Are `x` and `y` jointly orderable?
+
+    We assume that `x` and `y` are indeed independently orderable. In this
+    case, they are also jointly orderable if their first elements are
+    orderable.
+    """
+    if len(x) > 0 and len(y) > 0:
+        try:
+            x[0] < y[0]
+            x[0] > y[0]
+        except TypeError:
+            return False
+    return True
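
An end-to-end check of the fast path refactored here might look like the following (illustrative only; the printed values are what one would expect for these inputs, and they must match what the existing hashtable-based `get_indexer_non_unique` returns). The index is non-unique and not monotonic, so `_order_them` sorts both sides and the engine's `get_indexer_non_unique_orderable` is used.

.. code-block:: python

    import numpy as np
    import pandas as pd

    idx = pd.Index([5, 4, 5, 3, 3, 5, 1])      # non-unique, not monotonic
    target = pd.Index([3, 5, 1, 2, 1])

    indexer, missing = idx.get_indexer_non_unique(target)
    print(np.asarray(indexer))   # expected: [ 3  4  0  2  5  6 -1  6]
    print(np.asarray(missing))   # expected: [3]
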
From e39482928c9541f04357879ae8cd62b0a54358df Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Fri, 24 Feb 2017 15:44:23 +0000
Subject: [PATCH 3/4] zero length issue

---
 pandas/index.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 3045fa94d5497..63b33d727bb23 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -136,8 +136,13 @@ cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
         j += 1
 
 def _map_targets_to_values(values, targets, idx0, idx1):
-    mapping_count = np.zeros(len(targets), np.int64)
-    missing_count = np.zeros(len(targets), np.int64)
+    n_t = len(targets)
+
+    mapping_count = np.zeros(n_t, np.int64)
+    missing_count = np.zeros(n_t, np.int64)
+
+    if n_t == 0:
+        return np.empty(0, np.int64), np.empty(0, np.int64)
 
     _count(values, targets, idx0, idx1, mapping_count, missing_count)
 
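
The guard added above matters because `mapping_count[-1]` on a zero-length array would raise an IndexError before `mapping` could be allocated. A minimal reproduction one might run while reviewing (illustrative only):

.. code-block:: python

    import pandas as pd

    idx = pd.Index([3, 3, 1])                  # non-unique index
    indexer, missing = idx.get_indexer_non_unique(pd.Index([]))
    print(len(indexer), len(missing))          # expected: 0 0
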
From 62cd783795986633d703cb33adba7ced3b2ad14b Mon Sep 17 00:00:00 2001
From: Danilo Horta
Date: Fri, 24 Feb 2017 16:21:19 +0000
Subject: [PATCH 4/4] doc

---
 pandas/index.pyx | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/pandas/index.pyx b/pandas/index.pyx
index 63b33d727bb23..2cb1243e9ca83 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -49,6 +49,36 @@ cdef extern from "Python.h":
 @cython.initializedcheck(False)
 cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
             int64_t[:] mapping_count, int64_t[:] missing_count):
+    """
+    Compute the number of times a `targets` value is in `values`.
+
+    We also want an indication for the case in which a `targets` value is not
+    found in `values`.
+
+    Parameters
+    ----------
+    values : Sequence of comparable values.
+    targets : Sequence of comparable values.
+    idx0 : np.argsort on `values`.
+    idx1 : np.argsort on `targets`.
+    mapping_count : Occurrence count in `values` for each `targets` element.
+    missing_count : Flags (0/1) `targets` elements not found in `values`.
+
+    Examples
+    --------
+    Let
+
+    .. code-block:: python
+
+        >>> values = np.array([5, 4, 5, 3, 3, 5, 1])
+        >>> targets = np.array([3, 5, 1, 2, 1])
+
+    be two integer arrays. The first element of `targets` occurs twice in
+    `values` and the second one three times. The fourth one is not found in
+    `values`, so it gets a count of 1 to reserve a slot for the -1 sentinel.
+    The resulting `mapping_count` will be `np.array([2, 3, 1, 1, 1])` and the
+    resulting `missing_count` will be `np.array([0, 0, 0, 1, 0])`.
+    """
 
     cdef:
         int64_t n_v = values.shape[0]
@@ -92,6 +122,39 @@ cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
 cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
           int64_t[:] start_mapping, int64_t[:] start_missing,
           int64_t[:] mapping, int64_t[:] missing):
+    """
+    Map `targets` values to `values` positions.
+
+    We also want to know the indices of `targets` whose values are not found
+    in `values`.
+
+    Parameters
+    ----------
+    values : Sequence of comparable values.
+    targets : Sequence of comparable values.
+    idx0 : np.argsort on `values`.
+    idx1 : np.argsort on `targets`.
+    start_mapping : Start offset in `mapping` for each `targets` element.
+    start_missing : Start offset in `missing` for each `targets` element.
+    mapping : `values` indices of `targets` values found in `values`.
+    missing : `targets` indices of `targets` values not found in `values`.
+
+    Examples
+    --------
+    Let
+
+    .. code-block:: python
+
+        >>> values = np.array([5, 4, 5, 3, 3, 5, 1])
+        >>> targets = np.array([3, 5, 1, 2, 1])
+
+    be two integer arrays. The first block of `mapping` corresponds to the
+    first element of `targets` and will have size `2`. The second block of
+    `mapping` corresponds to the second element of `targets` and will have
+    size `3`. The resulting map will be
+    `mapping = np.array([3, 4, 0, 2, 5, 6, -1, 6])`. Moreover, the resulting
+    `missing` will be `np.array([3])`.
+    """
 
     cdef:
         int64_t n_v = values.shape[0]
@@ -136,6 +199,11 @@ cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
         j += 1
 
 def _map_targets_to_values(values, targets, idx0, idx1):
+    """
+    Map `targets` values to `values` positions.
+
+    Please refer to function `_map` for a complete description.
+    """
     n_t = len(targets)
 
     mapping_count = np.zeros(n_t, np.int64)
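
The offset arithmetic that `_map_targets_to_values` performs between the two passes can be spelled out with the docstring's example (an illustrative recomputation, not part of the patch): the inclusive cumulative sum of the per-target counts is shifted into an exclusive prefix sum, giving each target's starting slot in `mapping`, while the cumulative sum of the missing flags minus one gives each missing target's write position in `missing`.

.. code-block:: python

    import numpy as np

    # per-target results of _count for values = [5, 4, 5, 3, 3, 5, 1],
    # targets = [3, 5, 1, 2, 1]
    mapping_count = np.array([2, 3, 1, 1, 1], dtype=np.int64)
    missing_count = np.array([0, 0, 0, 1, 0], dtype=np.int64)

    np.cumsum(mapping_count, out=mapping_count)   # [2, 5, 6, 7, 8]
    n_mapping = mapping_count[-1]                 # mapping needs 8 slots
    mapping_count[1:] = mapping_count[:-1]
    mapping_count[0] = 0                          # start offsets [0, 2, 5, 6, 7]

    np.cumsum(missing_count, out=missing_count)
    n_missing = missing_count[-1]                 # missing needs 1 slot
    missing_count -= 1                            # write position per target

    print(n_mapping, mapping_count)               # expected: 8 [0 2 5 6 7]
    print(n_missing, missing_count)               # expected: 1 [-1 -1 -1  0  0]
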