From 8c525415eea433fcc1d5194c994cfbd23ea3c2ab Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 17 Oct 2018 16:05:05 -0700 Subject: [PATCH 1/2] use fused types for the rest of libjoin --- pandas/_libs/join.pyx | 360 +++++++++++++++++++++++- pandas/_libs/join_func_helper.pxi.in | 405 --------------------------- pandas/core/reshape/merge.py | 13 +- setup.py | 4 +- 4 files changed, 363 insertions(+), 419 deletions(-) delete mode 100644 pandas/_libs/join_func_helper.pxi.in diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 7c791ab8a1b00..eeaad6c7318e2 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -17,8 +17,6 @@ cdef double nan = NaN from pandas._libs.algos import groupsort_indexer, ensure_platform_int from pandas.core.algorithms import take_nd -include "join_func_helper.pxi" - def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): @@ -309,8 +307,8 @@ left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] -# @cython.wraparound(False) -# @cython.boundscheck(False) +@cython.wraparound(False) +@cython.boundscheck(False) def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges @@ -656,3 +654,357 @@ outer_join_indexer_object = outer_join_indexer["object"] outer_join_indexer_int32 = outer_join_indexer["int32_t"] outer_join_indexer_int64 = outer_join_indexer["int64_t"] outer_join_indexer_uint64 = outer_join_indexer["uint64_t"] + + +# ---------------------------------------------------------------------- +# asof_join_by +# ---------------------------------------------------------------------- + +from hashtable cimport ( + HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + +ctypedef fused asof_t: + uint8_t + uint16_t + uint32_t + uint64_t + int8_t + int16_t + int32_t + int64_t + float + double + +ctypedef fused by_t: + object + int64_t + uint64_t + + +def asof_join_backward_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + HashTable hash_table + by_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's + if allow_exact_matches: + while (right_pos < right_size and + right_values[right_pos] <= left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while (right_pos < right_size and + right_values[right_pos] < left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_forward_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + HashTable hash_table + by_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while (right_pos >= 0 and + right_values[right_pos] >= left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + else: + while (right_pos >= 0 and + right_values[right_pos] > left_values[left_pos]): + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = right_values[found_right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest_on_X_by_Y(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + asof_t bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward_on_X_by_Y(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + fli, fri = asof_join_forward_on_X_by_Y(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + + +# ---------------------------------------------------------------------- +# asof_join +# ---------------------------------------------------------------------- + +def asof_join_backward(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's + if allow_exact_matches: + while (right_pos < right_size and + right_values[right_pos] <= left_values[left_pos]): + right_pos += 1 + else: + while (right_pos < right_size and + right_values[right_pos] < left_values[left_pos]): + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_forward(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + asof_t tolerance_ = 0 + asof_t diff = 0 + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while (right_pos >= 0 and + right_values[right_pos] >= left_values[left_pos]): + right_pos -= 1 + else: + while (right_pos >= 0 and + right_values[right_pos] > left_values[left_pos]): + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = (right_pos + if right_pos != right_size else -1) + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != right_size: + diff = right_values[right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_nearest(ndarray[asof_t] left_values, + ndarray[asof_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + asof_t bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward(left_values, right_values, + allow_exact_matches, tolerance) + fli, fri = asof_join_forward(left_values, right_values, + allow_exact_matches, tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer diff --git a/pandas/_libs/join_func_helper.pxi.in b/pandas/_libs/join_func_helper.pxi.in deleted file mode 100644 index b7f604d2fc951..0000000000000 --- a/pandas/_libs/join_func_helper.pxi.in +++ /dev/null @@ -1,405 +0,0 @@ -# cython: boundscheck=False, wraparound=False -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# asof_join_by -#---------------------------------------------------------------------- - -from hashtable cimport PyObjectHashTable, UInt64HashTable, Int64HashTable - -{{py: - -# table_type, by_dtype -by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t'), - ('UInt64HashTable', 'uint64_t')] - -# on_dtype -on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', - 'int8_t', 'int16_t', 'int32_t', 'int64_t', - 'float', 'double'] - -}} - - -{{for table_type, by_dtype in by_dtypes}} -{{for on_dtype in on_dtypes}} - - -def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - {{table_type}} hash_table - {{by_dtype}} by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = {{table_type}}(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_forward_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - {{on_dtype}} tolerance_ = 0 - {{on_dtype}} diff = 0 - {{table_type}} hash_table - {{by_dtype}} by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = {{table_type}}(right_size) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = right_values[found_right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}( - ndarray[{{on_dtype}}] left_values, - ndarray[{{on_dtype}}] right_values, - ndarray[{{by_dtype}}] left_by_values, - ndarray[{{by_dtype}}] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri - {{on_dtype}} bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - # search both forward and backward - bli, bri =\ - asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) - fli, fri =\ - asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) - - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer - -{{endfor}} -{{endfor}} - - -# ---------------------------------------------------------------------- -# asof_join -# ---------------------------------------------------------------------- - -ctypedef fused asof_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float - double - - -def asof_join_backward(ndarray[asof_t] left_values, - ndarray[asof_t] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - asof_t tolerance_ = 0 - asof_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -asof_join_backward_uint8_t = asof_join_backward["uint8_t"] -asof_join_backward_uint16_t = asof_join_backward["uint16_t"] -asof_join_backward_uint32_t = asof_join_backward["uint32_t"] -asof_join_backward_uint64_t = asof_join_backward["uint64_t"] -asof_join_backward_int8_t = asof_join_backward["int8_t"] -asof_join_backward_int16_t = asof_join_backward["int16_t"] -asof_join_backward_int32_t = asof_join_backward["int32_t"] -asof_join_backward_int64_t = asof_join_backward["int64_t"] -asof_join_backward_float = asof_join_backward["float"] -asof_join_backward_double = asof_join_backward["double"] - - -def asof_join_forward(ndarray[asof_t] left_values, - ndarray[asof_t] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - asof_t tolerance_ = 0 - asof_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = (right_pos - if right_pos != right_size else -1) - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != right_size: - diff = right_values[right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -asof_join_forward_uint8_t = asof_join_forward["uint8_t"] -asof_join_forward_uint16_t = asof_join_forward["uint16_t"] -asof_join_forward_uint32_t = asof_join_forward["uint32_t"] -asof_join_forward_uint64_t = asof_join_forward["uint64_t"] -asof_join_forward_int8_t = asof_join_forward["int8_t"] -asof_join_forward_int16_t = asof_join_forward["int16_t"] -asof_join_forward_int32_t = asof_join_forward["int32_t"] -asof_join_forward_int64_t = asof_join_forward["int64_t"] -asof_join_forward_float = asof_join_forward["float"] -asof_join_forward_double = asof_join_forward["double"] - - -def asof_join_nearest(ndarray[asof_t] left_values, - ndarray[asof_t] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - # search both forward and backward - bli, bri = asof_join_backward(left_values, right_values, - allow_exact_matches, tolerance) - fli, fri = asof_join_forward(left_values, right_values, - allow_exact_matches, tolerance) - - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer - - -asof_join_nearest_uint8_t = asof_join_nearest["uint8_t"] -asof_join_nearest_uint16_t = asof_join_nearest["uint16_t"] -asof_join_nearest_uint32_t = asof_join_nearest["uint32_t"] -asof_join_nearest_uint64_t = asof_join_nearest["uint64_t"] -asof_join_nearest_int8_t = asof_join_nearest["int8_t"] -asof_join_nearest_int16_t = asof_join_nearest["int16_t"] -asof_join_nearest_int32_t = asof_join_nearest["int32_t"] -asof_join_nearest_int64_t = asof_join_nearest["int64_t"] -asof_join_nearest_float = asof_join_nearest["float"] -asof_join_nearest_double = asof_join_nearest["double"] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ed9466795f97f..6ff94075a661f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1190,14 +1190,13 @@ def get_result(self): return result -def _asof_function(direction, on_type): - name = 'asof_join_{dir}_{on}'.format(dir=direction, on=on_type) +def _asof_function(direction): + name = 'asof_join_{dir}'.format(dir=direction) return getattr(libjoin, name, None) -def _asof_by_function(direction, on_type, by_type): - name = 'asof_join_{dir}_{on}_by_{by}'.format( - dir=direction, on=on_type, by=by_type) +def _asof_by_function(direction): + name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction) return getattr(libjoin, name, None) @@ -1439,7 +1438,7 @@ def flip(xs): # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_by_function(self.direction, on_type, by_type) + func = _asof_by_function(self.direction) return func(left_values, right_values, left_by_values, @@ -1449,7 +1448,7 @@ def flip(xs): else: # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_function(self.direction, on_type) + func = _asof_function(self.direction) return func(left_values, right_values, self.allow_exact_matches, diff --git a/setup.py b/setup.py index adffddc61cbac..cb52db98905d3 100755 --- a/setup.py +++ b/setup.py @@ -76,7 +76,6 @@ def is_platform_windows(): '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'], 'groupby': ['_libs/groupby_helper.pxi.in'], - 'join': ['_libs/join_func_helper.pxi.in'], 'hashtable': ['_libs/hashtable_class_helper.pxi.in', '_libs/hashtable_func_helper.pxi.in'], 'index': ['_libs/index_class_helper.pxi.in'], @@ -531,8 +530,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): 'pyxfile': '_libs/interval', 'depends': _pxi_dep['interval']}, '_libs.join': { - 'pyxfile': '_libs/join', - 'depends': _pxi_dep['join']}, + 'pyxfile': '_libs/join'}, '_libs.lib': { 'pyxfile': '_libs/lib', 'include': common_include + ts_include, From 1c012eff91cbfe241e3d13d372cafa483ed7b8ed Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 17 Oct 2018 18:24:36 -0700 Subject: [PATCH 2/2] flake8 fixups --- pandas/_libs/join.pyx | 10 +++++----- pandas/core/reshape/merge.py | 28 +--------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index eeaad6c7318e2..c6afeda6a37dc 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -682,11 +682,11 @@ ctypedef fused by_t: def asof_join_backward_on_X_by_Y(ndarray[asof_t] left_values, - ndarray[asof_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, - bint allow_exact_matches=1, - tolerance=None): + ndarray[asof_t] right_values, + ndarray[by_t] left_by_values, + ndarray[by_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6ff94075a661f..5d4a0c718499a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -34,8 +34,7 @@ is_datetimelike, ensure_int64, ensure_float64, - ensure_object, - _get_dtype) + ensure_object) from pandas.core.dtypes.missing import na_value_for_dtype, isnull from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) @@ -1206,29 +1205,6 @@ def _asof_by_function(direction): 'object': ensure_object, } -_cython_types = { - 'uint8': 'uint8_t', - 'uint32': 'uint32_t', - 'uint16': 'uint16_t', - 'uint64': 'uint64_t', - 'int8': 'int8_t', - 'int32': 'int32_t', - 'int16': 'int16_t', - 'int64': 'int64_t', - 'float16': 'error', - 'float32': 'float', - 'float64': 'double', -} - - -def _get_cython_type(dtype): - """ Given a dtype, return a C name like 'int64_t' or 'double' """ - type_name = _get_dtype(dtype).name - ctype = _cython_types.get(type_name, 'object') - if ctype == 'error': - raise MergeError('unsupported type: {type}'.format(type=type_name)) - return ctype - def _get_cython_type_upcast(dtype): """ Upcast a dtype to 'int64_t', 'double', or 'object' """ @@ -1437,7 +1413,6 @@ def flip(xs): right_by_values = by_type_caster(right_by_values) # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) func = _asof_by_function(self.direction) return func(left_values, right_values, @@ -1447,7 +1422,6 @@ def flip(xs): tolerance) else: # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) func = _asof_function(self.direction) return func(left_values, right_values,