diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 415e7026e09c8..d2914dc8ac751 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -353,6 +353,523 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): return result +# ---------------------------------------------------------------------- + +ctypedef fused algos_t: + float64_t + float32_t + object + int32_t + int64_t + uint64_t + uint8_t + + +# TODO: unused; needed? +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices(ndarray[algos_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef: + Py_ssize_t i, length + dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t, ndim=1] indexer + algos_t cur, next + int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +pad_float64 = pad["float64_t"] +pad_float32 = pad["float32_t"] +pad_object = pad["object"] +pad_int64 = pad["int64_t"] +pad_int32 = pad["int32_t"] +pad_uint64 = pad["uint64_t"] +pad_bool = pad["uint8_t"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace(ndarray[algos_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef: + Py_ssize_t i, N + algos_t val + int lim, fill_count = 0 + + N = len(values) + + # GH#2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +pad_inplace_float64 = pad_inplace["float64_t"] +pad_inplace_float32 = pad_inplace["float32_t"] +pad_inplace_object = pad_inplace["object"] +pad_inplace_int64 = pad_inplace["int64_t"] +pad_inplace_int32 = pad_inplace["int32_t"] +pad_inplace_uint64 = pad_inplace["uint64_t"] +pad_inplace_bool = pad_inplace["uint8_t"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace(ndarray[algos_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef: + Py_ssize_t i, j, N, K + algos_t val + int lim, fill_count = 0 + + K, N = ( values).shape + + # GH#2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +pad_2d_inplace_float64 = pad_2d_inplace["float64_t"] +pad_2d_inplace_float32 = pad_2d_inplace["float32_t"] +pad_2d_inplace_object = pad_2d_inplace["object"] +pad_2d_inplace_int64 = pad_2d_inplace["int64_t"] +pad_2d_inplace_int32 = pad_2d_inplace["int32_t"] +pad_2d_inplace_uint64 = pad_2d_inplace["uint64_t"] +pad_2d_inplace_bool = pad_2d_inplace["uint8_t"] + + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t, ndim=1] indexer + algos_t cur, prev + int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +backfill_float64 = backfill["float64_t"] +backfill_float32 = backfill["float32_t"] +backfill_object = backfill["object"] +backfill_int64 = backfill["int64_t"] +backfill_int32 = backfill["int32_t"] +backfill_uint64 = backfill["uint64_t"] +backfill_bool = backfill["uint8_t"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace(ndarray[algos_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef: + Py_ssize_t i, N + algos_t val + int lim, fill_count = 0 + + N = len(values) + + # GH#2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +backfill_inplace_float64 = backfill_inplace["float64_t"] +backfill_inplace_float32 = backfill_inplace["float32_t"] +backfill_inplace_object = backfill_inplace["object"] +backfill_inplace_int64 = backfill_inplace["int64_t"] +backfill_inplace_int32 = backfill_inplace["int32_t"] +backfill_inplace_uint64 = backfill_inplace["uint64_t"] +backfill_inplace_bool = backfill_inplace["uint8_t"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace(ndarray[algos_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef: + Py_ssize_t i, j, N, K + algos_t val + int lim, fill_count = 0 + + K, N = ( values).shape + + # GH#2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +backfill_2d_inplace_float64 = backfill_2d_inplace["float64_t"] +backfill_2d_inplace_float32 = backfill_2d_inplace["float32_t"] +backfill_2d_inplace_object = backfill_2d_inplace["object"] +backfill_2d_inplace_int64 = backfill_2d_inplace["int64_t"] +backfill_2d_inplace_int32 = backfill_2d_inplace["int32_t"] +backfill_2d_inplace_uint64 = backfill_2d_inplace["uint64_t"] +backfill_2d_inplace_bool = backfill_2d_inplace["uint8_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap(ndarray[algos_t] index, object func): + cdef: + Py_ssize_t length = index.shape[0] + Py_ssize_t i = 0 + ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas._libs.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +arrmap_float64 = arrmap["float64_t"] +arrmap_float32 = arrmap["float32_t"] +arrmap_object = arrmap["object"] +arrmap_int64 = arrmap["int64_t"] +arrmap_int32 = arrmap["int32_t"] +arrmap_uint64 = arrmap["uint64_t"] +arrmap_bool = arrmap["uint8_t"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic(ndarray[algos_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec, is_unique + """ + cdef: + Py_ssize_t i, n + algos_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + bint is_unique = 1 + bint is_strict_monotonic = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False, True + else: + return True, True, True + elif n < 2: + return True, True, True + + if timelike and arr[0] == iNaT: + return False, False, True + + if algos_t is not object: + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + else: + # object-dtype, identical to above except we cannot use `with nogil` + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + + is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) + return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic + + +is_monotonic_float64 = is_monotonic["float64_t"] +is_monotonic_float32 = is_monotonic["float32_t"] +is_monotonic_object = is_monotonic["object"] +is_monotonic_int64 = is_monotonic["int64_t"] +is_monotonic_int32 = is_monotonic["int32_t"] +is_monotonic_uint64 = is_monotonic["uint64_t"] +is_monotonic_bool = is_monotonic["uint8_t"] + + # generated from template include "algos_common_helper.pxi" include "algos_rank_helper.pxi" diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index ed4c0e4c59609..40b1b1a282670 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -15,443 +15,6 @@ Template for each `dtype` helper function using 1-d template WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -#---------------------------------------------------------------------- -# 1-d template -#---------------------------------------------------------------------- - -{{py: - -# name, c_type, dtype, can_hold_na, nogil -dtypes = [('float64', 'float64_t', 'np.float64', True, True), - ('float32', 'float32_t', 'np.float32', True, True), - ('object', 'object', 'object', True, False), - ('int32', 'int32_t', 'np.int32', False, True), - ('int64', 'int64_t', 'np.int64', False, True), - ('uint64', 'uint64_t', 'np.uint64', False, True), - ('bool', 'uint8_t', 'np.bool', False, True)] - -def get_dispatch(dtypes): - - for name, c_type, dtype, can_hold_na, nogil in dtypes: - - nogil_str = 'with nogil:' if nogil else '' - tab = ' ' if nogil else '' - yield name, c_type, dtype, can_hold_na, nogil_str, tab -}} - -{{for name, c_type, dtype, can_hold_na, nogil_str, tab - in get_dispatch(dtypes)}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def map_indices_{{name}}(ndarray[{{c_type}}] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef: - Py_ssize_t i, length - dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, limit=None): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t, ndim=1] indexer - {{c_type}} cur, next - int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_{{name}}(ndarray[{{c_type}}] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef: - Py_ssize_t i, N - {{c_type}} val - int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef: - Py_ssize_t i, j, N, K - {{c_type}} val - int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, - limit=None): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t, ndim=1] indexer - {{c_type}} cur, prev - int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef: - Py_ssize_t i, N - {{c_type}} val - int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef: - Py_ssize_t i, j, N, K - {{c_type}} val - int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - {{c_type}} prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - {{nogil_str}} - {{tab}}prev = arr[0] - {{tab}}for i in range(1, n): - {{tab}} cur = arr[i] - {{tab}} if timelike and cur == iNaT: - {{tab}} is_monotonic_inc = 0 - {{tab}} is_monotonic_dec = 0 - {{tab}} break - {{tab}} if cur < prev: - {{tab}} is_monotonic_inc = 0 - {{tab}} elif cur > prev: - {{tab}} is_monotonic_dec = 0 - {{tab}} elif cur == prev: - {{tab}} is_unique = 0 - {{tab}} else: - {{tab}} # cur or prev is NaN - {{tab}} is_monotonic_inc = 0 - {{tab}} is_monotonic_dec = 0 - {{tab}} break - {{tab}} if not is_monotonic_inc and not is_monotonic_dec: - {{tab}} is_monotonic_inc = 0 - {{tab}} is_monotonic_dec = 0 - {{tab}} break - {{tab}} prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_{{name}}(ndarray[{{c_type}}] index, object func): - cdef: - Py_ssize_t length = index.shape[0] - Py_ssize_t i = 0 - ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas._libs.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -{{endfor}} - -#---------------------------------------------------------------------- -# put template -#---------------------------------------------------------------------- - {{py: # name, c_type, dest_type, dest_dtype