diff --git a/.gitignore b/.gitignore index 19f1cc804dca0..a77e780f3332d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ *.class *.dll *.exe +*.pxi *.o *.py[ocd] *.so diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi deleted file mode 100644 index 9dede87e0c15b..0000000000000 --- a/pandas/src/algos_common_helper.pxi +++ /dev/null @@ -1,2764 +0,0 @@ -""" -Template for each `dtype` helper function using 1-d template - -# 1-d template -- map_indices -- pad -- pad_1d -- pad_2d -- backfill -- backfill_1d -- backfill_2d -- is_monotonic -- arrmap - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# 1-d template -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float64(ndarray[float64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if 
mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break 
- if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float64(ndarray[float64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float32(ndarray[float32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - 
if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - float32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < 
prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float32(ndarray[float32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_object(ndarray[object] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - 
fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - object prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur 
== prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int32(ndarray[int32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - 
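# Illustrative sketch, not from the removed file: the pad_* templates above
# build a forward-fill indexer, i.e. for each element of a sorted `new` array
# they record the position of the last `old` value that is <= it (-1 where no
# old value precedes it), with `limit` capping how many gaps are filled past
# each match.  A minimal pure-Python/NumPy version of the unlimited,
# sorted-input case; the helper name `pad_indexer` is ours, not pandas'.
import numpy as np

def pad_indexer(old, new):
    # index of the last element of `old` that is <= each element of `new`;
    # -1 where the new value precedes everything in `old`
    return np.searchsorted(old, new, side='right') - 1

old = np.array([1.0, 3.0, 7.0])
new = np.array([0.5, 1.0, 2.0, 3.5, 8.0])
print(pad_indexer(old, new))  # [-1, 0, 0, 1, 2]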
-""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - int32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - 
is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int64(ndarray[int64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - 
-Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not 
is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_bool(ndarray[uint8_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 
0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - 
is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -#---------------------------------------------------------------------- -# put template -#---------------------------------------------------------------------- - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_float64_float64(ndarray[float64_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float32(ndarray[float32_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_float32_float32(ndarray[float32_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int8(ndarray[int8_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - 
Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int8_float32(ndarray[int8_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int16(ndarray[int16_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int16_float32(ndarray[int16_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int32(ndarray[int32_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def 
put2d_int32_float64(ndarray[int32_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - -#---------------------------------------------------------------------- -# ensure_dtype -#---------------------------------------------------------------------- - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num - -cpdef ensure_platform_int(object arr): - # GH3033, GH1392 - # platform int is the size of the int pointer, e.g. 
np.intp - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.intp) - else: - return np.array(arr, dtype=np.intp) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr, 'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) - -cpdef ensure_float64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT64: - return arr - else: - return arr.astype(np.float64) - else: - return np.array(arr, dtype=np.float64) - -cpdef ensure_float32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT32: - return arr - else: - return arr.astype(np.float32) - else: - return np.array(arr, dtype=np.float32) - -cpdef ensure_int8(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT8: - return arr - else: - return arr.astype(np.int8) - else: - return np.array(arr, dtype=np.int8) - -cpdef ensure_int16(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT16: - return arr - else: - return arr.astype(np.int16) - else: - return np.array(arr, dtype=np.int16) - -cpdef ensure_int32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT32: - return arr - else: - return arr.astype(np.int32) - else: - return np.array(arr, dtype=np.int32) - -cpdef ensure_int64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT64: - return arr - else: - return arr.astype(np.int64) - else: - return np.array(arr, dtype=np.int64) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi deleted file mode 100644 index 013a03f719bbd..0000000000000 --- a/pandas/src/algos_groupby_helper.pxi +++ /dev/null @@ -1,1375 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -#---------------------------------------------------------------------- -# group_add, group_prod, group_var, group_mean, group_ohlc -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - 
ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, ct, oldmean - ndarray[float64_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') 
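# Illustrative sketch, not from the removed file: the group_var_* templates
# above use a Welford-style online update per group; the running mean and a
# sum-of-squared-deviations accumulator are updated one observation at a time,
# and the accumulator is divided by (count - 1) at the end.  A pure-Python
# version under those assumptions; `grouped_var` is an illustrative name.
import numpy as np

def grouped_var(values, labels, ngroups):
    count = np.zeros(ngroups)
    mean = np.zeros(ngroups)
    m2 = np.zeros(ngroups)            # sum of squared deviations from the mean
    for val, lab in zip(values, labels):
        if lab < 0 or val != val:     # skip unlabeled rows and NaN values
            continue
        count[lab] += 1
        old = mean[lab]
        mean[lab] += (val - old) / count[lab]
        m2[lab] += (val - mean[lab]) * (val - old)
    out = np.full(ngroups, np.nan)    # groups with fewer than 2 values stay NaN
    ok = count > 1
    out[ok] = m2[ok] / (count[ok] - 1)
    return out

print(grouped_var(np.array([1.0, 2.0, 4.0, 8.0]),
                  np.array([0, 0, 1, 1]), 2))  # group 0 -> 0.5, group 1 -> 8.0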
- - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, ct, oldmean - ndarray[float32_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in 
range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - -#---------------------------------------------------------------------- -# group_nth, group_last -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on 
axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - 
out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - -#---------------------------------------------------------------------- -# group_min, group_max -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] 
= minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-_int64_max) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = 
maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(_int64_max) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -#---------------------------------------------------------------------- - - -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:, :] out, - float64_t[:, :] values, - int64_t[:] labels, - float64_t[:, :] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:, :] out, - numeric[:, :] values, - int64_t[:] labels, - numeric[:, :] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] += val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:, :] label_indexer - - N, = ( labels).shape - - if periods < 0: - 
periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - - # Skip null keys - if lab == -1: - out[ii] = -1 - continue - - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi deleted file mode 100644 index d8fb05804d4e5..0000000000000 --- a/pandas/src/algos_take_helper.pxi +++ /dev/null @@ -1,4949 +0,0 @@ -""" -Template for each `dtype` helper function for take - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# take_1d, take_2d -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_bool_memview(uint8_t[:] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
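The comment closing the block above states the dispatch rule used by every take_* helper in this file: the typed-memoryview fast path is taken only for writeable input buffers, and read-only buffers fall back to the plain Cython ndarray path because of the typed-memoryview limitation the comment notes. A minimal pure-Python sketch of that dispatch, with illustrative names only (not the pandas API):

import numpy as np

def take_1d(values, indexer, out, fill_value=np.nan):
    # Dispatch sketch: fast-path stand-in for writeable buffers, ndarray-style
    # fallback for read-only ones; both share the same fill semantics.
    if values.flags.writeable:
        _take_1d_memview(values, indexer, out, fill_value)
    else:
        _take_1d_ndarray(values, indexer, out, fill_value)

def _take_1d_ndarray(values, indexer, out, fill_value):
    # -1 in the indexer marks a missing location and receives fill_value.
    for i, idx in enumerate(indexer):
        out[i] = fill_value if idx == -1 else values[idx]

_take_1d_memview = _take_1d_ndarray  # same semantics; only the buffer typing differs in Cython

A read-only source (for example one created with values.setflags(write=False)) exercises the fallback branch, which is exactly the situation the comments above describe.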
- - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[uint8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - uint8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_object_memview(uint8_t[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
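All of the take_1d / take_2d_axis0 / take_2d_axis1 / take_2d_multi variants above implement the same gather-with-fill semantics for each (source dtype, destination dtype) pair: positions where the indexer is -1 receive fill_value, everything else is copied from values. A rough NumPy equivalent (illustrative only, not how the generated Cython is written):

import numpy as np

def take_1d_with_fill(values, indexer, fill_value=np.nan, dtype=np.float64):
    # out[i] = values[indexer[i]], except indexer[i] == -1 -> fill_value
    out = np.empty(len(indexer), dtype=dtype)
    missing = indexer == -1
    out[~missing] = values[indexer[~missing]]
    out[missing] = fill_value
    return out

# take_1d_with_fill(np.array([10, 20, 30], dtype=np.int8),
#                   np.array([2, -1, 0]))  ->  array([30., nan, 10.])

The cross-dtype variants in the file (take_1d_int8_float64 and friends) serve the same purpose as the sketch's float64 default: an integer source has to be gathered into a destination dtype that can actually hold the NaN fill.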
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = True if values[idx, idx1[j]] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int8_memview(int8_t[:] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. 
Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int32_memview(int8_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int64_memview(int8_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_float64_memview(int8_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int16_memview(int16_t[:] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
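The compile-time IF blocks in the take_2d_axis0_* helpers above enable a GH3130 fast path only for same-dtype copies: when source and destination rows are contiguous with a matching item size and enough data is being moved, each selected row is copied with one memmove instead of an inner j-loop. A hedged NumPy-level sketch of the same idea (illustrative names; the threshold mirrors the generated code):

import numpy as np

def take_2d_axis0(values, indexer, out, fill_value=np.nan):
    itemsize = values.itemsize
    contiguous = values.strides[1] == out.strides[1] == itemsize
    block_copy = contiguous and itemsize * len(indexer) >= 256  # GH3130 threshold
    for i, idx in enumerate(indexer):
        if idx == -1:
            out[i, :] = fill_value
        elif block_copy:
            out[i, :] = values[idx, :]          # whole-row copy (memmove in the Cython version)
        else:
            for j in range(values.shape[1]):
                out[i, j] = values[idx, j]      # element-by-element fallback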
- - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int16_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int16_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int32_memview(int16_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int64_memview(int16_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
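The values.flags.writeable test that each def wrapper performs, as in the fallback comment just above, exists because a Cython typed memoryview cannot be created over a read-only buffer (construction typically fails with a ValueError about the buffer being read-only), so those inputs are routed to the ndarray-typed copy of the same loop. A short illustration of the kind of array that takes the fallback path (illustrative input only, not pandas API):

    import numpy as np

    values = np.arange(5, dtype=np.int16)
    values.setflags(write=False)    # e.g. a memory-mapped or shared buffer

    # The generated wrappers branch on exactly this flag: True selects the
    # memoryview kernel, False the slower ndarray kernel with the same body.
    print(values.flags.writeable)   # False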
- - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_float64_memview(int16_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int32_memview(int32_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
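In the axis-0 kernels whose IF block is compiled in (the same-dtype specialisations such as take_2d_axis0_int32_int32 above), each selected row is copied with a single memmove once both buffers are contiguous along axis 1 and the total work clears a 256-byte threshold (GH3130); otherwise the element-wise inner loop runs. The guard can be paraphrased in Python as follows (a sketch of the condition only, with made-up names):

    import numpy as np

    def row_memmove_ok(values, out, n_taken):
        # Same three checks as the fast path above: both arrays store a row
        # contiguously (stride along axis 1 equals the item size) and the
        # number of rows taken times the item size reaches 256 bytes.
        itemsize = values.dtype.itemsize
        return (values.strides[1] == out.strides[1]
                and values.strides[1] == itemsize
                and itemsize * n_taken >= 256)

    values = np.zeros((1000, 8), dtype=np.int32)
    out = np.empty((500, 8), dtype=np.int32)
    indexer = np.arange(500, dtype=np.int64)
    print(row_memmove_ok(values, out, len(indexer)))   # True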
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int64_memview(int32_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_float64_memview(int32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_int64_memview(int64_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
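The take_2d_multi_* kernels (take_2d_multi_int32_float64 above, for instance) combine a row indexer and a column indexer: a -1 row index blanks the whole output row, a -1 column index blanks that position in every kept row, and every other (row, col) pair is gathered as values[row, col]. Roughly, in NumPy terms (a sketch under those assumptions, not the generated code):

    import numpy as np

    def take_2d_multi_sketch(values, row_idx, col_idx, fill_value=np.nan):
        out_dtype = np.result_type(values.dtype, type(fill_value))
        out = np.full((len(row_idx), len(col_idx)), fill_value, dtype=out_dtype)
        row_ok = row_idx != -1
        col_ok = col_idx != -1
        # Gather only the positions where both indexers are valid; everything
        # else keeps fill_value, matching the nested loops in the kernel.
        out[np.ix_(row_ok, col_ok)] = values[np.ix_(row_idx[row_ok],
                                                    col_idx[col_ok])]
        return out

    values = np.arange(12, dtype=np.int64).reshape(3, 4)
    take_2d_multi_sketch(values,
                         np.array([2, -1], dtype=np.int64),
                         np.array([0, -1, 3], dtype=np.int64))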
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_float64_memview(int64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float32_memview(float32_t[:] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float64_memview(float32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. 
Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float64_float64_memview(float64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_object_object_memview(object[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_object_object(ndarray[object, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
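Every def wrapper in this file dispatches on values.flags.writeable: writeable buffers go through the typed-memoryview fast path, while readonly buffers fall back to the plain ndarray version because Cython's typed memoryviews could not accept them. A quick illustration of the flag the dispatch keys on (plain NumPy, nothing pandas-specific):

import numpy as np

values = np.arange(4, dtype=np.float64)
values.setflags(write=False)       # e.g. a buffer backed by a readonly source

# The generated wrappers branch on exactly this flag before picking a code path.
print(values.flags.writeable)      # False -> the ndarray fallback would be used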
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_object_object(ndarray[object, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] diff --git a/pandas/src/hashtable_class_helper.pxi b/pandas/src/hashtable_class_helper.pxi deleted file mode 100644 index da0c76aeca86f..0000000000000 --- a/pandas/src/hashtable_class_helper.pxi +++ /dev/null @@ -1,860 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# VectorData -#---------------------------------------------------------------------- - - -ctypedef struct Float64VectorData: - float64_t *data - size_t n, m - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void append_data_float64(Float64VectorData *data, - float64_t x) nogil: - - data.data[data.n] = x - data.n += 1 - - -ctypedef struct Int64VectorData: - int64_t *data - size_t n, m - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void append_data_int64(Int64VectorData *data, - int64_t x) nogil: - - data.data[data.n] = x - data.n += 1 - -ctypedef fused vector_data: - Int64VectorData - Float64VectorData - -cdef bint needs_resize(vector_data *data) nogil: - return data.n == data.m - -#---------------------------------------------------------------------- -# Vector -#---------------------------------------------------------------------- - -cdef class Float64Vector: - - cdef: - Float64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(Float64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.float64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, float64_t x): - - if needs_resize(self.data): - self.resize() - - append_data_float64(self.data, x) - -cdef class Int64Vector: - - cdef: - Int64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(Int64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.int64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def 
to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, int64_t x): - - if needs_resize(self.data): - self.resize() - - append_data_int64(self.data, x) - - -cdef class ObjectVector: - - cdef: - PyObject **data - size_t n, m - ndarray ao - - def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data - - def __len__(self): - return self.n - - cdef inline append(self, object o): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data - - Py_INCREF(o) - self.data[self.n] = o - self.n += 1 - - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - - -#---------------------------------------------------------------------- -# HashTable -#---------------------------------------------------------------------- - - -cdef class HashTable: - pass - -cdef class Float64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_float64() - if size_hint is not None: - kh_resize_float64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_float64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_float64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, float64_t val): - cdef khiter_t k - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, float64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, float64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_float64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_float64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, float64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_float64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[float64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, float64_t values): - uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, float64_t[:] values, Float64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - float64_t val - khiter_t k - 
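Float64Vector, Int64Vector and ObjectVector above are append-only buffers backed by a NumPy array: needs_resize fires when n reaches the capacity m, the backing array is grown geometrically, and to_array() trims it to the used length. A plain-Python sketch of that amortized-growth idea (GrowableVectorSketch and INIT_CAP are illustrative stand-ins, not the Cython classes):

import numpy as np

class GrowableVectorSketch:
    INIT_CAP = 32                                  # stand-in for _INIT_VEC_CAP

    def __init__(self, dtype=np.int64):
        self._n = 0
        self._buf = np.empty(self.INIT_CAP, dtype=dtype)

    def append(self, x):
        if self._n == len(self._buf):              # the needs_resize condition
            self._buf = np.resize(self._buf, len(self._buf) * 4)
        self._buf[self._n] = x
        self._n += 1

    def to_array(self):
        return self._buf[:self._n].copy()          # trim to the used length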
Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - float64_t val - khiter_t k - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - bint seen_na = 0 - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - elif not seen_na: - seen_na = 1 - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, NAN) - - return uniques.to_array() - -cdef class Int64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_int64() - if size_hint is not None: - kh_resize_int64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_int64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_int64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, int64_t val): - cdef khiter_t k - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_int64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, int64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[int64_t, ndim=1] values): 
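get_labels above is the core of factorize(): each value is looked up in the hash table, unseen values are appended to uniques and assigned the next dense integer label, and nulls (NaN here, iNaT in the int64 variant) get na_sentinel. The same bookkeeping with an ordinary dict, as a conceptual sketch (factorize_sketch is a hypothetical name):

def factorize_sketch(values, na_sentinel=-1):
    # labels[i] is the position of values[i] in `uniques`; nulls map to na_sentinel
    table, uniques, labels = {}, [], []
    for val in values:
        if val != val:                 # NaN check, mirroring check_null
            labels.append(na_sentinel)
            continue
        if val not in table:
            table[val] = len(uniques)
            uniques.append(val)
        labels.append(table[val])
    return uniques, labels

# factorize_sketch(['a', 'b', 'a']) -> (['a', 'b'], [0, 1, 0])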
- cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, int64_t values): - uniques = Int64Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - int64_t val - khiter_t k - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val == iNaT: - labels[i] = na_sentinel - continue - - k = kh_get_int64(self.table, val) - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - bint seen_na = 0 - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - - return uniques.to_array() - - -cdef class StringHashTable(HashTable): - cdef kh_str_t *table - - def __cinit__(self, int size_hint=1): - self.table = kh_init_str() - if size_hint is not None: - kh_resize_str(self.table, size_hint) - - def __dealloc__(self): - kh_destroy_str(self.table) - - cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - for i 
in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - buf = util.get_c_string(key) - - k = kh_put_str(self.table, buf, &ret) - self.table.keys[k] = key - if kh_exist_str(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def get_indexer(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf - int64_t *resbuf = labels.data - khiter_t k - kh_str_t *table = self.table - - for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = table.vals[k] - else: - resbuf[i] = -1 - return labels - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - char *buf - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) - - return uniques.to_array() - - def factorize(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - object val - char *buf - khiter_t k - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - - -na_sentinel = object - -cdef class PyObjectHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - self.destroy() - - def __len__(self): - return self.table.size - - def __contains__(self, object key): - cdef khiter_t k - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_get_pymap(self.table, key) - return k != self.table.n_buckets - - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - - cpdef get_item(self, object val): - cdef khiter_t k - if val != val or val is None: - val = na_sentinel - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key - if kh_exist_pymap(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_put_pymap(self.table, 
val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - bint seen_na = 0 - - for i in range(n): - val = values[i] - hash(val) - if not _checknan(val): - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - - labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - - if check_null and val != val or val is None: - labels[i] = na_sentinel - continue - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_func_helper.pxi b/pandas/src/hashtable_func_helper.pxi deleted file mode 100644 index d05b81acc5dd5..0000000000000 --- a/pandas/src/hashtable_func_helper.pxi +++ /dev/null @@ -1,197 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# VectorData -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef build_count_table_float64(float64_t[:] values, - kh_float64_t *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - float64_t val - int ret = 0 - - with nogil: - kh_resize_float64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_float64(table, val, &ret) - table.vals[k] = 1 - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef value_count_float64(float64_t[:] values, bint dropna): - cdef: - Py_ssize_t i=0 - kh_float64_t *table - float64_t[:] result_keys - int64_t[:] result_counts - int k - - table = kh_init_float64() - build_count_table_float64(values, table, dropna) - - result_keys = np.empty(table.n_occupied, dtype=np.float64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_float64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_float64(float64_t[:] values, - object keep='first'): - 
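build_count_table_* and value_count_* above implement value_counts() at the khash level: one pass counts occurrences per distinct value (skipping NaN when dropna is set), and a second pass copies keys and counts out of the table. The same semantics with a dict, as a rough sketch (value_count_sketch is a hypothetical name):

def value_count_sketch(values, dropna=True):
    # Count occurrences per distinct value; NaN is skipped when dropna is True.
    counts = {}
    for val in values:
        if dropna and val != val:      # NaN check
            continue
        counts[val] = counts.get(val, 0) + 1
    return list(counts.keys()), list(counts.values())

# value_count_sketch([1, 1, 2, float('nan')]) -> ([1, 2], [2, 1])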
cdef: - int ret = 0, k - float64_t value - Py_ssize_t i, n = len(values) - kh_float64_t * table = kh_init_float64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_float64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_float64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_float64(table) - return out - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef build_count_table_int64(int64_t[:] values, - kh_int64_t *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - int64_t val - int ret = 0 - - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef value_count_int64(int64_t[:] values, bint dropna): - cdef: - Py_ssize_t i=0 - kh_int64_t *table - int64_t[:] result_keys - int64_t[:] result_counts - int k - - table = kh_init_int64() - build_count_table_int64(values, table, dropna) - - result_keys = np.empty(table.n_occupied, dtype=np.int64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_int64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_int64(int64_t[:] values, - object keep='first'): - cdef: - int ret = 0, k - int64_t value - Py_ssize_t i, n = len(values) - kh_int64_t * table = kh_init_int64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_int64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_int64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_int64(table) - return out diff --git a/pandas/src/join_helper.pxi b/pandas/src/join_helper.pxi deleted file mode 100644 index 44b8159351492..0000000000000 --- a/pandas/src/join_helper.pxi +++ /dev/null @@ -1,1899 +0,0 @@ -""" -Template for each `dtype` helper function for join - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# left_join_indexer, inner_join_indexer, outer_join_indexer 
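The duplicated_* helpers just above mark repeated values: keep='first' scans forward and flags later repeats, keep='last' scans backward, and keep=False flags every member of a duplicated group (hence the branch that stores each value's position and back-fills the earlier hit). A plain-Python sketch of the three modes, before the per-dtype join templates that follow (duplicated_sketch is a hypothetical name):

def duplicated_sketch(values, keep='first'):
    if keep not in ('last', 'first', False):
        raise ValueError('keep must be either "first", "last" or False')
    n = len(values)
    out = [False] * n
    if keep == 'last':
        seen = set()
        for i in range(n - 1, -1, -1):     # backward scan
            out[i] = values[i] in seen
            seen.add(values[i])
    elif keep == 'first':
        seen = set()
        for i in range(n):                 # forward scan
            out[i] = values[i] in seen
            seen.add(values[i])
    else:                                  # keep=False: flag the whole group
        first_pos = {}
        for i, val in enumerate(values):
            if val in first_pos:
                out[first_pos[val]] = True
                out[i] = True
            else:
                first_pos[val] = i
    return out

# duplicated_sketch([1, 2, 1, 3], keep=False) -> [True, False, True, False]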
-#---------------------------------------------------------------------- - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - 
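Each of these join indexers runs the same merge twice: the first pass only counts matching positions so lindexer, rindexer and result can be allocated exactly once, and the second pass fills them in. A simplified single-pass, two-pointer inner join over sorted inputs, ignoring the many-to-one handling the generated code adds for duplicated right-hand values (inner_join_indexer_sketch is a hypothetical name):

def inner_join_indexer_sketch(left, right):
    # Two-pointer merge over sorted, duplicate-free sequences.
    i = j = 0
    lidx, ridx, result = [], [], []
    while i < len(left) and j < len(right):
        if left[i] == right[j]:
            lidx.append(i)
            ridx.append(j)
            result.append(left[i])
            i += 1
            j += 1
        elif left[i] < right[j]:
            i += 1
        else:
            j += 1
    return result, lidx, ridx

# inner_join_indexer_sketch([1, 3, 5], [3, 4, 5]) -> ([3, 5], [1, 2], [0, 2])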
lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - 
lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer diff --git a/pandas/src/joins_func_helper.pxi b/pandas/src/joins_func_helper.pxi deleted file mode 100644 index 7a59da37c5ced..0000000000000 --- a/pandas/src/joins_func_helper.pxi +++ /dev/null @@ -1,373 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# asof_join_by -#---------------------------------------------------------------------- - - -from hashtable cimport * - - -def asof_join_int64_t_by_object(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - ndarray[object] left_by_values, - ndarray[object] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - PyObjectHashTable hash_table - object by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = PyObjectHashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double_by_object(ndarray[double] left_values, - ndarray[double] right_values, - ndarray[object] left_by_values, - ndarray[object] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - PyObjectHashTable hash_table - object by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = PyObjectHashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value 
- if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_int64_t_by_int64_t(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - ndarray[int64_t] left_by_values, - ndarray[int64_t] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - Int64HashTable hash_table - int64_t by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = Int64HashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double_by_int64_t(ndarray[double] left_values, - ndarray[double] right_values, - ndarray[int64_t] left_by_values, - ndarray[int64_t] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - Int64HashTable hash_table - int64_t by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = Int64HashTable(right_size) - - 
right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -#---------------------------------------------------------------------- -# asof_join -#---------------------------------------------------------------------- - - -def asof_join_int64_t(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double(ndarray[double] left_values, - ndarray[double] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while 
right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi deleted file mode 100644 index 8462c31c84679..0000000000000 --- a/pandas/src/sparse_op_helper.pxi +++ /dev/null @@ -1,5864 +0,0 @@ -""" -Template for each `dtype` helper function for sparse ops - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# Sparse op -#---------------------------------------------------------------------- - -cdef inline float64_t __div_float64(float64_t a, float64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return float(a) / b - -cdef inline float64_t __truediv_float64(float64_t a, float64_t b): - return __div_float64(a, b) - -cdef inline float64_t __floordiv_float64(float64_t a, float64_t b): - if b == 0: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF - return NaN - else: - return a // b - -cdef inline float64_t __mod_float64(float64_t a, float64_t b): - if b == 0: - return NaN - else: - return a % b - -cdef inline float64_t __div_int64(int64_t a, int64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return float(a) / b - -cdef inline float64_t __truediv_int64(int64_t a, int64_t b): - return __div_int64(a, b) - -cdef inline int64_t __floordiv_int64(int64_t a, int64_t b): - if b == 0: - return 0 - else: - return a // b - -cdef inline int64_t __mod_int64(int64_t a, int64_t b): - if b == 0: - return 0 - else: - return a % b - -#---------------------------------------------------------------------- -# sparse array op -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_add_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_add_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - return out, out_index, xfill + yfill - - -cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_add_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_add_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_add_float64(float64_t xfill, - float64_t yfill): - return xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_add_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_add_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - return out, out_index, xfill + yfill - - -cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_add_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_add_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_add_int64(int64_t xfill, - int64_t yfill): - return xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_sub_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_sub_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - return out, out_index, xfill - yfill - - -cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_sub_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_sub_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_sub_float64(float64_t xfill, - float64_t yfill): - return xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_sub_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_sub_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - return out, out_index, xfill - yfill - - -cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_sub_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_sub_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_sub_int64(int64_t xfill, - int64_t yfill): - return xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mul_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mul_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - return out, out_index, xfill * yfill - - -cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mul_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mul_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mul_float64(float64_t xfill, - float64_t yfill): - return xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mul_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mul_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - return out, out_index, xfill * yfill - - -cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mul_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mul_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mul_int64(int64_t xfill, - int64_t yfill): - return xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_div_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __div_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_div_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __div_float64(xfill, yfill) - - -cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_div_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_div_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_div_float64(float64_t xfill, - float64_t yfill): - return __div_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_div_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __div_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_div_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __div_int64(xfill, yfill) - - -cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_div_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_div_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_div_int64(int64_t xfill, - int64_t yfill): - return __div_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mod_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __mod_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mod_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __mod_float64(xfill, yfill) - - -cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mod_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mod_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mod_float64(float64_t xfill, - float64_t yfill): - return __mod_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mod_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __mod_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mod_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __mod_int64(xfill, yfill) - - -cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mod_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mod_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mod_int64(int64_t xfill, - int64_t yfill): - return __mod_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_truediv_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __truediv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_truediv_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __truediv_float64(xfill, yfill) - - -cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_truediv_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_truediv_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_truediv_float64(float64_t xfill, - float64_t yfill): - return __truediv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_truediv_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __truediv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_truediv_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __truediv_int64(xfill, yfill) - - -cpdef sparse_truediv_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_truediv_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_truediv_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_truediv_int64(int64_t xfill, - int64_t yfill): - return __truediv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_floordiv_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __floordiv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_floordiv_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __floordiv_float64(xfill, yfill) - - -cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_floordiv_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_floordiv_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_floordiv_float64(float64_t xfill, - float64_t yfill): - return __floordiv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_floordiv_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __floordiv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_floordiv_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __floordiv_int64(xfill, yfill) - - -cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_floordiv_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_floordiv_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_floordiv_int64(int64_t xfill, - int64_t yfill): - return __floordiv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_pow_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_pow_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - return out, out_index, xfill ** yfill - - -cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_pow_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_pow_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_pow_float64(float64_t xfill, - float64_t yfill): - return xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_pow_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_pow_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - return out, out_index, xfill ** yfill - - -cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_pow_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_pow_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_pow_int64(int64_t xfill, - int64_t yfill): - return xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_eq_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_eq_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - return out, out_index, xfill == yfill - - -cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_eq_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_eq_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_eq_float64(float64_t xfill, - float64_t yfill): - return xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_eq_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_eq_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - return out, out_index, xfill == yfill - - -cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_eq_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_eq_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_eq_int64(int64_t xfill, - int64_t yfill): - return xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ne_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ne_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - return out, out_index, xfill != yfill - - -cpdef sparse_ne_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ne_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ne_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ne_float64(float64_t xfill, - float64_t yfill): - return xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ne_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ne_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - return out, out_index, xfill != yfill - - -cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ne_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ne_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ne_int64(int64_t xfill, - int64_t yfill): - return xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_lt_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_lt_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - return out, out_index, xfill < yfill - - -cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_lt_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_lt_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_lt_float64(float64_t xfill, - float64_t yfill): - return xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_lt_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_lt_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - return out, out_index, xfill < yfill - - -cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_lt_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_lt_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_lt_int64(int64_t xfill, - int64_t yfill): - return xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_gt_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_gt_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - return out, out_index, xfill > yfill - - -cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_gt_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_gt_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_gt_float64(float64_t xfill, - float64_t yfill): - return xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_gt_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_gt_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - return out, out_index, xfill > yfill - - -cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_gt_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_gt_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_gt_int64(int64_t xfill, - int64_t yfill): - return xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_le_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_le_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - return out, out_index, xfill <= yfill - - -cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_le_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_le_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_le_float64(float64_t xfill, - float64_t yfill): - return xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_le_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_le_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - return out, out_index, xfill <= yfill - - -cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_le_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_le_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_le_int64(int64_t xfill, - int64_t yfill): - return xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ge_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ge_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - return out, out_index, xfill >= yfill - - -cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ge_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ge_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ge_float64(float64_t xfill, - float64_t yfill): - return xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ge_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ge_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - return out, out_index, xfill >= yfill - - -cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ge_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ge_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ge_int64(int64_t xfill, - int64_t yfill): - return xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_and_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_and_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - return out, out_index, xfill & yfill - - -cpdef sparse_and_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_and_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_and_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_and_int64(int64_t xfill, - int64_t yfill): - return xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_and_uint8(ndarray x_, - BlockIndex xindex, - uint8_t xfill, - ndarray y_, - BlockIndex yindex, - uint8_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_and_uint8(ndarray x_, IntIndex xindex, - uint8_t xfill, - ndarray y_, IntIndex yindex, - uint8_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - return out, out_index, xfill & yfill - - -cpdef sparse_and_uint8(ndarray[uint8_t, ndim=1] x, - SparseIndex xindex, uint8_t xfill, - ndarray[uint8_t, ndim=1] y, - SparseIndex yindex, uint8_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_and_uint8(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_and_uint8(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_and_uint8(uint8_t xfill, - uint8_t yfill): - return xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_or_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_or_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - return out, out_index, xfill | yfill - - -cpdef sparse_or_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_or_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_or_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_or_int64(int64_t xfill, - int64_t yfill): - return xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_or_uint8(ndarray x_, - BlockIndex xindex, - uint8_t xfill, - ndarray y_, - BlockIndex yindex, - uint8_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_or_uint8(ndarray x_, IntIndex xindex, - uint8_t xfill, - ndarray y_, IntIndex yindex, - uint8_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... - for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - return out, out_index, xfill | yfill - - -cpdef sparse_or_uint8(ndarray[uint8_t, ndim=1] x, - SparseIndex xindex, uint8_t xfill, - ndarray[uint8_t, ndim=1] y, - SparseIndex yindex, uint8_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_or_uint8(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_or_uint8(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_or_uint8(uint8_t xfill, - uint8_t yfill): - return xfill | yfill
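
All of the deleted comparison and logical kernels above share one merge-walk pattern: advance through the two sparse indices in tandem, apply the scalar operator where the locations coincide, and substitute the other operand's fill value where they do not; the fill value of the result is the operator applied to the two fills. Below is a minimal pure-Python sketch of the IntIndex variant, for illustration only; the name ref_int_op and the plain-list inputs are hypothetical stand-ins (the generated int_op_<op>_<dtype> kernels operate on ndarrays and IntIndex objects and return a uint8 array), but the loop structure mirrors the code being removed here.

import operator

def ref_int_op(op, x, xindices, xfill, y, yindices, yfill):
    # x/y hold only the stored (non-fill) values; xindices/yindices give
    # their positions in ascending order, like IntIndex.indices.
    out_indices = sorted(set(xindices) | set(yindices))   # stand-in for make_union
    out = []
    xi = yi = 0
    for loc in out_indices:
        if xi < len(xindices) and xindices[xi] == loc:
            xval, xi = x[xi], xi + 1
        else:
            xval = xfill                                   # x is implicit fill here
        if yi < len(yindices) and yindices[yi] == loc:
            yval, yi = y[yi], yi + 1
        else:
            yval = yfill                                   # y is implicit fill here
        out.append(op(xval, yval))
    # the kernels also return op(xfill, yfill), mirroring sparse_fill_<op>_<dtype>
    return out, out_indices, op(xfill, yfill)

# e.g. the analogue of sparse_ge_int64 on two tiny sparse vectors:
# ref_int_op(operator.ge, [3, 7], [1, 4], 0, [5], [4], 2)
# -> ([True, True], [1, 4], False)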