diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 42dda15ea2cbb..1fcecba0821c8 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -19,33 +19,44 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # 1-d template #---------------------------------------------------------------------- -{{py: +ctypedef fused algos_t: + float64_t + float32_t + object + int32_t + int64_t + uint64_t + uint8_t -# name, c_type, dtype, can_hold_na, nogil -dtypes = [('float64', 'float64_t', 'np.float64', True, True), - ('float32', 'float32_t', 'np.float32', True, True), - ('object', 'object', 'object', True, False), - ('int32', 'int32_t', 'np.int32', False, True), - ('int64', 'int64_t', 'np.int64', False, True), - ('uint64', 'uint64_t', 'np.uint64', False, True), - ('bool', 'uint8_t', 'np.bool', False, True)] -def get_dispatch(dtypes): +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap(ndarray[algos_t] index, object func): + cdef: + Py_ssize_t length = index.shape[0] + Py_ssize_t i = 0 + ndarray[object] result = np.empty(length, dtype=np.object_) - for name, c_type, dtype, can_hold_na, nogil in dtypes: + from pandas._libs.lib import maybe_convert_objects - nogil_str = 'with nogil:' if nogil else '' - tab = ' ' if nogil else '' - yield name, c_type, dtype, can_hold_na, nogil_str, tab -}} + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) -{{for name, c_type, dtype, can_hold_na, nogil_str, tab - in get_dispatch(dtypes)}} + +arrmap_float64 = arrmap["float64_t"] +arrmap_float32 = arrmap["float32_t"] +arrmap_object = arrmap["object"] +arrmap_int32 = arrmap["int32_t"] +arrmap_int64 = arrmap["int64_t"] +arrmap_uint64 = arrmap["uint64_t"] +arrmap_bool = arrmap["uint8_t"] @cython.wraparound(False) @cython.boundscheck(False) -cpdef map_indices_{{name}}(ndarray[{{c_type}}] index): +cpdef map_indices(ndarray[algos_t] 
index): """ Produce a dict mapping the values of the input array to their respective locations. @@ -55,8 +66,9 @@ cpdef map_indices_{{name}}(ndarray[{{c_type}}] index): """ Better to do this with Cython because of the enormous speed boost. """ - cdef Py_ssize_t i, length - cdef dict result = {} + cdef: + Py_ssize_t i, length + dict result = {} length = len(index) @@ -66,13 +78,22 @@ cpdef map_indices_{{name}}(ndarray[{{c_type}}] index): return result +map_indices_float64 = map_indices["float64_t"] +map_indices_float32 = map_indices["float32_t"] +map_indices_object = map_indices["object"] +map_indices_int32 = map_indices["int32_t"] +map_indices_int64 = map_indices["int64_t"] +map_indices_uint64 = map_indices["uint64_t"] +map_indices_bool = map_indices["uint8_t"] + + @cython.boundscheck(False) @cython.wraparound(False) -def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, limit=None): +def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer - {{c_type}} cur, next + algos_t cur, next int lim, fill_count = 0 nleft = len(old) @@ -129,20 +150,28 @@ def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, limit=None): return indexer +pad_float64 = pad["float64_t"] +pad_float32 = pad["float32_t"] +pad_object = pad["object"] +pad_int32 = pad["int32_t"] +pad_int64 = pad["int64_t"] +pad_uint64 = pad["uint64_t"] +pad_bool = pad["uint8_t"] + @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_{{name}}(ndarray[{{c_type}}] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def pad_inplace(ndarray[algos_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef: - Py_ssize_t i, N - {{c_type}} val - int lim, fill_count = 0 + Py_ssize_t i, N + algos_t val + int lim, fill_count = 0 N = len(values) - # GH 2778 + # GH#2778 if N == 0: return @@ -167,19 +196,28 @@ def pad_inplace_{{name}}(ndarray[{{c_type}}] values, val = values[i]
+pad_inplace_float64 = pad_inplace["float64_t"] +pad_inplace_float32 = pad_inplace["float32_t"] +pad_inplace_object = pad_inplace["object"] +pad_inplace_int32 = pad_inplace["int32_t"] +pad_inplace_int64 = pad_inplace["int64_t"] +pad_inplace_uint64 = pad_inplace["uint64_t"] +pad_inplace_bool = pad_inplace["uint8_t"] + + @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): +def pad_2d_inplace(ndarray[algos_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): cdef: Py_ssize_t i, j, N, K - {{c_type}} val + algos_t val int lim, fill_count = 0 K, N = ( values).shape - # GH 2778 + # GH#2778 if N == 0: return @@ -205,6 +243,16 @@ def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, fill_count = 0 val = values[j, i] + +pad_2d_inplace_float64 = pad_2d_inplace["float64_t"] +pad_2d_inplace_float32 = pad_2d_inplace["float32_t"] +pad_2d_inplace_object = pad_2d_inplace["object"] +pad_2d_inplace_int32 = pad_2d_inplace["int32_t"] +pad_2d_inplace_int64 = pad_2d_inplace["int64_t"] +pad_2d_inplace_uint64 = pad_2d_inplace["uint64_t"] +pad_2d_inplace_bool = pad_2d_inplace["uint8_t"] + + """ Backfilling logic for generating fill vector @@ -233,13 +281,12 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, - limit=None): +def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t, ndim=1] indexer - {{c_type}} cur, prev - int lim, fill_count = 0 + Py_ssize_t i, j, nleft, nright + ndarray[int64_t, ndim=1] indexer + algos_t cur, prev + int lim, fill_count = 0 nleft = len(old) nright = len(new) @@ -297,19 +344,28 @@ def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, return indexer +backfill_float64 = backfill["float64_t"] +backfill_float32 = backfill["float32_t"]
+backfill_object = backfill["object"] +backfill_int32 = backfill["int32_t"] +backfill_int64 = backfill["int64_t"] +backfill_uint64 = backfill["uint64_t"] +backfill_bool = backfill["uint8_t"] + + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def backfill_inplace(ndarray[algos_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef: - Py_ssize_t i, N - {{c_type}} val - int lim, fill_count = 0 + Py_ssize_t i, N + algos_t val + int lim, fill_count = 0 N = len(values) - # GH 2778 + # GH#2778 if N == 0: return @@ -334,19 +390,28 @@ def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, val = values[i] +backfill_inplace_float64 = backfill_inplace["float64_t"] +backfill_inplace_float32 = backfill_inplace["float32_t"] +backfill_inplace_object = backfill_inplace["object"] +backfill_inplace_int32 = backfill_inplace["int32_t"] +backfill_inplace_int64 = backfill_inplace["int64_t"] +backfill_inplace_uint64 = backfill_inplace["uint64_t"] +backfill_inplace_bool = backfill_inplace["uint8_t"] + + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): +def backfill_2d_inplace(ndarray[algos_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): cdef: Py_ssize_t i, j, N, K - {{c_type}} val + algos_t val int lim, fill_count = 0 K, N = ( values).shape - # GH 2778 + # GH#2778 if N == 0: return @@ -373,6 +438,39 @@ def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, val = values[j, i] +backfill_2d_inplace_float64 = backfill_2d_inplace["float64_t"] +backfill_2d_inplace_float32 = backfill_2d_inplace["float32_t"] +backfill_2d_inplace_object = backfill_2d_inplace["object"] +backfill_2d_inplace_int32 = backfill_2d_inplace["int32_t"] +backfill_2d_inplace_int64 = backfill_2d_inplace["int64_t"]
+backfill_2d_inplace_uint64 = backfill_2d_inplace["uint64_t"] +backfill_2d_inplace_bool = backfill_2d_inplace["uint8_t"] + + +{{py: + +# name, c_type, dtype, can_hold_na, nogil +dtypes = [('float64', 'float64_t', 'np.float64', True, True), + ('float32', 'float32_t', 'np.float32', True, True), + ('object', 'object', 'object', True, False), + ('int32', 'int32_t', 'np.int32', False, True), + ('int64', 'int64_t', 'np.int64', False, True), + ('uint64', 'uint64_t', 'np.uint64', False, True), + ('bool', 'uint8_t', 'np.bool', False, True)] + +def get_dispatch(dtypes): + + for name, c_type, dtype, can_hold_na, nogil in dtypes: + + nogil_str = 'with nogil:' if nogil else '' + tab = ' ' if nogil else '' + yield name, c_type, dtype, can_hold_na, nogil_str, tab +}} + +{{for name, c_type, dtype, can_hold_na, nogil_str, tab + in get_dispatch(dtypes)}} + + @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): @@ -429,22 +527,6 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): return is_monotonic_inc, is_monotonic_dec, \ is_unique and (is_monotonic_inc or is_monotonic_dec) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_{{name}}(ndarray[{{c_type}}] index, object func): - cdef: - Py_ssize_t length = index.shape[0] - Py_ssize_t i = 0 - ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas._libs.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - {{endfor}} #---------------------------------------------------------------------- diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 0e69324acd341..4883e067ea8c4 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -264,29 +264,34 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, # take_2d internal function 
#---------------------------------------------------------------------- -{{py: - -# dtype, ctype, init_result -dtypes = [('float64', 'float64_t', 'np.empty_like(values)'), - ('uint64', 'uint64_t', 'np.empty_like(values)'), - ('object', 'object', 'values.copy()'), - ('int64', 'int64_t', 'np.empty_like(values)')] -}} +ctypedef fused take_t: + float64_t + uint64_t + object + int64_t -{{for dtype, ctype, init_result in dtypes}} -cdef _take_2d_{{dtype}}(ndarray[{{ctype}}, ndim=2] values, object idx): +cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): cdef: Py_ssize_t i, j, N, K ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[{{ctype}}, ndim=2] result + ndarray[take_t, ndim=2] result object val N, K = ( values).shape - result = {{init_result}} + if take_t is object: + result = values.copy() + else: + result = np.empty_like(values) + for i in range(N): for j in range(K): result[i, j] = values[i, indexer[i, j]] return result -{{endfor}} + +# TODO: Are these treated as cdefs? 
+_take_2d_float64 = _take_2d[float64_t] +_take_2d_uint64 = _take_2d[uint64_t] +_take_2d_object = _take_2d[object] +_take_2d_int64 = _take_2d[int64_t] diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 0062a6c8d31ab..7b1dc8f41575c 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -593,37 +593,26 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # group_min, group_max #---------------------------------------------------------------------- -{{py: - -# name, c_type, dest_type2, nan_val -dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'), - ('float32', 'float32_t', 'NAN', 'np.inf'), - ('int64', 'int64_t', 'iNaT', '_int64_max')] - -def get_dispatch(dtypes): - - for name, dest_type2, nan_val, inf_val in dtypes: - yield name, dest_type2, nan_val, inf_val -}} - - -{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}} +ctypedef fused group_t: + float64_t + float32_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): +def group_max(ndarray[group_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[group_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] maxx, nobs + group_t val, count + ndarray[group_t, ndim=2] maxx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -633,7 +622,12 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) maxx = np.empty_like(out) - maxx.fill(-{{inf_val}}) + + if group_t is int64_t: + # evaluated at compile-time + maxx.fill(-_int64_max) + else: + maxx.fill(-np.inf) N, K = ( values).shape @@ -648,11 +642,9 @@ def 
group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} + if ((group_t is int64_t and val != iNaT) or + (group_t is not int64_t and + val == val and val != NAN)): nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -660,25 +652,33 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + if group_t is int64_t: + out[i, j] = iNaT + else: + out[i, j] = NAN else: out[i, j] = maxx[i, j] +group_max_float64 = group_max["float64_t"] +group_max_float32 = group_max["float32_t"] +group_max_int64 = group_max["int64_t"] + + @cython.wraparound(False) @cython.boundscheck(False) -def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[int64_t] counts, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): +def group_min(ndarray[group_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[group_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count - ndarray[{{dest_type2}}, ndim=2] minx, nobs + group_t val, count + ndarray[group_t, ndim=2] minx, nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -688,7 +688,12 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, nobs = np.zeros_like(out) minx = np.empty_like(out) - minx.fill({{inf_val}}) + + if group_t is int64_t: + # evaluated at compile-time + minx.fill(_int64_max) + else: + minx.fill(np.inf) N, K = ( values).shape @@ -703,11 +708,9 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} + if ((group_t is int64_t and 
val != iNaT) or + (group_t is not int64_t and + val == val and val != NAN)): nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -715,29 +718,42 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + if group_t is int64_t: + out[i, j] = iNaT + else: + out[i, j] = NAN else: out[i, j] = minx[i, j] +group_min_float64 = group_min["float64_t"] +group_min_float32 = group_min["float32_t"] +group_min_int64 = group_min["int64_t"] + + @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - bint is_datetimelike): +def group_cummin(ndarray[group_t, ndim=2] out, + ndarray[group_t, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, mval - ndarray[{{dest_type2}}, ndim=2] accum + group_t val, mval + ndarray[group_t, ndim=2] accum int64_t lab N, K = ( values).shape accum = np.empty_like(values) - accum.fill({{inf_val}}) + + if group_t is int64_t: + # evaluated at compile-time + accum.fill(_int64_max) + else: + accum.fill(np.inf) with nogil: for i in range(N): @@ -749,37 +765,50 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # val = nan - {{if name == 'int64'}} - if is_datetimelike and val == {{nan_val}}: - out[i, j] = {{nan_val}} + if group_t is int64_t: + # evaluated at compile-time + if is_datetimelike and val == iNaT: + out[i, j] = iNaT + continue + else: - {{else}} - if val == val: - {{endif}} - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if val != val: + continue + + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +group_cummin_float64 = group_cummin["float64_t"] +group_cummin_float32 = group_cummin["float32_t"] 
+group_cummin_int64 = group_cummin["int64_t"] @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels, - bint is_datetimelike): +def group_cummax(ndarray[group_t, ndim=2] out, + ndarray[group_t, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike): """ Only transforms on axis=0 """ cdef: Py_ssize_t i, j, N, K, size - {{dest_type2}} val, mval - ndarray[{{dest_type2}}, ndim=2] accum + group_t val, mval + ndarray[group_t, ndim=2] accum int64_t lab N, K = ( values).shape accum = np.empty_like(values) - accum.fill(-{{inf_val}}) + + if group_t is int64_t: + # evaluated at compile-time + accum.fill(-_int64_max) + else: + accum.fill(-np.inf) with nogil: for i in range(N): @@ -790,16 +819,21 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for j in range(K): val = values[i, j] - {{if name == 'int64'}} - if is_datetimelike and val == {{nan_val}}: - out[i, j] = {{nan_val}} + if group_t is int64_t: + # evaluated at compile-time + if is_datetimelike and val == iNaT: + out[i, j] = iNaT + continue else: - {{else}} - if val == val: - {{endif}} - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval + if val != val: + continue -{{endfor}} + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +group_cummax_float64 = group_cummax["float64_t"] +group_cummax_float32 = group_cummax["float32_t"] +group_cummax_int64 = group_cummax["int64_t"] diff --git a/pandas/_libs/join_helper.pxi.in b/pandas/_libs/join_helper.pxi.in index feb8cfb76a7f0..3b84edc1c60d3 100644 --- a/pandas/_libs/join_helper.pxi.in +++ b/pandas/_libs/join_helper.pxi.in @@ -8,24 +8,13 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # left_join_indexer, inner_join_indexer, outer_join_indexer #---------------------------------------------------------------------- -{{py: - 
-# name, c_type, dtype -dtypes = [('float64', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'np.float32'), - ('object', 'object', 'object'), - ('int32', 'int32_t', 'np.int32'), - ('int64', 'int64_t', 'np.int64'), - ('uint64', 'uint64_t', 'np.uint64')] - -def get_dispatch(dtypes): - - for name, c_type, dtype in dtypes: - yield name, c_type, dtype - -}} - -{{for name, c_type, dtype in get_dispatch(dtypes)}} +ctypedef fused join_t: + float64_t + float32_t + object + int32_t + int64_t + uint64_t # Joins on ordered, unique indices @@ -34,12 +23,11 @@ def get_dispatch(dtypes): @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t] indexer - {{c_type}} lval, rval + join_t lval, rval i = 0 j = 0 @@ -78,6 +66,37 @@ def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left, return indexer +left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] +left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] +left_join_indexer_unique_object = left_join_indexer_unique["object"] +left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] +left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] +left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] + +{{py: + +# name, c_type, dtype +dtypes = [('float64', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'np.float32'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'np.int32'), + ('int64', 'int64_t', 'np.int64'), + ('uint64', 'uint64_t', 'np.uint64')] + +def get_dispatch(dtypes): + + for name, c_type, dtype in dtypes: + yield name, c_type, dtype + +}} + +{{for name, c_type, dtype in get_dispatch(dtypes)}} + +# Joins on ordered, unique indices + +# right might contain non-unique values + + # @cython.wraparound(False) 
# @cython.boundscheck(False) def left_join_indexer_{{name}}(ndarray[{{c_type}}] left, diff --git a/pandas/_libs/reshape_helper.pxi.in b/pandas/_libs/reshape_helper.pxi.in index bb9a5977f8b45..0eab84c71ee71 100644 --- a/pandas/_libs/reshape_helper.pxi.in +++ b/pandas/_libs/reshape_helper.pxi.in @@ -8,34 +8,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # reshape # ---------------------------------------------------------------------- -{{py: - -# name, c_type -dtypes = [('uint8', 'uint8_t'), - ('uint16', 'uint16_t'), - ('uint32', 'uint32_t'), - ('uint64', 'uint64_t'), - ('int8', 'int8_t'), - ('int16', 'int16_t'), - ('int32', 'int32_t'), - ('int64', 'int64_t'), - ('float32', 'float32_t'), - ('float64', 'float64_t'), - ('object', 'object')] -}} - -{{for dtype, c_type in dtypes}} - +ctypedef fused reshape_t: + uint8_t + uint16_t + uint32_t + uint64_t + int8_t + int16_t + int32_t + int64_t + float32_t + float64_t + object @cython.wraparound(False) @cython.boundscheck(False) -def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values, - ndarray[uint8_t, ndim=1] mask, - Py_ssize_t stride, - Py_ssize_t length, - Py_ssize_t width, - ndarray[{{c_type}}, ndim=2] new_values, - ndarray[uint8_t, ndim=2] new_mask): +def unstack(ndarray[reshape_t, ndim=2] values, + ndarray[uint8_t, ndim=1] mask, + Py_ssize_t stride, + Py_ssize_t length, + Py_ssize_t width, + ndarray[reshape_t, ndim=2] new_values, + ndarray[uint8_t, ndim=2] new_mask): """ transform long sorted_values to wide new_values @@ -50,23 +44,33 @@ def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values, result array new_mask : boolean ndarray result mask - """ - cdef: Py_ssize_t i, j, w, nulls, s, offset - {{if dtype == 'object'}} - if True: - {{else}} - with nogil: - {{endif}} + if reshape_t is not object: + with nogil: + for i in range(stride): + nulls = 0 - for i in range(stride): + for j in range(length): + for w in range(width): + offset = j * width + w + + if mask[offset]: + s = i * width 
+ w + new_values[j, s] = values[offset - nulls, i] + new_mask[j, s] = 1 + else: + nulls += 1 + + else: + # identical to above version, but "with nogil" is not available + for i in range(stride): nulls = 0 - for j in range(length): + for j in range(length): for w in range(width): offset = j * width + w @@ -78,4 +82,15 @@ def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values, else: nulls += 1 -{{endfor}} + +unstack_uint8 = unstack["uint8_t"] +unstack_uint16 = unstack["uint16_t"] +unstack_uint32 = unstack["uint32_t"] +unstack_uint64 = unstack["uint64_t"] +unstack_int8 = unstack["int8_t"] +unstack_int16 = unstack["int16_t"] +unstack_int32 = unstack["int32_t"] +unstack_int64 = unstack["int64_t"] +unstack_float32 = unstack["float32_t"] +unstack_float64 = unstack["float64_t"] +unstack_object = unstack["object"]