pymc-devs
diff --git a/‎pytensor/link/numba/dispatch/basic.py
+1-1 b/‎pytensor/link/numba/dispatch/basic.py
+1-1
diff --git a/‎pytensor/link/numba/dispatch/linalg/decomposition/lu.py
+206 b/‎pytensor/link/numba/dispatch/linalg/decomposition/lu.py
+206
diff --git a/‎pytensor/link/numba/dispatch/linalg/decomposition/lu_factor.py
+86 b/‎pytensor/link/numba/dispatch/linalg/decomposition/lu_factor.py
+86
@@ -76,7 +76,7 @@ def numba_njit(*args, fastmath=None, **kwargs):
         message=(
             "(\x1b\\[1m)*"  # ansi escape code for bold text
             "Cannot cache compiled function "
-            '"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve)" '
+            '"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve|lu_factor)" '
             "as it uses dynamic globals"
         ),
         category=NumbaWarning,
 
@@ -0,0 +1,206 @@
+from collections.abc import Callable
+from typing import cast as typing_cast
+
+import numpy as np
+from numba import njit as numba_njit
+from numba.core.extending import overload
+from numba.np.linalg import ensure_lapack
+from scipy import linalg
+
+from pytensor.link.numba.dispatch.linalg.decomposition.lu_factor import _getrf
+from pytensor.link.numba.dispatch.linalg.utils import _check_scipy_linalg_matrix
+
+
+@numba_njit
+def _pivot_to_permutation(p, dtype):
+    """
+    Turn a sequence of 0-based row-swap pivots into an array of permuted indices.
+
+    Starts from ``np.arange(len(p))`` cast to ``dtype`` and, for each position ``i`` in order, swaps entries ``i``
+    and ``p[i]``. The returned array has length ``len(p)`` and dtype ``dtype``.
+    """
+    p_inv = np.arange(len(p)).astype(dtype)
+    for i in range(len(p)):
+        # Swaps are applied sequentially: each one acts on the result of all earlier swaps.
+        p_inv[i], p_inv[p[i]] = p_inv[p[i]], p_inv[i]
+    return p_inv
+
+
+@numba_njit
+def _lu_factor_to_lu(a, dtype, overwrite_a):
+    """
+    Factorize ``a`` with ``_getrf`` and unpack the packed result into ``(perm, L, U)``.
+
+    ``L`` is the strictly lower triangle of the packed factorization plus an identity diagonal, ``U`` is its upper
+    triangle, and ``perm`` is the argsort of the array built from the 0-based pivots. The ``INFO`` status returned
+    by ``_getrf`` is not inspected here.
+
+    NOTE(review): ``L`` is sized from the last dimension of the factorization only, so this presumably expects a
+    square ``a`` -- confirm against callers.
+    """
+    A_copy, IPIV, INFO = _getrf(a, overwrite_a=overwrite_a)
+
+    # Identity supplies the unit diagonal of L; the strict lower triangle comes from the packed factorization.
+    L = np.eye(A_copy.shape[-1], dtype=dtype)
+    L += np.tril(A_copy, k=-1)
+    U = np.triu(A_copy)
+
+    # Fortran is 1 indexed, so we need to subtract 1 from the IPIV array
+    IPIV = IPIV - 1
+    # p_inv is created with the matrix dtype; argsort turns it into an array of integer indices.
+    p_inv = _pivot_to_permutation(IPIV, dtype=dtype)
+    perm = np.argsort(p_inv)
+
+    return perm, L, U
+
+
+def _lu_1(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target to avoid side-effects on users who import
+    Pytensor.
+
+    Intended for the case where permute_l is False and p_indices is True, in which scipy.linalg.lu returns a tuple
+    of (perm, L, U), where perm is an integer array of row indices such that L[perm] @ U = A.
+
+    All arguments are forwarded unchanged to scipy.linalg.lu; the cast only narrows the return type for type
+    checkers.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+def _lu_2(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target to avoid side-effects on users who import
+    Pytensor.
+
+    Intended for the case where permute_l is True, in which scipy.linalg.lu returns a tuple of (PL, U), where PL
+    is the permuted L matrix, PL = P @ L, so that PL @ U = A.
+
+    All arguments are forwarded unchanged to scipy.linalg.lu; the cast only narrows the return type for type
+    checkers.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+def _lu_3(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Thin wrapper around scipy.linalg.lu. Used as an overload target to avoid side-effects on users who import
+    Pytensor.
+
+    Intended for the case where permute_l is False and p_indices is False, in which scipy.linalg.lu returns a tuple
+    of (P, L, U), where P is the permutation matrix, P @ L @ U = A.
+
+    All arguments are forwarded unchanged to scipy.linalg.lu; the cast only narrows the return type for type
+    checkers.
+    """
+    return typing_cast(
+        tuple[np.ndarray, np.ndarray, np.ndarray],
+        linalg.lu(
+            a,
+            permute_l=permute_l,
+            check_finite=check_finite,
+            p_indices=p_indices,
+            overwrite_a=overwrite_a,
+        ),
+    )
+
+
+@overload(_lu_1)
+def lu_impl_1(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[
+    [np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray, np.ndarray]
+]:
+    """
+    Numba overload of ``_lu_1``, covering the case where permute_l is False and p_indices is True. The returned
+    implementation yields a tuple of (perm, L, U), where perm is an integer array of row indices such that
+    L[perm] @ U = A.
+
+    ``ensure_lapack`` and ``_check_scipy_linalg_matrix`` run once here, before the implementation is returned.
+    The implementation itself ignores ``permute_l``, ``check_finite`` and ``p_indices``.
+    """
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    # Captured by the closure below so the factors are built with the input's dtype.
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        perm, L, U = _lu_factor_to_lu(a, dtype, overwrite_a)
+        return perm, L, U
+
+    return impl
+
+
+@overload(_lu_2)
+def lu_impl_2(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[[np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray]]:
+    """
+    Numba overload of ``_lu_2``, covering the case where permute_l is True. The returned implementation yields a
+    tuple of (PL, U), where PL is the permuted L matrix, PL = P @ L.
+
+    ``ensure_lapack`` and ``_check_scipy_linalg_matrix`` run once here, before the implementation is returned.
+    The implementation itself ignores ``permute_l``, ``check_finite`` and ``p_indices``.
+    """
+
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    # Captured by the closure below so the factors are built with the input's dtype.
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        perm, L, U = _lu_factor_to_lu(a, dtype, overwrite_a)
+        # Reordering the rows of L by perm is the same as multiplying by the permutation matrix.
+        PL = L[perm]
+
+        return PL, U
+
+    return impl
+
+
+@overload(_lu_3)
+def lu_impl_3(
+    a: np.ndarray,
+    permute_l: bool,
+    check_finite: bool,
+    p_indices: bool,
+    overwrite_a: bool,
+) -> Callable[
+    [np.ndarray, bool, bool, bool, bool], tuple[np.ndarray, np.ndarray, np.ndarray]
+]:
+    """
+    Numba overload of ``_lu_3``, covering the case where permute_l is False and p_indices is False. The returned
+    implementation yields a tuple of (P, L, U), such that P @ L @ U = A.
+
+    ``ensure_lapack`` and ``_check_scipy_linalg_matrix`` run once here, before the implementation is returned.
+    The implementation itself ignores ``permute_l``, ``check_finite`` and ``p_indices``.
+    """
+    ensure_lapack()
+    _check_scipy_linalg_matrix(a, "lu")
+    # Captured by the closure below so the factors are built with the input's dtype.
+    dtype = a.dtype
+
+    def impl(
+        a: np.ndarray,
+        permute_l: bool,
+        check_finite: bool,
+        p_indices: bool,
+        overwrite_a: bool,
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        perm, L, U = _lu_factor_to_lu(a, dtype, overwrite_a)
+        # Build the dense permutation matrix by reordering the rows of an identity matrix.
+        P = np.eye(a.shape[-1], dtype=dtype)[perm]
+
+        return P, L, U
+
+    return impl
@@ -0,0 +1,86 @@
+from collections.abc import Callable
+
+import numpy as np
+from numba.core.extending import overload
+from numba.np.linalg import _copy_to_fortran_order, ensure_lapack
+from scipy import linalg
+
+from pytensor.link.numba.dispatch.linalg._LAPACK import (
+    _LAPACK,
+    _get_underlying_float,
+    int_ptr_to_val,
+    val_to_int_ptr,
+)
+from pytensor.link.numba.dispatch.linalg.utils import (
+    _check_scipy_linalg_matrix,
+)
+
+
+def _getrf(A, overwrite_a=False) -> tuple[np.ndarray, np.ndarray, int]:
+    """
+    Underlying LAPACK function used for LU factorization. Compared to scipy.linalg.lu_factor, this function also
+    returns an info code with diagnostic information.
+
+    Looks up the ``getrf`` routine matching ``A`` via ``scipy.linalg.get_lapack_funcs`` and returns its three
+    outputs unchanged, as ``(A_copy, ipiv, info)``.
+    """
+    (getrf,) = linalg.get_lapack_funcs("getrf", (A,))
+    A_copy, ipiv, info = getrf(A, overwrite_a=overwrite_a)
+
+    return A_copy, ipiv, info
+
+
+@overload(_getrf)
+def getrf_impl(
+    A: np.ndarray, overwrite_a: bool = False
+) -> Callable[[np.ndarray, bool], tuple[np.ndarray, np.ndarray, int]]:
+    ensure_lapack()
+    _check_scipy_linalg_matrix(A, "getrf")
+    dtype = A.dtype
+    w_type = _get_underlying_float(dtype)
+    numba_getrf = _LAPACK().numba_xgetrf(dtype)
+
+    def impl(
+        A: np.ndarray, overwrite_a: bool = False
+    ) -> tuple[np.ndarray, np.ndarray, int]:
+        _M, _N = np.int32(A.shape[-2:])  # type: ignore
+
+        if overwrite_a and A.flags.f_contiguous:
+            A_copy = A
+        else:
+            A_copy = _copy_to_fortran_order(A)
+
+        M = val_to_int_ptr(_M)  # type: ignore
+        N = val_to_int_ptr(_N)  # type: ignore
+        LDA = val_to_int_ptr(_M)  # type: ignore
+        IPIV = np.empty(_N, dtype=np.int32)  # type: ignore
+        INFO = val_to_int_ptr(0)
+
+        numba_getrf(M, N, A_copy.view(w_type).ctypes, LDA, IPIV.ctypes, INFO)
+
+        return A_copy, IPIV, int_ptr_to_val(INFO)
+
+    return impl
+
+
+def _lu_factor(A: np.ndarray, overwrite_a: bool = False):
+    """
+    Thin wrapper around scipy.linalg.lu_factor. Used as an overload target to avoid side-effects on users who import
+    Pytensor.
+
+    Forwards ``A`` and ``overwrite_a`` to scipy.linalg.lu_factor and returns its result unchanged.
+    """
+    return linalg.lu_factor(A, overwrite_a=overwrite_a)
+
+
+@overload(_lu_factor)
+def lu_factor_impl(
+    A: np.ndarray, overwrite_a: bool = False
+) -> Callable[[np.ndarray, bool], tuple[np.ndarray, np.ndarray]]:
+    """
+    Numba overload of ``_lu_factor``.
+
+    The returned implementation factorizes ``A`` with ``_getrf`` and yields ``(A_copy, IPIV)``, the packed LU
+    factorization and the pivot indices shifted to 0-based. It raises ``np.linalg.LinAlgError`` whenever the
+    status code reported by ``_getrf`` is nonzero.
+    """
+    ensure_lapack()
+    _check_scipy_linalg_matrix(A, "lu_factor")
+
+    def impl(A: np.ndarray, overwrite_a: bool = False) -> tuple[np.ndarray, np.ndarray]:
+        A_copy, IPIV, INFO = _getrf(A, overwrite_a=overwrite_a)
+        IPIV -= 1  # LAPACK uses 1-based indexing, convert to 0-based
+
+        # Any nonzero status is treated as a failure; no distinction is made between positive and negative codes.
+        if INFO != 0:
+            raise np.linalg.LinAlgError("LU decomposition failed")
+        return A_copy, IPIV
+
+    return impl