
Commit fe0694c

lezcano authored and cyyever committed
Rewrite svd and linalg.svd as structured kernels (#69827)
Summary: Pull Request resolved: pytorch/pytorch#69827

In general, the new pattern allows implementing optimisations for all the backends in a common place (see, for example, the optimisation for empty matrices).

After this PR, `torch.svd` is implemented in terms of `linalg.svd` and `linalg.svdvals`, as expected. This makes it differentiable when `compute_uv=False`, although this is not particularly important, as `torch.svd` will eventually be deprecated.

This PR also instantiates smaller `U` / `V` when calling cusolver_gesvdj with `full_matrices=False` or `compute_uv=False`. In those cases, the memory for the auxiliary `U` and `V` needed by some cuSOLVER routines is allocated via raw allocators rather than through fully fledged tensors, as it is just a blob of memory the algorithm requests. As the code is better structured now, it was easier to see that `U` and `Vh` need not be allocated at all when calling `svd_cusolver_gesvd`.

`linalg.svdvals` now works as expected with respect to the `out=` parameter. Note that in the test `test_svd_memory_allocation` we were passing a tensor of the wrong size and dtype and the test seemed to pass...

This PR also changes the backward formula to avoid saving the input matrix, as it is not necessary. In a follow-up PR, I will clean up the backward formula and make it more numerically stable and efficient.

This PR also makes a number of memory optimisations here and there, and fixes the call to cusolver_gesvd, which was incorrect for m <= n. To test this path, I compiled the code with a flag to unconditionally execute the `if (!gesvdj_convergence_check.empty())` branch, and all the tests passed.

I also took this chance to simplify the tests for these functions in `test_linalg.py`, as we had many tests exercising functionality that is already covered by the corresponding OpInfos. I used xwang233's feature to test both the MAGMA and cuSOLVER backends. This is particularly useful for SVD, as cuSOLVER is always chosen over MAGMA when available, so testing MAGMA otherwise would be tricky.

cc jianyuh nikitaved pearu mruberry walterddr IvanYashchuk xwang233 Lezcano

Test Plan: Imported from OSS

Reviewed By: mikaylagawarecki

Differential Revision: D33751983

Pulled By: mruberry

fbshipit-source-id: 11d48d977946345583d33d14fb11a170a7d14fd2

(cherry picked from commit a1860bd)
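For context on the `torch.svd` / `linalg.svd` relationship described above, here is a minimal Python sketch (illustrative only, not the actual ATen implementation introduced by this PR):

```python
import torch

def svd_via_linalg(A, some=True):
    # Sketch only: torch.svd's `some=True` corresponds to linalg.svd's
    # `full_matrices=False`, and torch.svd returns V rather than its
    # conjugate transpose Vh.
    U, S, Vh = torch.linalg.svd(A, full_matrices=not some)
    return U, S, Vh.transpose(-2, -1).conj()

A = torch.randn(5, 3, dtype=torch.complex128)
_, S, _ = svd_via_linalg(A)
torch.testing.assert_close(S, torch.svd(A).S)            # same singular values as the old API
torch.testing.assert_close(S, torch.linalg.svdvals(A))   # the compute_uv=False / svdvals path
```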
1 parent 6a34818 commit fe0694c

21 files changed: +637 −848 lines

aten/src/ATen/ConjugateFallback.cpp

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,8 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) {
   m.impl("baddbmm", torch::CppFunction::makeFallthrough());
   m.impl("baddbmm_", torch::CppFunction::makeFallthrough());
   m.impl("baddbmm.out", torch::CppFunction::makeFallthrough());
+  m.impl("linalg_svd", torch::CppFunction::makeFallthrough());
+  m.impl("linalg_svd.U", torch::CppFunction::makeFallthrough());
 
   TORCH_VIEW_FNS(m)
   TENSOR_UTILITIES_AND_CONSTRUCTORS(m)
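These fallthrough registrations let `linalg_svd` consume lazily conjugated inputs without the conjugate fallback materializing a copy first. A minimal sketch of the invariant this relies on, using only the public Python API (not part of this diff):

```python
import torch

# The singular values of conj(A) are those of A, so the SVD kernels can accept
# a conj-flagged view directly instead of a materialized conjugated copy.
A = torch.randn(4, 3, dtype=torch.complex128)
Ac = A.conj()
assert Ac.is_conj()  # still a lazy conjugate view

torch.testing.assert_close(torch.linalg.svdvals(Ac), torch.linalg.svdvals(A))
```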

aten/src/ATen/native/BatchLinearAlgebra.cpp

Lines changed: 191 additions & 189 deletions
Large diffs are not rendered by default.

aten/src/ATen/native/BatchLinearAlgebra.h

Lines changed: 11 additions & 0 deletions
@@ -161,6 +161,8 @@ void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv,
 template <class scalar_t>
 void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info);
 
+template<class scalar_t, class value_t=scalar_t>
+void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
 #endif
 
 #if AT_BUILD_WITH_BLAS()
@@ -239,5 +241,14 @@ using lu_solve_trans_fn = void (*)(
     TransposeType /*trans*/);
 DECLARE_DISPATCH(lu_solve_trans_fn, lu_solve_trans_stub);
 
+using svd_fn = void (*)(
+    const Tensor& /*A*/,
+    const bool /*full_matrices*/,
+    const bool /*compute_uv*/,
+    const Tensor& /*U*/,
+    const Tensor& /*S*/,
+    const Tensor& /*Vh*/,
+    const Tensor& /*info*/);
+DECLARE_DISPATCH(svd_fn, svd_stub);
 
 }} // namespace at::native

aten/src/ATen/native/BatchLinearAlgebraKernel.cpp

Lines changed: 83 additions & 0 deletions
@@ -943,6 +943,84 @@ void lu_solve_kernel(const Tensor& b, const Tensor& lu, const Tensor& pivots) {
   lu_solve_trans_kernel(b, lu, pivots, TransposeType::NoTranspose);
 }
 
+template <typename scalar_t>
+static void apply_svd(const Tensor& A,
+                      const bool full_matrices,
+                      const bool compute_uv,
+                      const Tensor& U,
+                      const Tensor& S,
+                      const Tensor& Vh,
+                      const Tensor& info) {
+#if !AT_BUILD_WITH_LAPACK()
+  TORCH_CHECK(false, "svd: LAPACK library not found in compilation");
+#else
+  using value_t = typename c10::scalar_value_type<scalar_t>::type;
+  const auto A_data = A.data_ptr<scalar_t>();
+  const auto U_data = compute_uv ? U.data_ptr<scalar_t>() : nullptr;
+  const auto S_data = S.data_ptr<value_t>();
+  const auto info_data = info.data_ptr<int>();
+  const auto Vh_data = compute_uv ? Vh.data_ptr<scalar_t>() : nullptr;
+  const auto A_stride = matrixStride(A);
+  const auto S_stride = S.size(-1);
+  const auto U_stride = compute_uv ? matrixStride(U) : 1;
+  const auto Vh_stride = compute_uv ? matrixStride(Vh) : 1;
+  const auto batchsize = batchCount(A);
+  const char jobz = compute_uv ? (full_matrices ? 'A' : 'S') : 'N';
+
+  const auto m = A.size(-2);
+  const auto n = A.size(-1);
+  const auto lda = A.stride(-1);
+  const auto ldu = compute_uv ? U.stride(-1) : 1;
+  const auto ldvh = compute_uv ? Vh.stride(-1) : 1;
+
+  auto iwork = std::vector<int>(8 * std::min(m, n));
+  auto* const iwork_data = iwork.data();
+
+  // rwork is just used for the complex decomposition
+  auto rwork = std::vector<value_t>{};
+  if (A.is_complex()) {
+    rwork.resize(std::max(computeLRWorkDim(jobz, m, n), int64_t{1}));
+  }
+  auto* const rwork_data = rwork.data();
+
+  // Query svd for the optimal lwork size
+  int lwork = -1;
+  {
+    scalar_t wkopt;
+    lapackSvd<scalar_t, value_t>(jobz, m, n, A_data, lda, S_data, U_data, ldu, Vh_data, ldvh, &wkopt, lwork, rwork_data, iwork_data, info_data);
+    lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
+  }
+  auto work = std::vector<scalar_t>(lwork);
+  auto* const work_data = work.data();
+
+  for (const auto i : c10::irange(batchsize)) {
+    auto* const A_working_ptr = &A_data[i * A_stride];
+    auto* const S_working_ptr = &S_data[i * S_stride];
+    auto* const U_working_ptr = compute_uv ? &U_data[i * U_stride] : nullptr;
+    auto* const Vh_working_ptr = compute_uv ? &Vh_data[i * Vh_stride] : nullptr;
+
+    // Compute S, U (optionally) and Vh (optionally)
+    lapackSvd<scalar_t, value_t>(jobz, m, n, A_working_ptr, lda,
+                                 S_working_ptr, U_working_ptr, ldu, Vh_working_ptr, ldvh, work_data, lwork, rwork_data, iwork_data, info_data + i);
+  }
+#endif
+}
+
+void svd_kernel(const Tensor& A,
+                const bool full_matrices,
+                const bool compute_uv,
+                const Tensor& U,
+                const Tensor& S,
+                const Tensor& Vh,
+                const Tensor& infos) {
+  // Need to copy A as column major, as its contents will be destroyed in the LAPACK call.
+  // FIXME It'd be more efficient, rather than cloning A, to copy it into `U` or `Vh` (depending on m > n
+  // or m < n) and call jobz='O'
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "linalg_svd_cpu", [&]{
+    apply_svd<scalar_t>(cloneBatchedColumnMajor(A), full_matrices, compute_uv, U, S, Vh, infos);
+  });
+}
+
 } // anonymous namespace
 
 REGISTER_ARCH_DISPATCH(cholesky_stub, DEFAULT, &cholesky_kernel);
@@ -1023,4 +1101,9 @@ REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel);
 REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel);
 REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel);
 
+REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel);
+REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel);
+REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel);
+REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel);
+REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel);
 }} // namespace at::native
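For reference, the `jobz` flag above follows the usual LAPACK `?gesdd` convention ('A' for full matrices, 'S' for the thin decomposition, 'N' for singular values only), and the kernel uses the standard two-call pattern of querying the optimal workspace with `lwork = -1` before allocating it. A small sketch of how those modes surface in the Python API (not part of this diff):

```python
import torch

m, n = 5, 3
k = min(m, n)
A = torch.randn(m, n)

# full_matrices=True  -> jobz='A': square U (m, m) and Vh (n, n)
U, S, Vh = torch.linalg.svd(A, full_matrices=True)
assert U.shape == (m, m) and S.shape == (k,) and Vh.shape == (n, n)

# full_matrices=False -> jobz='S': only the leading k columns/rows are computed
U, S, Vh = torch.linalg.svd(A, full_matrices=False)
assert U.shape == (m, k) and Vh.shape == (k, n)

# compute_uv=False    -> jobz='N': singular values only, via linalg.svdvals
S = torch.linalg.svdvals(A)
assert S.shape == (k,)
```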

aten/src/ATen/native/LinearAlgebraUtils.h

Lines changed: 7 additions & 48 deletions
@@ -14,6 +14,7 @@
 #include <cstring>
 #include <cctype>
 
+
 namespace at { namespace native {
 
 // Used as an interface between the different BLAS-like libraries
@@ -248,7 +249,6 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
   iter.serial_for_each(loop, {0, batchCount(b)});
 }
 
-
 // Returns the epsilon value for floating types except half
 static inline double _get_epsilon(const ScalarType& sc_type) {
   switch (sc_type) {
@@ -468,55 +468,14 @@ static inline std::tuple<std::vector<int64_t>,
   return std::make_tuple(q_sizes, q_strides, n_columns_q);
 }
 
-// Function to generate empty tensors of required size, strides and dtype for the SVD operation
-static inline std::tuple<Tensor, Tensor, Tensor> _create_U_S_VT(const Tensor& input, bool some, bool compute_uv,
-    const bool svd_use_cusolver=false) {
-
-  // U, S, VT are initialized as empty tensors.
-  // For CPU LAPACK and GPU MAGMA backend, the tensors are initialized on CPU.
-  // For GPU cuSOLVER backend, the tensors are initialized on GPU.
-  const auto usvt_device = svd_use_cusolver ? at::kCUDA : at::kCPU;
-
-  auto sizes = input.sizes().vec();
-  int64_t m = input.size(-2), n = input.size(-1);
-
-  sizes[input.dim() - 1] = some ? std::min(m, n) : m;
-  const auto u_strides = contiguous_strides(sizes, /*f-contig*/true);
-
-  // cuSOLVER's gesvdjBatched fails with illegal memory access and
-  // cuSOLVER's gesvdj fails with CUSOLVER_STATUS_EXECUTION_FAILED
-  // if matrices for U and VT are not allocated
-  // even though the result of computation is not used we need to allocate this memory
-
-  Tensor U_empty = (compute_uv || svd_use_cusolver)
-      ? at::empty_strided(sizes, u_strides, input.options().device(usvt_device))
-      : at::empty({0}, input.options().device(usvt_device));
-
-  // VT should be a column-major or a batch of column-major matrices
-  sizes[input.dim() - 2] = some ? std::min(m, n) : n;
-  sizes[input.dim() - 1] = n;
-  const auto vt_strides = contiguous_strides(sizes, /*f-contig*/!svd_use_cusolver);
-  Tensor VT_empty = (compute_uv || svd_use_cusolver)
-      ? at::empty_strided(sizes, vt_strides, input.options().device(usvt_device))
-      : at::empty({0}, input.options().device(usvt_device));
-
-  // U and VT might not get filled in this case
-  if (!some && compute_uv && input.numel() == 0) {
-    U_empty.zero_();
-    VT_empty.zero_();
-    // make U and VT an identity matrix, because they should be orthogonal
-    U_empty.diagonal(0, -2, -1).fill_(1);
-    VT_empty.diagonal(0, -2, -1).fill_(1);
-  }
-
-  sizes.pop_back();
-  sizes[input.dim() - 2] = std::min(m, n);
-  ScalarType dtype = toValueType(input.scalar_type());
-  Tensor S_empty = at::empty(sizes, input.options().dtype(dtype).device(usvt_device));
-
-  return std::tuple<Tensor, Tensor, Tensor>(U_empty, S_empty, VT_empty);
+static inline bool svd_uses_cusolver(const Tensor& A) {
+  // if cusolver is available, it is used unconditionally
+  return A.is_cuda()
+      && at::globalContext().hasCuSOLVER()
+      && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma;
 }
 
+
 // Function used instead of .to so that the original strides are retained
 // .to doesn't retain strides and make the output tensor contiguous
 static inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) {
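`svd_uses_cusolver` captures the selection rule mentioned in the summary: on CUDA, cuSOLVER is used whenever it is available unless MAGMA is explicitly preferred. A rough sketch of exercising both paths from Python; it assumes a CUDA build with cuSOLVER and that the `torch.backends.cuda.preferred_linalg_library` binding exists in your PyTorch version:

```python
import torch

if torch.cuda.is_available():
    A = torch.randn(64, 32, device="cuda")

    # Default: cuSOLVER is chosen over MAGMA whenever the build has cuSOLVER.
    S_cusolver = torch.linalg.svdvals(A)

    # Explicitly preferring MAGMA makes svd_uses_cusolver return false.
    # (Binding name assumed; availability depends on the PyTorch version.)
    torch.backends.cuda.preferred_linalg_library("magma")
    S_magma = torch.linalg.svdvals(A)
    torch.backends.cuda.preferred_linalg_library("default")

    # The two backends should agree up to floating-point differences.
    torch.testing.assert_close(S_cusolver, S_magma, rtol=1e-4, atol=1e-4)
```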

aten/src/ATen/native/NegateFallback.cpp

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,8 @@ TORCH_LIBRARY_IMPL(aten, Negative, m) {
   // linear algebra functions
   m.impl("linalg_solve_triangular", torch::CppFunction::makeFallthrough());
   m.impl("linalg_solve_triangular.out", torch::CppFunction::makeFallthrough());
+  m.impl("linalg_svd", torch::CppFunction::makeFallthrough());
+  m.impl("linalg_svd.U", torch::CppFunction::makeFallthrough());
 
   TORCH_VIEW_FNS(m)
   TENSOR_UTILITIES_AND_CONSTRUCTORS(m)
