
Commit 06a65ce

[mlir][sparse] schedule sparse kernels in a separate pass from sparsification. (#72423)
1 parent dce7a7c commit 06a65ce
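
At a high level: loop scheduling for sparse kernels (picking an admissible loop order by topologically sorting the iteration graph) moves out of sparsification into a separate pass, backed by the new IterationGraphSorter utility below; CodegenEnv then merely verifies, rather than computes, the admissibility of the incoming loop order.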


53 files changed: +674 -723 lines changed

mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h

Lines changed: 4 additions & 0 deletions
@@ -127,6 +127,10 @@ inline bool hasAnySparseOperandOrResult(Operation *op) {
   return hasAnySparseOperand(op) || hasAnySparseResult(op);
 }
 
+/// Returns true iff MLIR operation has any sparse tensor with non-identity
+/// dim2lvl maps.
+bool hasAnyNonIdentityOperandsOrResults(Operation *op);
+
 //
 // Inference.
 //
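
For context, "dim2lvl" is the mapping from a tensor's dimensions to its storage levels. A concrete case (illustrative, not part of this diff): a CSC-like matrix encoding stores dimensions (d0, d1) at levels (d1, d0), so its dim2lvl map is the non-identity permutation (d0, d1) -> (d1, d0); the new helper returns true for any operation whose sparse operands or results carry such a map.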

mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp

Lines changed: 10 additions & 0 deletions
@@ -875,6 +875,16 @@ bool mlir::sparse_tensor::isUniqueCOOType(Type tp) {
   return isCOOType(getSparseTensorEncoding(tp), 0, /*isUnique=*/true);
 }
 
+bool mlir::sparse_tensor::hasAnyNonIdentityOperandsOrResults(Operation *op) {
+  auto hasNonIdentityMap = [](Value v) {
+    auto stt = tryGetSparseTensorType(v);
+    return stt && !stt->isIdentity();
+  };
+
+  return llvm::any_of(op->getOperands(), hasNonIdentityMap) ||
+         llvm::any_of(op->getResults(), hasNonIdentityMap);
+}
+
 Level mlir::sparse_tensor::getCOOStart(SparseTensorEncodingAttr enc) {
   // We only consider COO region with at least two levels for the purpose
   // of AOS storage optimization.
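
A minimal usage sketch of the new helper (a hypothetical caller, not part of the commit; the pattern shape is the standard OpRewritePattern interface): reject kernels that still carry non-identity dim2lvl maps, since the scheduler introduced by this commit assumes demapped tensors.

  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Hypothetical guard: demap to identity dim2lvl form before scheduling.
    if (sparse_tensor::hasAnyNonIdentityOperandsOrResults(op))
      return failure();
    // ... schedule loops and sparsify ...
    return success();
  }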

mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIRSparseTensorTransforms
   BufferizableOpInterfaceImpl.cpp
   CodegenEnv.cpp
   CodegenUtils.cpp
+  IterationGraphSorter.cpp
   LoopEmitter.cpp
   SparseBufferRewriting.cpp
   SparseGPUCodegen.cpp

mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.cpp

Lines changed: 17 additions & 26 deletions
@@ -57,7 +57,11 @@ CodegenEnv::CodegenEnv(linalg::GenericOp linop, SparsificationOptions opts,
       loopEmitter(), topSort(), sparseOut(nullptr), outerParNest(-1u),
       insChain(), expValues(), expFilled(), expAdded(), expCount(), redVal(),
       redExp(detail::kInvalidId), redCustom(detail::kInvalidId),
-      redValidLexInsert() {}
+      redValidLexInsert() {
+  // TODO: remove topSort, loops should be already sorted by previous pass.
+  for (unsigned l = 0; l < latticeMerger.getNumLoops(); l++)
+    topSort.push_back(l);
+}
 
 LogicalResult CodegenEnv::initTensorExp() {
   // Builds the tensor expression for the Linalg operation in SSA form.
@@ -181,36 +185,23 @@ bool CodegenEnv::isAdmissibleTensorExp(ExprId exp) {
   // Accept "truly dynamic" if the output tensor materializes uninitialized
   // into the computation and insertions occur in lexicographic index order.
   sparseOut = lhs;
-  return isMaterializing(lhs->get());
-}
 
-bool CodegenEnv::isAdmissibleTopoOrder() {
-  if (!hasSparseOutput())
-    return true;
-
-  OpOperand *lhs = linalgOp.getDpsInitOperand(0);
-  // Accept "truly dynamic" if the output tensor materializes uninitialized
-  // into the computation and insertions occur in lexicographic index order.
-  LoopOrd nest = 0;
+  // Find the outermost parallel nest to determine whether compress/expand is
+  // needed.
+  outerParNest = 0;
   const auto iteratorTypes = linalgOp.getIteratorTypesArray();
   assert(topSortSize() == latticeMerger.getNumLoops());
   for (const LoopId i : topSort) {
-    if (!latticeMerger.isFilterLoop(i)) {
-      // We only count non-filter loops as filter loops should be considered
-      // a special type of parallel loops.
-      if (linalg::isReductionIterator(iteratorTypes[i]))
-        break; // terminate at first reduction
-      nest++;
-    }
-  }
-  // Determine admissible dynamic insertion situations:
-  // (1) fully injective, since there are no reductions,
-  // (2) admissible 1-d expansion in innermost dimension.
-  if (static_cast<int64_t>(nest) >= linalgOp.getRank(lhs) - 1) {
-    outerParNest = nest;
-    return true;
+    if (linalg::isReductionIterator(iteratorTypes[i]))
+      break; // terminate at first reduction
+    outerParNest++;
   }
-  return false;
+
+  // Inadmissible kernel should have already been rejected by the previous
+  // pass during loop scheduling.
+  assert(static_cast<int64_t>(outerParNest) >=
+         linalgOp.getRank(linalgOp.getDpsInitOperand(0)) - 1);
+  return isMaterializing(lhs->get());
 }
 
 //===----------------------------------------------------------------------===//
mlir/lib/Dialect/SparseTensor/Transforms/IterationGraphSorter.cpp

Lines changed: 273 additions & 0 deletions

@@ -0,0 +1,273 @@
+//===- IterationGraphSorter.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IterationGraphSorter.h"
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
+#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
+#include "mlir/IR/AffineExprVisitor.h"
+#include "mlir/IR/BuiltinTypes.h"
+
+using namespace mlir;
+using namespace mlir::sparse_tensor;
+
+namespace {
+
+/// A helper class that visits an affine expression and tries to find an
+/// AffineDimExpr to which the corresponding iterator from a GenericOp matches
+/// the desired iterator type.
+/// If there is no matched iterator type, returns the first DimExpr in the
+/// expression.
+class AffineDimFinder : public AffineExprVisitor<AffineDimFinder> {
+public:
+  explicit AffineDimFinder(ArrayRef<utils::IteratorType> itTypes)
+      : iterTypes(itTypes) {}
+
+  // Overrides method from AffineExprVisitor.
+  void visitDimExpr(AffineDimExpr expr) {
+    if (pickedDim == nullptr || pickIterType == iterTypes[expr.getPosition()])
+      pickedDim = expr;
+  }
+
+  /// Sets the desired iterator type that we want to pick.
+  void setPickedIterType(utils::IteratorType iterType) {
+    pickIterType = iterType;
+  }
+
+  /// Gets the desired AffineDimExpr.
+  AffineDimExpr getDimExpr() const {
+    return llvm::cast<AffineDimExpr>(pickedDim);
+  }
+
+  void walkPostOrder(AffineExpr expr) {
+    pickedDim = nullptr;
+    AffineExprVisitor<AffineDimFinder>::walkPostOrder(expr);
+  }
+
+private:
+  /// The picked AffineDimExpr after visit.
+  AffineExpr pickedDim;
+  /// The iterator type that we want.
+  utils::IteratorType pickIterType;
+  /// The mapping between dim=>iterator type.
+  ArrayRef<utils::IteratorType> iterTypes;
+};
+
+// Flattens an affine expression into a list of AffineDimExprs.
+struct AffineDimCollector : public AffineExprVisitor<AffineDimCollector> {
+  // Overrides method from AffineExprVisitor.
+  void visitDimExpr(AffineDimExpr expr) { dims.push_back(expr); }
+  SmallVector<AffineDimExpr> dims;
+};
+
+} // namespace
+
+inline static bool includesAny(SortMask mask1, SortMask mask2) {
+  return static_cast<unsigned>(mask1) & static_cast<unsigned>(mask2);
+}
+
+inline static bool includesDenseInput(SortMask mask) {
+  return includesAny(mask, SortMask::kIncludeDenseInput);
+}
+
+inline static bool includesDenseOutput(SortMask mask) {
+  return includesAny(mask, SortMask::kIncludeDenseOutput);
+}
+
+/// A helper to compute a topological sort. O(n^2) time complexity
+/// as we use adj matrix for the graph.
+/// The sorted result will put the first Reduction iterator to the
+/// latest possible position.
+AffineMap IterationGraphSorter::topoSort() {
+  std::vector<unsigned> redIt; // reduce iterator with 0 degree
+  std::vector<unsigned> parIt; // parallel iterator with 0 degree
+  const unsigned numLoops = getNumLoops();
+  for (unsigned i = 0; i < numLoops; i++) {
+    if (inDegree[i] == 0) {
+      if (iterTypes[i] == utils::IteratorType::reduction)
+        redIt.push_back(i);
+      else
+        parIt.push_back(i);
+    }
+  }
+
+  SmallVector<unsigned> loopOrder;
+  while (!redIt.empty() || !parIt.empty()) {
+    // We always prefer parallel loop over reduction loop because putting
+    // reduction loop early might make the loop sequence inadmissible.
+    auto &it = !parIt.empty() ? parIt : redIt;
+    auto src = it.back();
+    loopOrder.push_back(src);
+    it.pop_back();
+    // Update in-degree, and push 0-degree node into worklist.
+    for (unsigned dst = 0; dst < numLoops; dst++) {
+      if (itGraph[src][dst] && --inDegree[dst] == 0) {
+        if (iterTypes[dst] == utils::IteratorType::reduction)
+          redIt.push_back(dst);
+        else
+          parIt.push_back(dst);
+      }
+    }
+  }
+
+  if (loopOrder.size() == numLoops)
+    return AffineMap::getPermutationMap(loopOrder, out.getContext());
+
+  // Cycle detected.
+  return AffineMap();
+}
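
topoSort above is Kahn's algorithm over an adjacency matrix, with one twist: zero in-degree parallel loops are always drained before zero in-degree reduction loops, pushing reductions as late as the constraints allow. A self-contained sketch of the same policy (illustrative names, not from the commit):

  #include <cassert>
  #include <vector>

  // Priority topological sort: prefer parallel loops, fall back to
  // reduction loops; a short result signals a cycle in the graph.
  static std::vector<unsigned>
  prioritySort(const std::vector<std::vector<bool>> &g,
               std::vector<unsigned> inDeg, const std::vector<bool> &isRed) {
    const unsigned n = g.size();
    std::vector<unsigned> par, red, order;
    for (unsigned i = 0; i < n; i++)
      if (inDeg[i] == 0)
        (isRed[i] ? red : par).push_back(i);
    while (!par.empty() || !red.empty()) {
      auto &wl = !par.empty() ? par : red; // parallel loops first
      unsigned src = wl.back();
      wl.pop_back();
      order.push_back(src);
      for (unsigned dst = 0; dst < n; dst++)
        if (g[src][dst] && --inDeg[dst] == 0)
          (isRed[dst] ? red : par).push_back(dst);
    }
    return order; // order.size() < n means a cycle was detected
  }

  int main() {
    // Constraints d0 -> d2 and d1 -> d2, where d1 is a reduction loop.
    std::vector<std::vector<bool>> g = {{false, false, true},
                                        {false, false, true},
                                        {false, false, false}};
    std::vector<unsigned> inDeg = {0, 0, 2};
    std::vector<bool> isRed = {false, true, false};
    auto order = prioritySort(g, inDeg, isRed);
    assert(order.size() == 3 && order[0] == 0); // parallel d0 comes first
    return 0;
  }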
+
+IterationGraphSorter
+IterationGraphSorter::fromGenericOp(linalg::GenericOp genericOp) {
+  // Must be a demapped sparse kernel.
+  assert(!hasAnyNonIdentityOperandsOrResults(genericOp) &&
+         hasAnySparseOperandOrResult(genericOp) &&
+         genericOp.getNumDpsInits() == 1);
+
+  SmallVector<AffineMap> loopMap = genericOp.getIndexingMapsArray();
+  SmallVector<Value> ins = genericOp.getDpsInputs();
+
+  AffineMap outMap = loopMap.back();
+  loopMap.pop_back();
+
+  Value out = genericOp.getDpsInitOperand(0)->get();
+  SmallVector<utils::IteratorType> iterTypes =
+      genericOp.getIteratorTypesArray();
+
+  return IterationGraphSorter(std::move(ins), std::move(loopMap), out, outMap,
+                              std::move(iterTypes));
+}
+
+IterationGraphSorter::IterationGraphSorter(
+    SmallVector<Value> &&ins, SmallVector<AffineMap> &&loop2InsLvl, Value out,
+    AffineMap loop2OutLvl, SmallVector<utils::IteratorType> &&iterTypes)
+    : ins(std::move(ins)), loop2InsLvl(std::move(loop2InsLvl)), out(out),
+      loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypes)) {
+  // One map per tensor.
+  assert(loop2InsLvl.size() == ins.size());
+  // All the affine maps have the same number of dimensions (loops).
+  assert(llvm::all_equal(llvm::map_range(
+      loop2InsLvl, [](AffineMap m) { return m.getNumDims(); })));
+  // The number of results of the map should match the rank of the tensor.
+  assert(llvm::all_of(llvm::zip(loop2InsLvl, ins), [](auto mvPair) {
+    auto [m, v] = mvPair;
+    return m.getNumResults() ==
+           v.getType().template cast<ShapedType>().getRank();
+  }));
+
+  itGraph.resize(getNumLoops(), std::vector<bool>(getNumLoops(), false));
+  inDegree.resize(getNumLoops());
+}
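
As a concrete reading of the constructor's asserts: for a matrix product C(i, j) = sum_k A(i, k) * B(k, j) expressed as a linalg.generic with loops (d0, d1, d2) = (i, j, k), loop2InsLvl holds (d0, d1, d2) -> (d0, d2) for A and (d0, d1, d2) -> (d2, d1) for B, while loop2OutLvl is (d0, d1, d2) -> (d0, d1); there is one map per input tensor, all maps agree on the number of loops (3), and each map's result count equals its tensor's rank (2).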
+
+AffineMap IterationGraphSorter::sort(SortMask mask, Value ignored) {
+  // Reset the iteration graph.
+  for (auto &row : itGraph)
+    std::fill(row.begin(), row.end(), false);
+  // Reset cached in-degree.
+  std::fill(inDegree.begin(), inDegree.end(), 0);
+
+  for (auto [in, map] : llvm::zip(ins, loop2InsLvl)) {
+    // Get map and encoding.
+    const auto enc = getSparseTensorEncoding(in.getType());
+    // Skip dense inputs when not requested.
+    if ((!enc && !includesDenseInput(mask)) || in == ignored)
+      continue;
+
+    addConstraints(in, map);
+  }
+
+  // Get map and encoding.
+  const auto enc = getSparseTensorEncoding(out.getType());
+  if ((enc || includesDenseOutput(mask)) && out != ignored)
+    addConstraints(out, loop2OutLvl);
+
+  return topoSort();
+}
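
How sort is meant to be driven is not visible in this section; a hedged sketch of a caller (the retry strategy and the SortMask values other than kIncludeDenseInput/kIncludeDenseOutput are assumptions, not taken from this file):

  // Hypothetical caller: relax the sort mask until an acyclic iteration
  // graph is found; an empty AffineMap from sort() signals a cycle.
  AffineMap scheduleLoops(IterationGraphSorter &sorter) {
    // kIncludeAll and kSparseOnly are assumed mask values.
    for (SortMask mask :
         {SortMask::kIncludeAll, SortMask::kIncludeDenseInput,
          SortMask::kIncludeDenseOutput, SortMask::kSparseOnly})
      if (AffineMap order = sorter.sort(mask, /*ignored=*/Value()))
        return order;
    return AffineMap(); // no admissible loop order
  }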
+
+void IterationGraphSorter::addConstraints(Value t, AffineMap loop2LvlMap) {
+  auto addIterOrdering = [this](unsigned f, unsigned t) {
+    if (!itGraph[f][t] && f != t) {
+      itGraph[f][t] = true;
+      inDegree[t]++;
+    }
+  };
+
+  AffineDimFinder finder(iterTypes);
+  finder.setPickedIterType(utils::IteratorType::reduction);
+
+  // To compute the iteration graph for tensor[d0 + d1 + d3, d4 + d5 + d6],
+  // we require there exist d_x \in {d0, d1, d3} and d_y \in {d4, d5, d6},
+  // and d_x > d_y && {d0, d1, d3} - d_x > {d4, d5, d6} - d_y
+  const Level lvlRank = loop2LvlMap.getNumResults();
+  for (Level lvl = 1; lvl < lvlRank; lvl++) {
+    const AffineExpr fa = loop2LvlMap.getResult(lvl - 1);
+    const AffineExpr ta = loop2LvlMap.getResult(lvl);
+
+    if (llvm::isa<AffineDimExpr>(fa) || llvm::isa<AffineDimExpr>(ta)) {
+      // Special case when at least one loop2LvlExp is a simple AffineDimExpr
+      // (say, d0) and we require d0 > {d1, d2, ...} or {d1, d2, ...} > d0
+      AffineDimCollector fCollector;
+      fCollector.walkPostOrder(fa);
+      AffineDimCollector tCollector;
+      tCollector.walkPostOrder(ta);
+
+      for (auto fd : fCollector.dims) {
+        for (auto td : tCollector.dims) {
+          const unsigned f = fd.getPosition();
+          const unsigned t = td.getPosition();
+          addIterOrdering(f, t);
+        }
+      }
+      continue;
+    }
+
+    // When both loop2LvlExprs are compound, we pick an arbitrary reduction
+    // loop from lhs and rhs and use them as d_x and d_y.
+    finder.walkPostOrder(fa);
+    const AffineDimExpr fexp = finder.getDimExpr();
+    const unsigned fldx = fexp.getPosition();
+
+    finder.walkPostOrder(ta);
+    const AffineDimExpr texp = finder.getDimExpr();
+    const unsigned tldx = texp.getPosition();
+
+    // d_x > d_y
+    addIterOrdering(fldx, tldx);
+
+    AffineDimCollector fCollector;
+    fCollector.walkPostOrder(fa);
+    AffineDimCollector tCollector;
+    tCollector.walkPostOrder(ta);
+
+    // Make sure dx and dy are the last.
+    for (auto fd : fCollector.dims) {
+      const unsigned f = fd.getPosition();
+      addIterOrdering(f, fldx);
+    }
+    for (auto td : tCollector.dims) {
+      const unsigned t = td.getPosition();
+      addIterOrdering(t, tldx);
+    }
+    // {d0, d1, d3} - d_x > {d4, d5, d6} - d_y
+    // This is to ensure that the affine expressions are reduced in sparse
+    // tensor level ordering.
+    for (auto fd : fCollector.dims) {
+      const unsigned f = fd.getPosition();
+      if (f == fldx) // skip d_x
+        continue;
+      for (auto td : tCollector.dims) {
+        const unsigned t = td.getPosition();
+        if (t == tldx) // skip d_y
+          continue;
+        addIterOrdering(f, t);
+      }
+    }
+  }
+}
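
A short worked trace of addConstraints (illustrative): for an access tensor[d0 + d1, d2], the pair (fa, ta) = (d0 + d1, d2) hits the special case since ta is a simple AffineDimExpr; the collectors yield {d0, d1} x {d2}, so edges d0 -> d2 and d1 -> d2 are added, forcing both loops of the compound first-level expression to be scheduled before the loop of the second level.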
