[RISCV] Fix the worst case for VSHA2MS in SiFive P400/P600 scheduling models (llvm#116893)

mshockwave · web-flow · commit 0165f8817cbd · 2024-11-21T10:59:46.000-08:00
For each RVV instruction we should have a single WriteRes assignment to the worst case scheduling class. This assignment is usually equal to that of the largest LMUL + smallest SEW. My llvm#114317 accidentally made two of these assignments on `WriteVSHA2MSV_WorstCase`. This won't affect our MachineScheduler nor most of our llvm-mca use cases (assuming you populate the correct LMUL and SEW), yet it's not ideal either. This patch fixes this issue by assigning the correct numbers and resource mapping to `WriteVSHA2MSV_WorstCase`, which is equal to that of largest LMUL + _largest_ SEW (Zvknh's scheduling properties are special). I also added a MCA test to make sure we always pick up the correct worst case numbers for P600's scheduling model. Original issue was reported by @reidtatge
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -883,8 +883,16 @@ foreach mx = SchedMxList in {
   let Latency = 3, ReleaseAtCycles = [LMulLat] in {
     defm "" : LMULWriteResMX<"WriteVSHA2CHV", [SiFiveP400VEXQ0], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSHA2CLV", [SiFiveP400VEXQ0], mx, IsWorstCase>;
-    foreach sew = !listremove(SchedSEWSet<mx>.val, [8, 16]) in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP400VEXQ0], mx, sew, IsWorstCase>;
+    defvar ZvknhSEWs = !listremove(SchedSEWSet<mx>.val, [8, 16]);
+    // Largest SEW is the last element, assuming SchedSEWSet is sorted in ascending
+    // order.
+    defvar LargestZvknhSEW = !foldl(!head(ZvknhSEWs), ZvknhSEWs, last, curr, curr);
+    foreach sew = ZvknhSEWs in {
+      // The worst case for Zvknh[ab] is designated to the largest SEW and LMUL.
+      defvar IsWorstCaseVSHA2MSV = !and(IsWorstCase, !eq(sew, LargestZvknhSEW));
+      defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP400VEXQ0], mx, sew,
+                                     IsWorstCaseVSHA2MSV>;
+    }
   }
   // Zvkned
   let Latency = 2, ReleaseAtCycles = [LMulLat] in {
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -1135,9 +1135,16 @@ foreach mx = SchedMxList in {
   let Latency = 3, ReleaseAtCycles = [LMulLat] in {
     defm "" : LMULWriteResMX<"WriteVSHA2CHV", [SiFiveP600VectorCrypto], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVSHA2CLV", [SiFiveP600VectorCrypto], mx, IsWorstCase>;
-    foreach sew = !listremove(SchedSEWSet<mx>.val, [8, 16]) in {
+    defvar ZvknhSEWs = !listremove(SchedSEWSet<mx>.val, [8, 16]);
+    // Largest SEW is the last element, assuming SchedSEWSet is sorted in ascending
+    // order.
+    defvar LargestZvknhSEW = !foldl(!head(ZvknhSEWs), ZvknhSEWs, last, curr, curr);
+    foreach sew = ZvknhSEWs in {
+      // The worst case for Zvknh[ab] is designated to the largest SEW and LMUL.
+      defvar IsWorstCaseVSHA2MSV = !and(IsWorstCase, !eq(sew, LargestZvknhSEW));
       let ReleaseAtCycles = [SiFiveP600VSHA2MSCycles<mx, sew>.c] in
-      defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP600VectorCrypto], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP600VectorCrypto], mx, sew,
+                                     IsWorstCaseVSHA2MSV>;
     }
   }
   // Zvkned
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s
@@ -1,6 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
 
+# Worst case for vsha2ms should be that of LMUL=8 and SEW=64.
+vsha2ms.vv v4, v8, v12
+
 # SEW is only e32 or e64
 
 vsetvli zero, zero, e32, m1, tu, mu
@@ -44,14 +47,14 @@ vsha2ch.vv v8, v16, v24
 vsha2cl.vv v8, v16, v24
 
 # CHECK:      Iterations:        1
-# CHECK-NEXT: Instructions:      32
-# CHECK-NEXT: Total Cycles:      108
-# CHECK-NEXT: Total uOps:        32
+# CHECK-NEXT: Instructions:      33
+# CHECK-NEXT: Total Cycles:      119
+# CHECK-NEXT: Total uOps:        33
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.30
-# CHECK-NEXT: IPC:               0.30
-# CHECK-NEXT: Block RThroughput: 97.0
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 109.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -62,6 +65,7 @@ vsha2cl.vv v8, v16, v24
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     12.00                       vsha2ms.vv	v4, v8, v12
 # CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
 # CHECK-NEXT:  1      3     1.00                        vsha2ms.vv	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                        vsha2ch.vv	v4, v8, v12
@@ -115,10 +119,11 @@ vsha2cl.vv v8, v16, v24
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
-# CHECK-NEXT:  -      -      -      -     8.00    -      -      -      -      -      -     97.00   -      -      -      -
+# CHECK-NEXT:  -      -      -      -     8.00    -      -      -      -      -      -     109.00  -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     12.00   -      -      -      -     vsha2ms.vv	v4, v8, v12
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2ms.vv	v4, v8, v12
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2ch.vv	v4, v8, v12