From e1b02f21f461135c10bbb8304b2ada9d47681d5e Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 3 Dec 2021 23:26:54 +0100
Subject: [PATCH 1/2] [PhaseOrdering] Add test for incorrect merge function scheduling

Add an -enable-merge-functions option to allow testing of function
merging as it will actually happen in the optimization pipeline.

Based on that, add a test where we currently produce two identical
functions without merging them, due to incorrect pass scheduling under
the new pass manager.

(cherry picked from commit 5b94037a304eb88809f86d8b9235426976061009)
---
 llvm/lib/Passes/PassBuilder.cpp               |   6 +-
 .../PhaseOrdering/X86/merge-functions.ll      | 126 ++++++++++++++++++
 2 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll

diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0bc955dbbfea3..216dea3b8172b 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -278,6 +278,10 @@ static cl::opt<bool> EnableO3NonTrivialUnswitching(
     "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
     cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));
 
+static cl::opt<bool> EnableMergeFunctions(
+    "enable-merge-functions", cl::init(false), cl::Hidden,
+    cl::desc("Enable function merging as part of the optimization pipeline"));
+
 PipelineTuningOptions::PipelineTuningOptions() {
   LoopInterleaving = true;
   LoopVectorization = true;
@@ -287,7 +291,7 @@ PipelineTuningOptions::PipelineTuningOptions() {
   LicmMssaOptCap = SetLicmMssaOptCap;
   LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
   CallGraphProfile = true;
-  MergeFunctions = false;
+  MergeFunctions = EnableMergeFunctions;
 }
 
 namespace llvm {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
new file mode 100644
index 0000000000000..aa83692844240
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O2>' -enable-merge-functions -S < %s | FileCheck %s
+
+; TODO: These two functions should get merged, but currently aren't, because
+; the function merging pass is scheduled too early.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @test1(i32 %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20
+; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1
+; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0
+; CHECK-NEXT:    [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false
+; CHECK-NEXT:    ret i1 [[I_0]]
+;
+entry:
+  %_4 = alloca i8, align 1
+  %_3 = alloca i8, align 1
+  %_2 = alloca i8, align 1
+  %i = alloca i8, align 1
+  %i1 = icmp eq i32 %c, 115
+  br i1 %i1, label %bb10, label %bb11
+
+bb10:                                             ; preds = %entry
+  store i8 1, i8* %_4, align 1
+  br label %bb12
+
+bb11:                                             ; preds = %entry
+  %_6 = icmp eq i32 %c, 109
+  %i2 = zext i1 %_6 to i8
+  store i8 %i2, i8* %_4, align 1
+  br label %bb12
+
+bb12:                                             ; preds = %bb11, %bb10
+  %i3 = load i8, i8* %_4, align 1
+  %i4 = trunc i8 %i3 to i1
+  br i1 %i4, label %bb7, label %bb8
+
+bb8:                                              ; preds = %bb12
+  %_8 = icmp eq i32 %c, 104
+  %i5 = zext i1 %_8 to i8
+  store i8 %i5, i8* %_3, align 1
+  br label %bb9
+
+bb7:                                              ; preds = %bb12
+  store i8 1, i8* %_3, align 1
+  br label %bb9
+
+bb9:                                              ; preds = %bb7, %bb8
+  %i6 = load i8, i8* %_3, align 1
+  %i7 = trunc i8 %i6 to i1
+  br i1 %i7, label %bb4, label %bb5
+
+bb5:                                              ; preds = %bb9
+  %_10 = icmp eq i32 %c, 100
+  %i8 = zext i1 %_10 to i8
+  store i8 %i8, i8* %_2, align 1
+  br label %bb6
+
+bb4:                                              ; preds = %bb9
+  store i8 1, i8* %_2, align 1
+  br label %bb6
+
+bb6:                                              ; preds = %bb4, %bb5
+  %i9 = load i8, i8* %_2, align 1
+  %i10 = trunc i8 %i9 to i1
+  br i1 %i10, label %bb1, label %bb2
+
+bb2:                                              ; preds = %bb6
+  %_12 = icmp eq i32 %c, 119
+  %i11 = zext i1 %_12 to i8
+  store i8 %i11, i8* %i, align 1
+  br label %bb3
+
+bb1:                                              ; preds = %bb6
+  store i8 1, i8* %i, align 1
+  br label %bb3
+
+bb3:                                              ; preds = %bb1, %bb2
+  %i12 = load i8, i8* %i, align 1
+  %i13 = trunc i8 %i12 to i1
+  ret i1 %i13
+}
+
+define i1 @test2(i32 %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20
+; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1
+; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0
+; CHECK-NEXT:    [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false
+; CHECK-NEXT:    ret i1 [[I_0]]
+;
+entry:
+  %i = alloca i8, align 1
+  switch i32 %c, label %bb1 [
+    i32 115, label %bb2
+    i32 109, label %bb2
+    i32 104, label %bb2
+    i32 100, label %bb2
+    i32 119, label %bb2
+  ]
+
+bb1:                                              ; preds = %entry
+  store i8 0, i8* %i, align 1
+  br label %bb3
+
+bb2:                                              ; preds = %entry, %entry, %entry, %entry, %entry
+  store i8 1, i8* %i, align 1
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb1
+  %i1 = load i8, i8* %i, align 1
+  %i2 = trunc i8 %i1 to i1
+  ret i1 %i2
+}
+
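As a usage sketch (not part of the patch): the new -enable-merge-functions
flag only seeds PipelineTuningOptions::MergeFunctions, so embedders can opt
in programmatically as well. A minimal sketch against the new-PM API; the
helper name runDefaultO2WithMerging and the choice of O2 are illustrative:

    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/Analysis/LoopAnalysisManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"

    using namespace llvm;

    // Run the default O2 pipeline with function merging enabled, which is
    // what `opt -passes='default<O2>' -enable-merge-functions` now does.
    static void runDefaultO2WithMerging(Module &M) {
      PipelineTuningOptions PTO;
      PTO.MergeFunctions = true; // the knob the new cl::opt initializes

      PassBuilder PB(/*TM=*/nullptr, PTO);

      // Standard new-PM analysis manager boilerplate.
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      ModulePassManager MPM =
          PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
      MPM.run(M, MAM);
    }
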
From 4d82dcae064e8eb16bba6085319176fedf34a48a Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Sat, 4 Dec 2021 13:14:15 +0100
Subject: [PATCH 2/2] [NewPM] Fix MergeFunctions scheduling

MergeFunctions (as well as HotColdSplitting and IROutliner) are
incorrectly scheduled under the new pass manager. The code makes it
look like they run towards the end of the module optimization pipeline
(as they should), while in reality they run at the start. This is
because the OptimizePM function pass manager populated around them is
only added to the module pipeline later.

I'm fixing this by moving these three passes to after OptimizePM, to
avoid splitting the function pass pipeline. It doesn't seem important
to me that some of the function passes run after these late module
passes.

Differential Revision: https://reviews.llvm.org/D115098

(cherry picked from commit ae7f468073e448d9a5c39364a5cc4241525c4896)
---
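For context before the diff, the shape of the bug in
buildModuleOptimizationPipeline, reduced to a sketch (MergeFunctionsPass and
createModuleToFunctionPassAdaptor are the real names; the helper and
everything elided are illustrative):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/IPO/MergeFunctions.h"

    using namespace llvm;

    static ModulePassManager buildBuggyOrder(bool MergeFunctions) {
      ModulePassManager MPM;
      FunctionPassManager OptimizePM;
      // ... OptimizePM is populated with the late function simplification
      // and vectorization passes ...

      // Reads as if it runs late, but it is queued on MPM *before*
      // OptimizePM is, so it actually runs at the start.
      if (MergeFunctions)
        MPM.addPass(MergeFunctionsPass());

      // Only here does OptimizePM become part of MPM; every function pass
      // queued above therefore runs after MergeFunctions.
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
      return MPM;
    }
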
 llvm/lib/Passes/PassBuilder.cpp               | 34 +++++++++----------
 .../PhaseOrdering/X86/merge-functions.ll      | 11 ++----
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 216dea3b8172b..eaf4108c64db9 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1420,23 +1420,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 
   addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
 
-  // Split out cold code. Splitting is done late to avoid hiding context from
-  // other optimizations and inadvertently regressing performance. The tradeoff
-  // is that this has a higher code size cost than splitting early.
-  if (EnableHotColdSplit && !LTOPreLink)
-    MPM.addPass(HotColdSplittingPass());
-
-  // Search the code for similar regions of code. If enough similar regions can
-  // be found where extracting the regions into their own function will decrease
-  // the size of the program, we extract the regions, a deduplicate the
-  // structurally similar regions.
-  if (EnableIROutliner)
-    MPM.addPass(IROutlinerPass());
-
-  // Merge functions if requested.
-  if (PTO.MergeFunctions)
-    MPM.addPass(MergeFunctionsPass());
-
   // LoopSink pass sinks instructions hoisted by LICM, which serves as a
   // canonicalization pass that enables other optimizations. As a result,
   // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
@@ -1463,6 +1446,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   for (auto &C : OptimizerLastEPCallbacks)
     C(MPM, Level);
 
+  // Split out cold code. Splitting is done late to avoid hiding context from
+  // other optimizations and inadvertently regressing performance. The tradeoff
+  // is that this has a higher code size cost than splitting early.
+  if (EnableHotColdSplit && !LTOPreLink)
+    MPM.addPass(HotColdSplittingPass());
+
+  // Search the code for similar regions of code. If enough similar regions can
+  // be found where extracting the regions into their own function will decrease
+  // the size of the program, we extract the regions, and deduplicate the
+  // structurally similar regions.
+  if (EnableIROutliner)
+    MPM.addPass(IROutlinerPass());
+
+  // Merge functions if requested.
+  if (PTO.MergeFunctions)
+    MPM.addPass(MergeFunctionsPass());
+
   if (PTO.CallGraphProfile)
     MPM.addPass(CGProfilePass());
 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
index aa83692844240..39cd34a98a002 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
@@ -90,15 +90,8 @@ bb3: ; preds = %bb1, %bb2
 
 define i1 @test2(i32 %c) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SWITCH_TABLEIDX:%.*]] = add i32 [[C:%.*]], -100
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20
-; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20
-; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]]
-; CHECK-NEXT:    [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1
-; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0
-; CHECK-NEXT:    [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false
-; CHECK-NEXT:    ret i1 [[I_0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
 entry:
   %i = alloca i8, align 1