Add -reorder-basic-blocks option and fix nop insertion

1. Basic block reordering can be enabled with -reorder-basic-blocks option enabled.

Blocks will be sorted according to the Reversed Post Traversal Order, but the next
node to visit among all candidate children nodes is selected 'randomly'.

Example:
  A
 / \
B   C
 \ /
  D

This CFG can generate two possible layouts:
A-B-C-D or A-C-B-D

2. Fix nop insetion

Add checks to avoiding insertions in empty basic blocks(dead blocks) and bundle locked instructions.

BUG=
R=jpp@chromium.org, jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1255303004.
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index ef49c15..c19d01a 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -371,6 +371,51 @@
   assert(Nodes.size() == OldSize);
 }
 
+namespace {
+void getRandomPostOrder(CfgNode *Node, llvm::BitVector &ToVisit,
+                        Ice::NodeList &PostOrder,
+                        Ice::RandomNumberGenerator *RNG) {
+  assert(ToVisit[Node->getIndex()]);
+  ToVisit[Node->getIndex()] = false;
+  NodeList Outs = Node->getOutEdges();
+  Ice::RandomShuffle(Outs.begin(), Outs.end(),
+                     [RNG](int N) { return RNG->next(N); });
+  for (CfgNode *Next : Outs) {
+    if (ToVisit[Next->getIndex()])
+      getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
+  }
+  PostOrder.push_back(Node);
+}
+} // end of anonymous namespace
+
+void Cfg::shuffleNodes() {
+  if (!Ctx->getFlags().shouldReorderBasicBlocks())
+    return;
+
+  NodeList ReversedReachable;
+  NodeList Unreachable;
+  llvm::BitVector ToVisit(Nodes.size(), true);
+  // Traverse from entry node.
+  getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable,
+                     &Ctx->getRNG());
+  // Collect the unreachable nodes.
+  for (CfgNode *Node : Nodes)
+    if (ToVisit[Node->getIndex()])
+      Unreachable.push_back(Node);
+  // Copy the layout list to the Nodes.
+  SizeT OldSize = Nodes.size();
+  (void)OldSize;
+  Nodes.clear();
+  for (CfgNode *Node : reverse_range(ReversedReachable))
+    Nodes.emplace_back(Node);
+  for (CfgNode *Node : Unreachable) {
+    Nodes.emplace_back(Node);
+  }
+  assert(Nodes.size() == OldSize);
+
+  dump("After basic block shuffling");
+}
+
 void Cfg::doArgLowering() {
   TimerMarker T(TimerStack::TT_doArgLowering, this);
   getTarget()->lowerArguments();
@@ -383,6 +428,8 @@
 }
 
 void Cfg::doNopInsertion() {
+  if (!Ctx->getFlags().shouldDoNopInsertion())
+    return;
   TimerMarker T(TimerStack::TT_doNopInsertion, this);
   for (CfgNode *Node : Nodes)
     Node->doNopInsertion();
diff --git a/src/IceCfg.h b/src/IceCfg.h
index f2b5295..499e226 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -180,6 +180,7 @@
   void deletePhis();
   void advancedPhiLowering();
   void reorderNodes();
+  void shuffleNodes();
   void doAddressOpt();
   void doArgLowering();
   void doNopInsertion();
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 78063e3..cae5d37 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -500,18 +500,22 @@
   TargetLowering *Target = Func->getTarget();
   LoweringContext &Context = Target->getContext();
   Context.init(this);
+  Context.setInsertPoint(Context.getCur());
+  // Do not insert nop in bundle locked instructions.
+  bool PauseNopInsertion = false;
   while (!Context.atEnd()) {
-    Target->doNopInsertion();
+    if (llvm::isa<InstBundleLock>(Context.getCur())) {
+      PauseNopInsertion = true;
+    } else if (llvm::isa<InstBundleUnlock>(Context.getCur())) {
+      PauseNopInsertion = false;
+    }
+    if (!PauseNopInsertion)
+      Target->doNopInsertion();
     // Ensure Cur=Next, so that the nops are inserted before the current
     // instruction rather than after.
-    Context.advanceNext();
     Context.advanceCur();
+    Context.advanceNext();
   }
-  // Insert before all instructions.
-  Context.setInsertPoint(getInsts().begin());
-  Context.advanceNext();
-  Context.advanceCur();
-  Target->doNopInsertion();
 }
 
 // Drives the target lowering.  Passes the current instruction and the
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 5a4abe6..5066a4b 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -284,6 +284,12 @@
     cl::desc("The threshold for immediates randomization and pooling"),
     cl::init(0xffff));
 
+// Command line option for turning on basic block shuffling.
+cl::opt<bool> ReorderBasicBlocks(
+    "reorder-basic-blocks",
+    cl::desc("Shuffle the layout of basic blocks in each functions"),
+    cl::init(false));
+
 // Command line option for turning on function layout reordering.
 cl::opt<bool> ReorderFunctions(
     "reorder-functions",
@@ -341,6 +347,7 @@
   OutFlags.PhiEdgeSplit = false;
   OutFlags.RandomNopInsertion = false;
   OutFlags.RandomRegAlloc = false;
+  OutFlags.ReorderBasicBlocks = false;
   OutFlags.ReorderFunctions = false;
   OutFlags.ReorderGlobalVariables = false;
   OutFlags.ReorderPooledConstants = false;
@@ -398,8 +405,17 @@
   OutFlags.setOptLevel(::OLevel);
   OutFlags.setPhiEdgeSplit(::EnablePhiEdgeSplit);
   OutFlags.setRandomSeed(::RandomSeed);
+  OutFlags.setRandomizeAndPoolImmediatesOption(
+      ::RandomizeAndPoolImmediatesOption);
+  OutFlags.setRandomizeAndPoolImmediatesThreshold(
+      ::RandomizeAndPoolImmediatesThreshold);
+  OutFlags.setReorderFunctionsWindowSize(::ReorderFunctionsWindowSize);
+  OutFlags.setShouldReorderBasicBlocks(::ReorderBasicBlocks);
   OutFlags.setShouldDoNopInsertion(::ShouldDoNopInsertion);
   OutFlags.setShouldRandomizeRegAlloc(::RandomizeRegisterAllocation);
+  OutFlags.setShouldReorderFunctions(::ReorderFunctions);
+  OutFlags.setShouldReorderGlobalVariables(::ReorderGlobalVariables);
+  OutFlags.setShouldReorderPooledConstants(::ReorderPooledConstants);
   OutFlags.setSkipUnimplemented(::SkipUnimplemented);
   OutFlags.setSubzeroTimingEnabled(::SubzeroTimingEnabled);
   OutFlags.setTargetArch(::TargetArch);
@@ -414,22 +430,6 @@
   OutFlags.setMaxNopsPerInstruction(::MaxNopsPerInstruction);
   OutFlags.setNopProbabilityAsPercentage(::NopProbabilityAsPercentage);
   OutFlags.setVerbose(VMask);
-
-  // Set for immediates randomization or pooling option.
-  OutFlags.setRandomizeAndPoolImmediatesOption(
-      ::RandomizeAndPoolImmediatesOption);
-  OutFlags.setRandomizeAndPoolImmediatesThreshold(
-      ::RandomizeAndPoolImmediatesThreshold);
-
-  // Set for function reordering options.
-  OutFlags.setShouldReorderFunctions(::ReorderFunctions);
-  OutFlags.setReorderFunctionsWindowSize(::ReorderFunctionsWindowSize);
-
-  // Set for global variable reordering option.
-  OutFlags.setShouldReorderGlobalVariables(::ReorderGlobalVariables);
-
-  // Set for pooled constant reordering option.
-  OutFlags.setShouldReorderPooledConstants(::ReorderPooledConstants);
 }
 
 void ClFlags::getParsedClFlagsExtra(ClFlagsExtra &OutFlagsExtra) {
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 549bf16..244790b 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -153,6 +153,11 @@
     return RandomizeAndPoolImmediatesThreshold;
   }
 
+  bool shouldReorderBasicBlocks() const { return ReorderBasicBlocks; }
+  void setShouldReorderBasicBlocks(bool NewValue) {
+    ReorderBasicBlocks = NewValue;
+  }
+
   void setShouldReorderFunctions(bool Option) { ReorderFunctions = Option; }
   bool shouldReorderFunctions() const { return ReorderFunctions; }
 
@@ -229,6 +234,7 @@
   bool PhiEdgeSplit;
   bool RandomNopInsertion;
   bool RandomRegAlloc;
+  bool ReorderBasicBlocks;
   bool ReorderFunctions;
   bool ReorderGlobalVariables;
   bool ReorderPooledConstants;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index b4c4ea3..3c7ad5e 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -400,6 +400,9 @@
   Func->contractEmptyNodes();
   Func->reorderNodes();
 
+  // Shuffle basic block order if -reorder-basic-blocks is enabled.
+  Func->shuffleNodes();
+
   // Branch optimization.  This needs to be done just before code emission.  In
   // particular, no transformations that insert or reorder CfgNodes should be
   // done after branch optimization.  We go ahead and do it before nop insertion
@@ -407,9 +410,8 @@
   Func->doBranchOpt();
   Func->dump("After branch optimization");
 
-  // Nop insertion
-  if (Ctx->getFlags().shouldDoNopInsertion())
-    Func->doNopInsertion();
+  // Nop insertion if -nop-insertion is enabled.
+  Func->doNopInsertion();
 
   // Mark nodes that require sandbox alignment
   if (Ctx->getFlags().getUseSandboxing())
@@ -446,10 +448,11 @@
     return;
   Func->dump("After stack frame mapping");
 
-  // Nop insertion
-  if (Ctx->getFlags().shouldDoNopInsertion()) {
-    Func->doNopInsertion();
-  }
+  // Shuffle basic block order if -reorder-basic-blocks is enabled.
+  Func->shuffleNodes();
+
+  // Nop insertion if -nop-insertion is enabled.
+  Func->doNopInsertion();
 
   // Mark nodes that require sandbox alignment
   if (Ctx->getFlags().getUseSandboxing())
diff --git a/tests_lit/llvm2ice_tests/nop-insertion.ll b/tests_lit/llvm2ice_tests/nop-insertion.ll
index 197ddd3..160d933 100644
--- a/tests_lit/llvm2ice_tests/nop-insertion.ll
+++ b/tests_lit/llvm2ice_tests/nop-insertion.ll
@@ -4,6 +4,7 @@
 
 ; Use filetype=asm because this currently depends on the # variant
 ; assembler comment.
+
 ; RUN: %p2i -i %s --filetype=asm -a -sz-seed=1 -nop-insertion \
 ; RUN:    -nop-insertion-percentage=50 -max-nops-per-instruction=1 \
 ; RUN:    | FileCheck %s --check-prefix=PROB50
@@ -13,88 +14,121 @@
 ; RUN: %p2i -i %s --filetype=asm -a -sz-seed=1 -nop-insertion \
 ; RUN:    -nop-insertion-percentage=50 -max-nops-per-instruction=2 \
 ; RUN:    | FileCheck %s --check-prefix=MAXNOPS2
+; RUN: %p2i -i %s --filetype=asm -a -sz-seed=1 -nop-insertion -sandbox\
+; RUN:    -nop-insertion-percentage=50 -max-nops-per-instruction=1 \
+; RUN:    | FileCheck %s --check-prefix=SANDBOX50
+
 
 define <4 x i32> @mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 entry:
   %res = mul <4 x i32> %a, %b
   ret <4 x i32> %res
+
 ; PROB50-LABEL: mul_v4i32
-; PROB50: nop # variant = 3
+; PROB50: nop # variant = 4
 ; PROB50: subl $60, %esp
-; PROB50: nop # variant = 4
 ; PROB50: movups %xmm0, 32(%esp)
-; PROB50: movups %xmm1, 16(%esp)
 ; PROB50: nop # variant = 0
-; PROB50: movups 32(%esp), %xmm0
+; PROB50: movups %xmm1, 16(%esp)
 ; PROB50: nop # variant = 4
+; PROB50: movups 32(%esp), %xmm0
 ; PROB50: pshufd $49, 32(%esp), %xmm1
 ; PROB50: pshufd $49, 16(%esp), %xmm2
 ; PROB50: pmuludq 16(%esp), %xmm0
-; PROB50: pmuludq %xmm2, %xmm1
 ; PROB50: nop # variant = 0
+; PROB50: pmuludq %xmm2, %xmm1
 ; PROB50: shufps $136, %xmm1, %xmm0
-; PROB50: pshufd $216, %xmm0, %xmm0
 ; PROB50: nop # variant = 2
+; PROB50: pshufd $216, %xmm0, %xmm0
 ; PROB50: movups %xmm0, (%esp)
 ; PROB50: movups (%esp), %xmm0
-; PROB50: addl $60, %esp
 ; PROB50: nop # variant = 0
+; PROB50: addl $60, %esp
+; PROB50: nop # variant = 3
 ; PROB50: ret
 
 ; PROB90-LABEL: mul_v4i32
-; PROB90: nop # variant = 3
+; PROB90: nop # variant = 4
 ; PROB90: subl $60, %esp
-; PROB90: nop # variant = 4
+; PROB90: nop # variant = 3
 ; PROB90: movups %xmm0, 32(%esp)
-; PROB90: nop # variant = 3
+; PROB90: nop # variant = 2
 ; PROB90: movups %xmm1, 16(%esp)
-; PROB90: nop # variant = 2
+; PROB90: nop # variant = 3
 ; PROB90: movups 32(%esp), %xmm0
-; PROB90: nop # variant = 3
+; PROB90: nop # variant = 4
 ; PROB90: pshufd $49, 32(%esp), %xmm1
-; PROB90: nop # variant = 4
-; PROB90: pshufd $49, 16(%esp), %xmm2
 ; PROB90: nop # variant = 0
+; PROB90: pshufd $49, 16(%esp), %xmm2
+; PROB90: nop # variant = 2
 ; PROB90: pmuludq 16(%esp), %xmm0
-; PROB90: nop # variant = 2
-; PROB90: pmuludq %xmm2, %xmm1
 ; PROB90: nop # variant = 3
+; PROB90: pmuludq %xmm2, %xmm1
+; PROB90: nop # variant = 4
 ; PROB90: shufps $136, %xmm1, %xmm0
-; PROB90: nop # variant = 4
+; PROB90: nop # variant = 2
 ; PROB90: pshufd $216, %xmm0, %xmm0
-; PROB90: nop # variant = 2
-; PROB90: movups %xmm0, (%esp)
 ; PROB90: nop # variant = 4
-; PROB90: movups (%esp), %xmm0
+; PROB90: movups %xmm0, (%esp)
 ; PROB90: nop # variant = 2
+; PROB90: movups (%esp), %xmm0
+; PROB90: nop # variant = 3
 ; PROB90: addl $60, %esp
 ; PROB90: nop # variant = 3
 ; PROB90: ret
 
 ; MAXNOPS2-LABEL: mul_v4i32
+; MAXNOPS2: nop # variant = 4
 ; MAXNOPS2: subl $60, %esp
+; MAXNOPS2: nop # variant = 0
 ; MAXNOPS2: nop # variant = 4
 ; MAXNOPS2: movups %xmm0, 32(%esp)
-; MAXNOPS2: nop # variant = 0
-; MAXNOPS2: nop # variant = 4
 ; MAXNOPS2: movups %xmm1, 16(%esp)
-; MAXNOPS2: movups 32(%esp), %xmm0
 ; MAXNOPS2: nop # variant = 0
-; MAXNOPS2: pshufd $49, 32(%esp), %xmm1
+; MAXNOPS2: movups 32(%esp), %xmm0
 ; MAXNOPS2: nop # variant = 2
+; MAXNOPS2: pshufd $49, 32(%esp), %xmm1
 ; MAXNOPS2: pshufd $49, 16(%esp), %xmm2
-; MAXNOPS2: pmuludq 16(%esp), %xmm0
 ; MAXNOPS2: nop # variant = 0
 ; MAXNOPS2: nop # variant = 3
+; MAXNOPS2: pmuludq 16(%esp), %xmm0
 ; MAXNOPS2: pmuludq %xmm2, %xmm1
 ; MAXNOPS2: shufps $136, %xmm1, %xmm0
-; MAXNOPS2: pshufd $216, %xmm0, %xmm0
 ; MAXNOPS2: nop # variant = 3
-; MAXNOPS2: movups %xmm0, (%esp)
+; MAXNOPS2: pshufd $216, %xmm0, %xmm0
 ; MAXNOPS2: nop # variant = 0
-; MAXNOPS2: movups (%esp), %xmm0
+; MAXNOPS2: movups %xmm0, (%esp)
 ; MAXNOPS2: nop # variant = 2
-; MAXNOPS2: addl $60, %esp
+; MAXNOPS2: movups (%esp), %xmm0
 ; MAXNOPS2: nop # variant = 4
+; MAXNOPS2: addl $60, %esp
 ; MAXNOPS2: ret
+
+; SANDBOX50-LABEL: mul_v4i32
+; SANDBOX50: nop # variant = 4
+; SANDBOX50: subl $60, %esp
+; SANDBOX50: movups %xmm0, 32(%esp)
+; SANDBOX50: nop # variant = 0
+; SANDBOX50: movups %xmm1, 16(%esp)
+; SANDBOX50: nop # variant = 4
+; SANDBOX50: movups 32(%esp), %xmm0
+; SANDBOX50: pshufd $49, 32(%esp), %xmm1
+; SANDBOX50: pshufd $49, 16(%esp), %xmm2
+; SANDBOX50: pmuludq 16(%esp), %xmm0
+; SANDBOX50: nop # variant = 0
+; SANDBOX50: pmuludq %xmm2, %xmm1
+; SANDBOX50: shufps $136, %xmm1, %xmm0
+; SANDBOX50: nop # variant = 2
+; SANDBOX50: pshufd $216, %xmm0, %xmm0
+; SANDBOX50: movups %xmm0, (%esp)
+; SANDBOX50: movups (%esp), %xmm0
+; SANDBOX50: nop # variant = 0
+; SANDBOX50: addl $60, %esp
+; SANDBOX50: nop # variant = 3
+; SANDBOX50: pop %ecx
+; SANDBOX50: .bundle_lock
+; SANDBOX50: andl $-32, %ecx
+; SANDBOX50: jmp *%ecx
+; SANDBOX50: .bundle_unlock
+
 }
diff --git a/tests_lit/llvm2ice_tests/reorder-basic-blocks.ll b/tests_lit/llvm2ice_tests/reorder-basic-blocks.ll
new file mode 100644
index 0000000..9df88eb
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/reorder-basic-blocks.ll
@@ -0,0 +1,41 @@
+; Trivial smoke test of basic block reordering. Different random seeds should
+; generate different basic block layout.
+; REQUIRES allow_dump
+
+; RUN: %p2i -i %s --filetype=asm --args -O2 -sz-seed=1 \
+; RUN: -reorder-basic-blocks -threads=0 \
+; RUN: | FileCheck %s --check-prefix=SEED1
+; RUN: %p2i -i %s --filetype=asm --args -O2 -sz-seed=2 \
+; RUN: -reorder-basic-blocks -threads=0 \
+; RUN: | FileCheck %s --check-prefix=SEED2
+
+define void @basic_block_reordering(i32 %foo, i32 %bar) {
+entry:
+  %r1 = icmp eq i32 %foo, %bar
+  br i1 %r1, label %BB1, label %BB2
+BB1:
+  %r2 = icmp sgt i32 %foo, %bar
+  br i1 %r2, label %BB3, label %BB4
+BB2:
+  %r3 = icmp slt i32 %foo, %bar
+  br i1 %r3, label %BB3, label %BB4
+BB3:
+  ret void
+BB4:
+  ret void
+
+
+; SEED1-LABEL: basic_block_reordering:
+; SEED1: .Lbasic_block_reordering$entry:
+; SEED1: .Lbasic_block_reordering$BB1:
+; SEED1: .Lbasic_block_reordering$BB2:
+; SEED1: .Lbasic_block_reordering$BB3:
+; SEED1: .Lbasic_block_reordering$BB4:
+
+; SEED2-LABEL: basic_block_reordering:
+; SEED2: .Lbasic_block_reordering$entry:
+; SEED2: .Lbasic_block_reordering$BB2:
+; SEED2: .Lbasic_block_reordering$BB1:
+; SEED2: .Lbasic_block_reordering$BB4:
+; SEED2: .Lbasic_block_reordering$BB3
+}