Enable Local CSE by default

Reduce the default number of iterations to 1
Put the optional code behind the -lcse-no-ssa flag, which is disabled by
default. This brings down the overhead of enabling this to about 2%.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/2185193002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 28ae017..b6ab31e 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -514,19 +514,23 @@
   dump("After basic block shuffling");
 }
 
-void Cfg::localCSE() {
+void Cfg::localCSE(bool AssumeSSA) {
   // Performs basic-block local common-subexpression elimination
   // If we have
   // t1 = op b c
   // t2 = op b c
   // This pass will replace future references to t2 in a basic block by t1
   // Points to note:
-  // 1. Does not assume SSA, but not tested on non-SSA input yet as it is run
-  //    at the beginning.
+  // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
+  //      This is needed if this pass is moved to a point later in the pipeline.
+  //      If variables have a single definition (in the node), CSE can work just
+  //      on the basis of an equality compare on instructions (sans Dest). When
+  //      variables can be updated (hence, non-SSA) the result of a previous
+  //      instruction which used that variable as an operand can not be reused.
   // 2. Leaves removal of instructions to DCE.
   // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
   //    to take care of cases not arising from GEP simplification.
-  // 4. By default, two passes are made over each basic block. Control this
+  // 4. By default, a single pass is made over each basic block. Control this
   //    with -lcse-max-iters=N
 
   TimerMarker T(TimerStack::TT_localCse, this);
@@ -575,41 +579,45 @@
 
   for (CfgNode *Node : getNodes()) {
     CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
+    // Stores currently available instructions.
 
     CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
     // Combining the above two into a single data structure might consume less
     // memory but will be slower i.e map of Instruction -> Set of Variables
 
     CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
-    // Not necessary for SSA, still keeping it in case this pass is not run at
-    // the beginning. Remove to improve performace.
+    // Maps a variable to the Instructions that depend on it.
+    // a = op1 b c
+    // x = op2 c d
+    // Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
+    // Not necessary for SSA as dependencies will never be invalidated, and the
+    // container will use minimal memory when left unused.
 
-    int IterCount = getFlags().getLocalCseMaxIterations();
+    auto IterCount = getFlags().getLocalCseMaxIterations();
 
-    while (IterCount--) {
-      // TODO : Stats on IterCount -> performance
+    for (uint32_t i = 0; i < IterCount; ++i) {
+      // TODO(manasijm): Stats on IterCount -> performance
       for (Inst &Instr : Node->getInsts()) {
         if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
           continue;
+        if (!AssumeSSA) {
+          // Invalidate replacements
+          auto Iter = Replacements.find(Instr.getDest());
+          if (Iter != Replacements.end()) {
+            Replacements.erase(Iter);
+          }
 
-        // Invalidate replacements
-        auto Iter = Replacements.find(Instr.getDest());
-        if (Iter != Replacements.end()) {
-          Replacements.erase(Iter);
-        }
-
-        // Invalidate 'seen' instructions whose operands were just updated.
-        auto DepIter = Dependency.find(Instr.getDest());
-        if (DepIter != Dependency.end()) {
-          for (auto DepInst : DepIter->second) {
-            Seen.erase(DepInst);
+          // Invalidate 'seen' instructions whose operands were just updated.
+          auto DepIter = Dependency.find(Instr.getDest());
+          if (DepIter != Dependency.end()) {
+            for (auto *DepInst : DepIter->second) {
+              Seen.erase(DepInst);
+            }
           }
         }
-        // The above two can be removed if SSA is assumed.
 
         // Replace - doing this before checking for repetitions might enable
-        // more
-        // optimizations
+        // more optimizations
         for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
           auto *Opnd = Instr.getSrc(i);
           if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
@@ -627,11 +635,13 @@
         } else { // new
           Seen.insert(&Instr);
 
-          // Update dependencies
-          for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
-            auto *Opnd = Instr.getSrc(i);
-            if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
-              Dependency[Var].push_back(&Instr);
+          if (!AssumeSSA) {
+            // Update dependencies
+            for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+              auto *Opnd = Instr.getSrc(i);
+              if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
+                Dependency[Var].push_back(&Instr);
+              }
             }
           }
         }
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 02e8394..0c37823 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -201,7 +201,7 @@
   void advancedPhiLowering();
   void reorderNodes();
   void shuffleNodes();
-  void localCSE();
+  void localCSE(bool AssumeSSA);
   void shortCircuitJumps();
   void loopInvariantCodeMotion();
 
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index 2af1ee4..0281e7d 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -140,9 +140,14 @@
              "information to stdout at the end of program execution."),        \
     cl::init(false))                                                           \
                                                                                \
-  X(EnableExperimental, bool, dev_opt_flag, "enable-experimental",             \
-    cl::desc("Enable Optimizations not yet part of O2"),                       \
-    cl::init(false))                                                           \
+  X(LocalCSE, Ice::LCSEOptions, dev_opt_flag, "lcse",                          \
+    cl::desc("Local common subexpression elimination"),                        \
+    cl::init(Ice::LCSE_EnabledSSA),                                            \
+    cl::values(                                                                \
+      clEnumValN(Ice::LCSE_Disabled, "0", "disabled"),                         \
+      clEnumValN(Ice::LCSE_EnabledSSA, "enabled", "assume-ssa"),               \
+      clEnumValN(Ice::LCSE_EnabledNoSSA, "no-ssa", "no-assume-ssa"),           \
+      clEnumValEnd))                                                           \
                                                                                \
   X(EnablePhiEdgeSplit, bool, dev_opt_flag, "phi-edge-split",                  \
     cl::desc("Enable edge splitting for Phi lowering"), cl::init(true))        \
@@ -186,8 +191,8 @@
              "building LLVM IR first"),                                        \
     cl::init(false))                                                           \
                                                                                \
-   X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters",               \
-    cl::desc("Number of times local-cse is run on a block"), cl::init(2))      \
+   X(LocalCseMaxIterations, uint32_t, dev_opt_flag, "lcse-max-iters",          \
+    cl::desc("Number of times local-cse is run on a block"), cl::init(1))      \
                                                                                \
   X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm",                       \
     cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false))   \
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 2c09ba9..c78af0f 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -308,6 +308,12 @@
   Liveness_Intervals
 };
 
+enum LCSEOptions {
+  LCSE_Disabled,
+  LCSE_EnabledSSA,  // Default Mode, assumes SSA.
+  LCSE_EnabledNoSSA // Does not assume SSA, to be enabled if CSE is done later.
+};
+
 enum RegAllocKind {
   RAK_Unknown,
   RAK_Global,       /// full, global register allocation
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index ecb4ae6..27fa85f 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -457,8 +457,8 @@
     Func->dump("After LICM");
   }
 
-  if (getFlags().getEnableExperimental()) {
-    Func->localCSE();
+  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
+    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
     Func->dump("After Local CSE");
   }
   if (getFlags().getEnableShortCircuit()) {
diff --git a/tests_lit/llvm2ice_tests/local-cse.ll b/tests_lit/llvm2ice_tests/local-cse.ll
index fae1816..b99c8f4 100644
--- a/tests_lit/llvm2ice_tests/local-cse.ll
+++ b/tests_lit/llvm2ice_tests/local-cse.ll
@@ -1,18 +1,18 @@
 ; Tests local-cse on x8632 and x8664
 
 ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
-; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8632 \
+; RUN: -O2 | FileCheck --check-prefix=X8632 \
 ; RUN: --check-prefix=X8632EXP %s
 
 ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
-; RUN: -O2 | FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s
+; RUN: -O2 -lcse=0| FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s
 
 ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
-; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8664 \
+; RUN: -O2 | FileCheck --check-prefix=X8664 \
 ; RUN: --check-prefix=X8664EXP %s
 
 ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
-; RUN: -O2 | FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s
+; RUN: -O2 -lcse=0| FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s
 
 
 define internal i32 @local_cse_test(i32 %a, i32 %b) {