Subzero: Initial implementation of BB Local CSE

Adds Cfg::localCse for basic-block local common-subexpression elimination
If we have
    t1 = op b c
    t2 = op b c
This pass will replace future uses of t2 in a basic block by t1.

To enable, use -enable-experimental in O2

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1997443002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index f717379..8f6a2ff 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -507,6 +507,132 @@
   dump("After basic block shuffling");
 }
 
+void Cfg::localCSE() {
+  // Performs basic-block local common-subexpression elimination
+  // If we have
+  // t1 = op b c
+  // t2 = op b c
+  // This pass will replace future references to t2 in a basic block by t1
+  // Points to note:
+  // 1. Does not assume SSA, but not tested on non-SSA input yet as it is run
+  //    at the beginning.
+  // 2. Leaves removal of instructions to DCE.
+  // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
+  //    to take care of cases not arising from GEP simplification.
+  // 4. By default, two passes are made over each basic block. Control this
+  //    with -lcse-max-iters=N
+
+  TimerMarker T(TimerStack::TT_localCse, this);
+  struct VariableHash {
+    size_t operator()(const Variable *Var) const { return Var->hashValue(); }
+  };
+
+  struct InstHash {
+    size_t operator()(const Inst *Instr) const {
+      auto Kind = Instr->getKind();
+      auto Result =
+          std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
+              Kind);
+      for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
+        Result ^= Instr->getSrc(i)->hashValue();
+      }
+      return Result;
+    }
+  };
+  struct InstEq {
+    bool srcEq(const Operand *A, const Operand *B) const {
+      if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
+        return (A == B);
+      return false;
+    }
+    bool operator()(const Inst *InstrA, const Inst *InstrB) const {
+      if ((InstrA->getKind() != InstrB->getKind()) ||
+          (InstrA->getSrcSize() != InstrB->getSrcSize()))
+        return false;
+
+      if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
+        auto *B = llvm::cast<InstArithmetic>(InstrB);
+        // A, B are guaranteed to be of the same 'kind' at this point
+        // So, dyn_cast is not needed
+        if (A->getOp() != B->getOp())
+          return false;
+      }
+      // Does not enter loop if different kind or number of operands
+      for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
+        if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
+          return false;
+      }
+      return true;
+    }
+  };
+
+  for (CfgNode *Node : getNodes()) {
+    CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
+
+    CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
+    // Combining the above two into a single data structure might consume less
+    // memory but will be slower i.e map of Instruction -> Set of Variables
+
+    CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
+    // Not necessary for SSA, still keeping it in case this pass is not run at
+    // the beginning. Remove to improve performace.
+
+    int IterCount = getFlags().getLocalCseMaxIterations();
+
+    while (IterCount--) {
+      // TODO : Stats on IterCount -> performance
+      for (Inst &Instr : Node->getInsts()) {
+        if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
+          continue;
+
+        // Invalidate replacements
+        auto Iter = Replacements.find(Instr.getDest());
+        if (Iter != Replacements.end()) {
+          Replacements.erase(Iter);
+        }
+
+        // Invalidate 'seen' instructions whose operands were just updated.
+        auto DepIter = Dependency.find(Instr.getDest());
+        if (DepIter != Dependency.end()) {
+          for (auto DepInst : DepIter->second) {
+            Seen.erase(DepInst);
+          }
+        }
+        // The above two can be removed if SSA is assumed.
+
+        // Replace - doing this before checking for repetitions might enable
+        // more
+        // optimizations
+        for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+          auto *Opnd = Instr.getSrc(i);
+          if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
+            if (Replacements.find(Var) != Replacements.end()) {
+              Instr.replaceSource(i, Replacements[Var]);
+            }
+          }
+        }
+
+        // Check for repetitions
+        auto SeenIter = Seen.find(&Instr);
+        if (SeenIter != Seen.end()) { // seen before
+          const Inst *Found = *SeenIter;
+          Replacements[Instr.getDest()] = Found->getDest();
+        } else { // new
+          Seen.insert(&Instr);
+
+          // Update dependencies
+          for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+            auto *Opnd = Instr.getSrc(i);
+            if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
+              Dependency[Var].push_back(&Instr);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 void Cfg::doArgLowering() {
   TimerMarker T(TimerStack::TT_doArgLowering, this);
   getTarget()->lowerArguments();
diff --git a/src/IceCfg.h b/src/IceCfg.h
index b85da28..e3f29c7 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -200,6 +200,7 @@
   void advancedPhiLowering();
   void reorderNodes();
   void shuffleNodes();
+  void localCSE();
 
   /// Scan allocas to determine whether we need to use a frame pointer.
   /// If SortAndCombine == true, merge all the fixed-size allocas in the
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index e9d0822..5059989 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -140,6 +140,10 @@
              "information to stdout at the end of program execution."),        \
     cl::init(false))                                                           \
                                                                                \
+  X(EnableExperimental, bool, dev_opt_flag, "enable-experimental",   \
+    cl::desc("Enable Optimizations not yet part of O2"),                       \
+    cl::init(false))                                                           \
+                                                                               \
   X(EnablePhiEdgeSplit, bool, dev_opt_flag, "phi-edge-split",                  \
     cl::desc("Enable edge splitting for Phi lowering"), cl::init(true))        \
                                                                                \
@@ -176,6 +180,9 @@
              "building LLVM IR first"),                                        \
     cl::init(false))                                                           \
                                                                                \
+   X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters",               \
+    cl::desc("Number of times local-cse is run on a block"), cl::init(2))      \
+                                                                               \
   X(LogFilename, std::string, dev_opt_flag, "log",                             \
     cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename"))   \
                                                                                \
diff --git a/src/IceInst.h b/src/IceInst.h
index 36b8810..89f308d 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -108,6 +108,13 @@
     assert(I < getSrcSize());
     return Srcs[I];
   }
+  void replaceSource(SizeT Index, Operand *Replacement) {
+    assert(Index < NumSrcs);
+    assert(!isDeleted());
+    assert(LiveRangesEnded == 0);
+    //Invalidates liveness info because the use Srcs[Index] is removed.
+    Srcs[Index] = Replacement;
+  }
 
   bool isLastUse(const Operand *Src) const;
   void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
diff --git a/src/IceOperand.h b/src/IceOperand.h
index b3e0f5f..fdb2a66 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -98,6 +98,12 @@
 
   virtual Variable *asBoolean() { return nullptr; }
 
+  virtual SizeT hashValue() const {
+    llvm::report_fatal_error("Tried to hash unsupported operand type : " +
+                             std::to_string(Kind));
+    return 0;
+  }
+
 protected:
   Operand(OperandKind Kind, Type Ty) : Ty(Ty), Kind(Kind) {
     // It is undefined behavior to have a larger value in the enum
@@ -153,6 +159,7 @@
     ++LookupCount;
   }
   CounterType getLookupCount() const { return LookupCount; }
+  SizeT hashValue() const override { return 0; }
 
 protected:
   Constant(OperandKind Kind, Type Ty) : Operand(Kind, Ty) {
@@ -204,6 +211,10 @@
     return Operand->getKind() == K;
   }
 
+  SizeT hashValue() const override {
+    return std::hash<PrimType>()(Value);
+  }
+
   virtual bool shouldBeRandomizedOrPooled() const override { return false; }
 
 private:
@@ -769,6 +780,10 @@
     return Kind >= kVariable && Kind <= kVariable_Max;
   }
 
+  SizeT hashValue() const override {
+    return std::hash<SizeT>()(getIndex());
+  }
+
 protected:
   Variable(const Cfg *Func, OperandKind K, Type Ty, SizeT Index)
       : Operand(K, Ty), Number(Index),
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 79b5477..af05b7b 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -443,6 +443,11 @@
   Func->processAllocas(SortAndCombineAllocas);
   Func->dump("After Alloca processing");
 
+  if (getFlags().getEnableExperimental()) {
+    Func->localCSE();
+    Func->dump("After Local CSE");
+  }
+
   if (!getFlags().getEnablePhiEdgeSplit()) {
     // Lower Phi instructions.
     Func->placePhiLoads();
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 5faac9c..41d73c2 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -41,6 +41,7 @@
   X(livenessLightweight)                                                       \
   X(llvmConvert)                                                               \
   X(loadOpt)                                                                   \
+  X(localCse)                                                                  \
   X(lowerPhiAssignments)                                                       \
   X(materializeVectorShuffles)                                                 \
   X(parse)                                                                     \
diff --git a/tests_lit/llvm2ice_tests/local-cse.ll b/tests_lit/llvm2ice_tests/local-cse.ll
new file mode 100644
index 0000000..fae1816
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/local-cse.ll
@@ -0,0 +1,36 @@
+; Tests local-cse on x8632 and x8664
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
+; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8632 \
+; RUN: --check-prefix=X8632EXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
+; RUN: -O2 | FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8664 \
+; RUN: --check-prefix=X8664EXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 | FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s
+
+
+define internal i32 @local_cse_test(i32 %a, i32 %b) {
+entry:
+  %add1 = add i32 %b, %a
+  %add2 = add i32 %b, %a
+  %add3 = add i32 %add1, %add2
+  ret i32 %add3
+}
+
+; X8632: add
+; X8632: add
+; X8632NOEXP: add
+; X8632EXP-NOT: add
+; X8632: ret
+
+; X8664: add
+; X8664: add
+; X8664NOEXP: add
+; X8664EXP-NOT: add
+; X8664: ret
\ No newline at end of file