Subzero: Initial implementation of BB Local CSE
Adds Cfg::localCse for basic-block local common-subexpression elimination
If we have
t1 = op b c
t2 = op b c
This pass will replace future uses of t2 in a basic block by t1.
To enable, use -enable-experimental in O2
BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1997443002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index f717379..8f6a2ff 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -507,6 +507,132 @@
dump("After basic block shuffling");
}
+void Cfg::localCSE() {
+ // Performs basic-block local common-subexpression elimination
+ // If we have
+ // t1 = op b c
+ // t2 = op b c
+ // This pass will replace future references to t2 in a basic block by t1
+ // Points to note:
+ // 1. Does not assume SSA, but not tested on non-SSA input yet as it is run
+ // at the beginning.
+ // 2. Leaves removal of instructions to DCE.
+ // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
+ // to take care of cases not arising from GEP simplification.
+ // 4. By default, two passes are made over each basic block. Control this
+ // with -lcse-max-iters=N
+
+ TimerMarker T(TimerStack::TT_localCse, this);
+ struct VariableHash {
+ size_t operator()(const Variable *Var) const { return Var->hashValue(); }
+ };
+
+ struct InstHash {
+ size_t operator()(const Inst *Instr) const {
+ auto Kind = Instr->getKind();
+ auto Result =
+ std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
+ Kind);
+ for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
+ Result ^= Instr->getSrc(i)->hashValue();
+ }
+ return Result;
+ }
+ };
+ struct InstEq {
+ bool srcEq(const Operand *A, const Operand *B) const {
+ if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
+ return (A == B);
+ return false;
+ }
+ bool operator()(const Inst *InstrA, const Inst *InstrB) const {
+ if ((InstrA->getKind() != InstrB->getKind()) ||
+ (InstrA->getSrcSize() != InstrB->getSrcSize()))
+ return false;
+
+ if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
+ auto *B = llvm::cast<InstArithmetic>(InstrB);
+ // A, B are guaranteed to be of the same 'kind' at this point
+ // So, dyn_cast is not needed
+ if (A->getOp() != B->getOp())
+ return false;
+ }
+ // Does not enter loop if different kind or number of operands
+ for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
+ if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
+ return false;
+ }
+ return true;
+ }
+ };
+
+ for (CfgNode *Node : getNodes()) {
+ CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
+
+ CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
+ // Combining the above two into a single data structure might consume less
+ // memory but will be slower i.e map of Instruction -> Set of Variables
+
+ CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
+ // Not necessary for SSA, still keeping it in case this pass is not run at
+ // the beginning. Remove to improve performace.
+
+ int IterCount = getFlags().getLocalCseMaxIterations();
+
+ while (IterCount--) {
+ // TODO : Stats on IterCount -> performance
+ for (Inst &Instr : Node->getInsts()) {
+ if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
+ continue;
+
+ // Invalidate replacements
+ auto Iter = Replacements.find(Instr.getDest());
+ if (Iter != Replacements.end()) {
+ Replacements.erase(Iter);
+ }
+
+ // Invalidate 'seen' instructions whose operands were just updated.
+ auto DepIter = Dependency.find(Instr.getDest());
+ if (DepIter != Dependency.end()) {
+ for (auto DepInst : DepIter->second) {
+ Seen.erase(DepInst);
+ }
+ }
+ // The above two can be removed if SSA is assumed.
+
+ // Replace - doing this before checking for repetitions might enable
+ // more
+ // optimizations
+ for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+ auto *Opnd = Instr.getSrc(i);
+ if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
+ if (Replacements.find(Var) != Replacements.end()) {
+ Instr.replaceSource(i, Replacements[Var]);
+ }
+ }
+ }
+
+ // Check for repetitions
+ auto SeenIter = Seen.find(&Instr);
+ if (SeenIter != Seen.end()) { // seen before
+ const Inst *Found = *SeenIter;
+ Replacements[Instr.getDest()] = Found->getDest();
+ } else { // new
+ Seen.insert(&Instr);
+
+ // Update dependencies
+ for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+ auto *Opnd = Instr.getSrc(i);
+ if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
+ Dependency[Var].push_back(&Instr);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
void Cfg::doArgLowering() {
TimerMarker T(TimerStack::TT_doArgLowering, this);
getTarget()->lowerArguments();
diff --git a/src/IceCfg.h b/src/IceCfg.h
index b85da28..e3f29c7 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -200,6 +200,7 @@
void advancedPhiLowering();
void reorderNodes();
void shuffleNodes();
+ void localCSE();
/// Scan allocas to determine whether we need to use a frame pointer.
/// If SortAndCombine == true, merge all the fixed-size allocas in the
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index e9d0822..5059989 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -140,6 +140,10 @@
"information to stdout at the end of program execution."), \
cl::init(false)) \
\
+ X(EnableExperimental, bool, dev_opt_flag, "enable-experimental", \
+ cl::desc("Enable Optimizations not yet part of O2"), \
+ cl::init(false)) \
+ \
X(EnablePhiEdgeSplit, bool, dev_opt_flag, "phi-edge-split", \
cl::desc("Enable edge splitting for Phi lowering"), cl::init(true)) \
\
@@ -176,6 +180,9 @@
"building LLVM IR first"), \
cl::init(false)) \
\
+ X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \
+ cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \
+ \
X(LogFilename, std::string, dev_opt_flag, "log", \
cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename")) \
\
diff --git a/src/IceInst.h b/src/IceInst.h
index 36b8810..89f308d 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -108,6 +108,13 @@
assert(I < getSrcSize());
return Srcs[I];
}
+ void replaceSource(SizeT Index, Operand *Replacement) {
+ assert(Index < NumSrcs);
+ assert(!isDeleted());
+ assert(LiveRangesEnded == 0);
+ //Invalidates liveness info because the use Srcs[Index] is removed.
+ Srcs[Index] = Replacement;
+ }
bool isLastUse(const Operand *Src) const;
void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
diff --git a/src/IceOperand.h b/src/IceOperand.h
index b3e0f5f..fdb2a66 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -98,6 +98,12 @@
virtual Variable *asBoolean() { return nullptr; }
+ virtual SizeT hashValue() const {
+ llvm::report_fatal_error("Tried to hash unsupported operand type : " +
+ std::to_string(Kind));
+ return 0;
+ }
+
protected:
Operand(OperandKind Kind, Type Ty) : Ty(Ty), Kind(Kind) {
// It is undefined behavior to have a larger value in the enum
@@ -153,6 +159,7 @@
++LookupCount;
}
CounterType getLookupCount() const { return LookupCount; }
+ SizeT hashValue() const override { return 0; }
protected:
Constant(OperandKind Kind, Type Ty) : Operand(Kind, Ty) {
@@ -204,6 +211,10 @@
return Operand->getKind() == K;
}
+ SizeT hashValue() const override {
+ return std::hash<PrimType>()(Value);
+ }
+
virtual bool shouldBeRandomizedOrPooled() const override { return false; }
private:
@@ -769,6 +780,10 @@
return Kind >= kVariable && Kind <= kVariable_Max;
}
+ SizeT hashValue() const override {
+ return std::hash<SizeT>()(getIndex());
+ }
+
protected:
Variable(const Cfg *Func, OperandKind K, Type Ty, SizeT Index)
: Operand(K, Ty), Number(Index),
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 79b5477..af05b7b 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -443,6 +443,11 @@
Func->processAllocas(SortAndCombineAllocas);
Func->dump("After Alloca processing");
+ if (getFlags().getEnableExperimental()) {
+ Func->localCSE();
+ Func->dump("After Local CSE");
+ }
+
if (!getFlags().getEnablePhiEdgeSplit()) {
// Lower Phi instructions.
Func->placePhiLoads();
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 5faac9c..41d73c2 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -41,6 +41,7 @@
X(livenessLightweight) \
X(llvmConvert) \
X(loadOpt) \
+ X(localCse) \
X(lowerPhiAssignments) \
X(materializeVectorShuffles) \
X(parse) \
diff --git a/tests_lit/llvm2ice_tests/local-cse.ll b/tests_lit/llvm2ice_tests/local-cse.ll
new file mode 100644
index 0000000..fae1816
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/local-cse.ll
@@ -0,0 +1,36 @@
+; Tests local-cse on x8632 and x8664
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
+; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8632 \
+; RUN: --check-prefix=X8632EXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
+; RUN: -O2 | FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8664 \
+; RUN: --check-prefix=X8664EXP %s
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
+; RUN: -O2 | FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s
+
+
+define internal i32 @local_cse_test(i32 %a, i32 %b) {
+entry:
+ %add1 = add i32 %b, %a
+ %add2 = add i32 %b, %a
+ %add3 = add i32 %add1, %add2
+ ret i32 %add3
+}
+
+; X8632: add
+; X8632: add
+; X8632NOEXP: add
+; X8632EXP-NOT: add
+; X8632: ret
+
+; X8664: add
+; X8664: add
+; X8664NOEXP: add
+; X8664EXP-NOT: add
+; X8664: ret
\ No newline at end of file