| //===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements a general divergence analysis for loop vectorization |
| // and GPU programs. It determines which branches and values in a loop or GPU |
| // program are divergent. It can help branch optimizations such as jump |
| // threading and loop unswitching to make better decisions. |
| // |
| // GPU programs typically use the SIMD execution model, where multiple threads |
| // in the same execution group have to execute in lock-step. Therefore, if the |
| // code contains divergent branches (i.e., threads in a group do not agree on |
| // which path of the branch to take), the group of threads has to execute all |
| // the paths from that branch with different subsets of threads enabled until |
| // they re-converge. |
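| // |
| // For example (illustrative), with a group of four threads executing |
| // |
| // if (tid < 2) { A; } else { B; } |
| // |
| // the group first executes A with threads {0,1} enabled and threads {2,3} |
| // masked out, then executes B with the inverted mask, and the threads |
| // re-converge afterwards. |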
| // |
| // Due to this execution model, some optimizations such as jump |
| // threading and loop unswitching can interfere with thread re-convergence. |
| // Therefore, an analysis that computes which branches in a GPU program are |
| // divergent can help the compiler to selectively run these optimizations. |
| // |
| // This implementation is derived from the Vectorization Analysis of the |
| // Region Vectorizer (RV). That implementation in turn is based on the approach |
| // described in |
| // |
| // Improving Performance of OpenCL on CPUs |
| // Ralf Karrenberg and Sebastian Hack |
| // CC '12 |
| // |
| // This DivergenceAnalysis implementation is generic in the sense that it does |
| // not itself identify original sources of divergence. |
| // Instead, specialized adapter classes, LoopDivergenceAnalysis for loops and |
| // GPUDivergenceAnalysis for GPU programs, identify the sources of divergence |
| // (e.g., special variables that hold the thread ID or the iteration variable). |
| // |
| // The generic implementation propagates divergence to variables that are data |
| // or sync dependent on a source of divergence. |
| // |
| // While data dependency is a well-known concept, the notion of sync dependency |
| // is worth more explanation. Sync dependence characterizes the control flow |
| // aspect of the propagation of branch divergence. For example, |
| // |
| // %cond = icmp slt i32 %tid, 10 |
| // br i1 %cond, label %then, label %else |
| // then: |
| // br label %merge |
| // else: |
| // br label %merge |
| // merge: |
| // %a = phi i32 [ 0, %then ], [ 1, %else ] |
| // |
| // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid |
| // because %tid is not on its use-def chains, %a is sync dependent on %tid |
| // because the branch "br i1 %cond" depends on %tid and affects which value %a |
| // is assigned to. |
| // |
| // The sync dependence detection (which branch induces divergence in which join |
| // points) is implemented in the SyncDependenceAnalysis. |
| // |
| // The current DivergenceAnalysis implementation has the following limitations: |
| // 1. intra-procedural. It conservatively considers the arguments of a |
| // non-kernel-entry function and the return value of a function call as |
| // divergent. |
| // 2. memory as black box. It conservatively considers values loaded from |
| // generic or local address spaces as divergent. This can be improved by |
| // leveraging pointer analysis and/or by modelling non-escaping memory objects |
| // in SSA as done in RV. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Analysis/DivergenceAnalysis.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/Passes.h" |
| #include "llvm/Analysis/PostDominators.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/InstIterator.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include <vector> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "divergence-analysis" |
| |
| // class DivergenceAnalysis |
| DivergenceAnalysis::DivergenceAnalysis( |
| const Function &F, const Loop *RegionLoop, const DominatorTree &DT, |
| const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm) |
| : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA), |
| IsLCSSAForm(IsLCSSAForm) {} |
| |
| void DivergenceAnalysis::markDivergent(const Value &DivVal) { |
| assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal)); |
| assert(!isAlwaysUniform(DivVal) && "cannot mark a uniform value as divergent"); |
| DivergentValues.insert(&DivVal); |
| } |
| |
| void DivergenceAnalysis::addUniformOverride(const Value &UniVal) { |
| UniformOverrides.insert(&UniVal); |
| } |
| |
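| // Returns true if Term is a conditional branch or switch with a divergent |
| // condition. Terminators with at most one successor and invoke instructions |
| // never become divergent. |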
| bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const { |
| if (Term.getNumSuccessors() <= 1) |
| return false; |
| if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) { |
| assert(BranchTerm->isConditional()); |
| return isDivergent(*BranchTerm->getCondition()); |
| } |
| if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) { |
| return isDivergent(*SwitchTerm->getCondition()); |
| } |
| if (isa<InvokeInst>(Term)) { |
| return false; // ignore abnormal executions through landingpad |
| } |
| |
| llvm_unreachable("unexpected terminator"); |
| } |
| |
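| // Returns true if any operand of I is divergent. |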
| bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const { |
| // TODO: handle function calls with side effects, etc. |
| for (const auto &Op : I.operands()) { |
| if (isDivergent(*Op)) |
| return true; |
| } |
| return false; |
| } |
| |
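| // Returns true if Val is carried by a divergent loop that terminates before |
| // control reaches ObservingBlock. Threads then leave that loop in different |
| // iterations and may observe different values of Val, even if Val is uniform |
| // within the loop ("temporal divergence"). |
| // |
| // A minimal illustrative IR sketch, assuming %tid holds the thread ID: |
| // |
| // loop: |
| // %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] |
| // %i.next = add i32 %i, 1 ; uniform within the loop |
| // %done = icmp eq i32 %i, %tid ; divergent exit condition |
| // br i1 %done, label %exit, label %loop |
| // exit: |
| // ; threads exit in different iterations, so the value of %i observed |
| // ; in %exit is divergent |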
| bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock, |
| const Value &Val) const { |
| const auto *Inst = dyn_cast<const Instruction>(&Val); |
| if (!Inst) |
| return false; |
| // check whether any divergent loop carrying Val terminates before control |
| // proceeds to ObservingBlock |
| for (const auto *Loop = LI.getLoopFor(Inst->getParent()); |
| Loop && Loop != RegionLoop && !Loop->contains(&ObservingBlock); |
| Loop = Loop->getParentLoop()) { |
| if (DivergentLoops.find(Loop) != DivergentLoops.end()) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const { |
| // join of divergent disjoint paths in the parent block of Phi |
| if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) { |
| return true; |
| } |
| |
| // An incoming value could be divergent by itself. |
| // Otherwise, an incoming value could be uniform within the loop |
| // that carries its definition but it may appear divergent |
| // from outside the loop. This happens when divergent loop exits |
| // drop definitions of that uniform value in different iterations. |
| // |
| // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop |
| // if (i % thread_id == 0) break; // divergent loop exit |
| // } |
| // int divI = i; // divI is divergent |
| for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) { |
| const auto *InVal = Phi.getIncomingValue(i); |
| if (isDivergent(*InVal) || |
| isTemporalDivergent(*Phi.getParent(), *InVal)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool DivergenceAnalysis::inRegion(const Instruction &I) const { |
| return I.getParent() && inRegion(*I.getParent()); |
| } |
| |
| bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const { |
| return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F); |
| } |
| |
| // Marks all uses of loop-carried values outside the loop headed by |
| // LoopHeader as divergent. |
| void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) { |
| auto *DivLoop = LI.getLoopFor(&LoopHeader); |
| assert(DivLoop && "loopHeader is not actually part of a loop"); |
| |
| SmallVector<BasicBlock *, 8> TaintStack; |
| DivLoop->getExitBlocks(TaintStack); |
| |
| // Potential users of loop-carried values could be anywhere in the dominance |
| // region of DivLoop (including its fringes, for phi nodes). |
| DenseSet<const BasicBlock *> Visited; |
| for (auto *Block : TaintStack) { |
| Visited.insert(Block); |
| } |
| Visited.insert(&LoopHeader); |
| |
| while (!TaintStack.empty()) { |
| auto *UserBlock = TaintStack.back(); |
| TaintStack.pop_back(); |
| |
| // don't spread divergence beyond the region |
| if (!inRegion(*UserBlock)) |
| continue; |
| |
| assert(!DivLoop->contains(UserBlock) && |
| "irreducible control flow detected"); |
| |
| // phi nodes at the fringes of the dominance region |
| if (!DT.dominates(&LoopHeader, UserBlock)) { |
| // all PHI nodes of UserBlock become divergent |
| for (auto &Phi : UserBlock->phis()) { |
| Worklist.push_back(&Phi); |
| } |
| continue; |
| } |
| |
| // taint outside users of values carried by DivLoop |
| for (auto &I : *UserBlock) { |
| if (isAlwaysUniform(I)) |
| continue; |
| if (isDivergent(I)) |
| continue; |
| |
| for (auto &Op : I.operands()) { |
| auto *OpInst = dyn_cast<Instruction>(&Op); |
| if (!OpInst) |
| continue; |
| if (DivLoop->contains(OpInst->getParent())) { |
| markDivergent(I); |
| pushUsers(I); |
| break; |
| } |
| } |
| } |
| |
| // visit all blocks in the dominance region |
| for (auto *SuccBlock : successors(UserBlock)) { |
| if (!Visited.insert(SuccBlock).second) { |
| continue; |
| } |
| TaintStack.push_back(SuccBlock); |
| } |
| } |
| } |
| |
| void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) { |
| for (const auto &Phi : Block.phis()) { |
| if (isDivergent(Phi)) |
| continue; |
| Worklist.push_back(&Phi); |
| } |
| } |
| |
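| // Pushes all users of V that lie inside the region and are not yet known to |
| // be divergent onto the worklist. |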
| void DivergenceAnalysis::pushUsers(const Value &V) { |
| for (const auto *User : V.users()) { |
| const auto *UserInst = dyn_cast<const Instruction>(User); |
| if (!UserInst) |
| continue; |
| |
| if (isDivergent(*UserInst)) |
| continue; |
| |
| // only propagate divergence within the region |
| if (!inRegion(*UserInst)) |
| continue; |
| Worklist.push_back(UserInst); |
| } |
| } |
| |
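| // Propagates the divergence of a branch in BranchLoop into JoinBlock by |
| // pushing the non-divergent phi nodes of JoinBlock onto the worklist. Returns |
| // true iff JoinBlock lies outside BranchLoop, i.e., the branch causes a |
| // divergent loop exit; otherwise JoinBlock is marked as join-divergent. Joins |
| // outside the region are ignored. |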
| bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock, |
| const Loop *BranchLoop) { |
| LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n"); |
| |
| // ignore divergence outside the region |
| if (!inRegion(JoinBlock)) { |
| return false; |
| } |
| |
| // push non-divergent phi nodes in JoinBlock to the worklist |
| pushPHINodes(JoinBlock); |
| |
| // JoinBlock is a divergent loop exit |
| if (BranchLoop && !BranchLoop->contains(&JoinBlock)) { |
| return true; |
| } |
| |
| // disjoint-paths divergent at JoinBlock |
| markBlockJoinDivergent(JoinBlock); |
| return false; |
| } |
| |
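| // Marks the divergent terminator Term and propagates its control-induced |
| // divergence to all join blocks reported by the SyncDependenceAnalysis. If a |
| // loop exit becomes divergent because of Term, the containing loop is marked |
| // divergent and its divergence is propagated in turn. |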
| void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) { |
| LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n"); |
| |
| markDivergent(Term); |
| |
| const auto *BranchLoop = LI.getLoopFor(Term.getParent()); |
| |
| // whether there is a divergent loop exit from BranchLoop (if any) |
| bool IsBranchLoopDivergent = false; |
| |
| // Iterate over all blocks reachable by disjoint paths from Term within the |
| // loop. This also covers loop exits that become divergent due to Term. |
| for (const auto *JoinBlock : SDA.join_blocks(Term)) { |
| IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); |
| } |
| |
| // Branch loop is a divergent loop due to the divergent branch in Term |
| if (IsBranchLoopDivergent) { |
| assert(BranchLoop); |
| if (!DivergentLoops.insert(BranchLoop).second) { |
| return; |
| } |
| propagateLoopDivergence(*BranchLoop); |
| } |
| } |
| |
| void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) { |
| LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n"); |
| |
| // don't propagate beyond region |
| if (!inRegion(*ExitingLoop.getHeader())) |
| return; |
| |
| const auto *BranchLoop = ExitingLoop.getParentLoop(); |
| |
| // Uses of loop-carried values could occur anywhere |
| // within the dominance region of the definition. All loop-carried |
| // definitions are dominated by the loop header (reducible control). |
| // Thus all users have to be in the dominance region of the loop header, |
| // except PHI nodes, which may also sit at the fringes of that region: a |
| // phi's incoming value only needs to dominate the end of the corresponding |
| // predecessor block. |
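| // |
| // For example, in LCSSA form (illustrative): |
| // |
| // exit: |
| // %v.lcssa = phi i32 [ %v, %loop ] |
| // |
| // every out-of-loop use of the loop-defined %v goes through an exit phi, so |
| // divergence reaches loop live-outs directly via these phis and no explicit |
| // tainting of the dominance region is needed. |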
| if (!IsLCSSAForm) |
| taintLoopLiveOuts(*ExitingLoop.getHeader()); |
| |
| // whether there is a divergent loop exit from BranchLoop (if any) |
| bool IsBranchLoopDivergent = false; |
| |
| // Iterate over all blocks reachable by disjoint paths from the exits of |
| // ExitingLoop. This also covers loop exits (of BranchLoop) that in turn |
| // become divergent. |
| for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) { |
| IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop); |
| } |
| |
| // BranchLoop is divergent due to a divergent loop exit in ExitingLoop |
| if (IsBranchLoopDivergent) { |
| assert(BranchLoop); |
| if (!DivergentLoops.insert(BranchLoop).second) { |
| return; |
| } |
| propagateLoopDivergence(*BranchLoop); |
| } |
| } |
| |
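| // Propagates divergence from the seed values to a fixed point: instructions |
| // are taken off the worklist, their divergence is re-evaluated based on their |
| // operands (and, for terminators, their sync dependences), and users of newly |
| // divergent values are pushed onto the worklist in turn. |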
| void DivergenceAnalysis::compute() { |
| for (auto *DivVal : DivergentValues) { |
| pushUsers(*DivVal); |
| } |
| |
| // propagate divergence |
| while (!Worklist.empty()) { |
| const Instruction &I = *Worklist.back(); |
| Worklist.pop_back(); |
| |
| // maintain uniformity of overrides |
| if (isAlwaysUniform(I)) |
| continue; |
| |
| if (isDivergent(I)) |
| continue; |
| |
| // propagate divergence caused by terminator |
| if (I.isTerminator()) { |
| if (updateTerminator(I)) { |
| // propagate control divergence to affected instructions |
| propagateBranchDivergence(I); |
| continue; |
| } |
| } |
| |
| // update divergence of I due to divergent operands |
| bool DivergentUpd = false; |
| const auto *Phi = dyn_cast<const PHINode>(&I); |
| if (Phi) { |
| DivergentUpd = updatePHINode(*Phi); |
| } else { |
| DivergentUpd = updateNormalInstruction(I); |
| } |
| |
| // propagate value divergence to users |
| if (DivergentUpd) { |
| markDivergent(I); |
| pushUsers(I); |
| } |
| } |
| } |
| |
| bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const { |
| return UniformOverrides.find(&V) != UniformOverrides.end(); |
| } |
| |
| bool DivergenceAnalysis::isDivergent(const Value &V) const { |
| return DivergentValues.find(&V) != DivergentValues.end(); |
| } |
| |
| bool DivergenceAnalysis::isDivergentUse(const Use &U) const { |
| Value &V = *U.get(); |
| Instruction &I = *cast<Instruction>(U.getUser()); |
| return isDivergent(V) || isTemporalDivergent(*I.getParent(), V); |
| } |
| |
| void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { |
| if (DivergentValues.empty()) |
| return; |
| // iterate instructions using instructions() to ensure a deterministic order. |
| for (auto &I : instructions(F)) { |
| if (isDivergent(I)) |
| OS << "DIVERGENT:" << I << '\n'; |
| } |
| } |
| |
| // class GPUDivergenceAnalysis |
| GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, |
| const DominatorTree &DT, |
| const PostDominatorTree &PDT, |
| const LoopInfo &LI, |
| const TargetTransformInfo &TTI) |
| : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { |
| for (auto &I : instructions(F)) { |
| if (TTI.isSourceOfDivergence(&I)) { |
| DA.markDivergent(I); |
| } else if (TTI.isAlwaysUniform(&I)) { |
| DA.addUniformOverride(I); |
| } |
| } |
| for (auto &Arg : F.args()) { |
| if (TTI.isSourceOfDivergence(&Arg)) { |
| DA.markDivergent(Arg); |
| } |
| } |
| |
| DA.compute(); |
| } |
| |
| bool GPUDivergenceAnalysis::isDivergent(const Value &Val) const { |
| return DA.isDivergent(Val); |
| } |
| |
| bool GPUDivergenceAnalysis::isDivergentUse(const Use &U) const { |
| return DA.isDivergentUse(U); |
| } |
| |
| void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const { |
| OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; |
| DA.print(OS, Mod); |
| OS << "}\n"; |
| } |
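| |
| // A minimal usage sketch (illustrative; assumes the DominatorTree DT, |
| // PostDominatorTree PDT, LoopInfo LI and TargetTransformInfo TTI have already |
| // been computed for the function F): |
| // |
| // GPUDivergenceAnalysis DivInfo(F, DT, PDT, LI, TTI); |
| // for (const auto &I : instructions(F)) |
| // if (DivInfo.isDivergent(I)) |
| // ; // e.g., treat I conservatively in jump threading |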