| //===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===// |
| // |
| // The Subzero Code Generator |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// \brief Implements the Cfg class. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "IceCfg.h" |
| |
| #include "IceAssembler.h" |
| #include "IceBitVector.h" |
| #include "IceCfgNode.h" |
| #include "IceClFlags.h" |
| #include "IceDefs.h" |
| #include "IceELFObjectWriter.h" |
| #include "IceGlobalInits.h" |
| #include "IceInst.h" |
| #include "IceInstVarIter.h" |
| #include "IceInstrumentation.h" |
| #include "IceLiveness.h" |
| #include "IceLoopAnalyzer.h" |
| #include "IceOperand.h" |
| #include "IceTargetLowering.h" |
| |
| #include <memory> |
| #include <utility> |
| |
| namespace Ice { |
| |
| Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber) |
| : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber), |
| VMask(getFlags().getVerbose()), FunctionName(), |
| NextInstNumber(Inst::NumberInitial), Live(nullptr) { |
| NodeStrings.reset(new StringPool); |
| VarStrings.reset(new StringPool); |
| CfgLocalAllocatorScope _(this); |
| Target = TargetLowering::createLowering(getFlags().getTargetArch(), this); |
| VMetadata.reset(new VariablesMetadata(this)); |
| TargetAssembler = Target->createAssembler(); |
| } |
| |
| Cfg::~Cfg() { |
| assert(CfgAllocatorTraits::current() == nullptr); |
| if (getFlags().getDumpStrings()) { |
| OstreamLocker _(Ctx); |
| Ostream &Str = Ctx->getStrDump(); |
| getNodeStrings()->dump(Str); |
| getVarStrings()->dump(Str); |
| } |
| } |
| |
| // Called in the initalizer list of Cfg's constructor to create the Allocator |
| // and set it as TLS before any other member fields are constructed, since they |
| // may depend on it. |
| ArenaAllocator *Cfg::createAllocator() { |
| ArenaAllocator *Allocator = new ArenaAllocator(); |
| CfgAllocatorTraits::set_current(Allocator); |
| return Allocator; |
| } |
| |
| /// Create a string like "foo(i=123:b=9)" indicating the function name, number |
| /// of high-level instructions, and number of basic blocks. This string is only |
| /// used for dumping and other diagnostics, and the idea is that given a set of |
| /// functions to debug a problem on, it's easy to find the smallest or simplest |
| /// function to attack. Note that the counts may change somewhat depending on |
| /// what point it is called during the translation passes. |
| std::string Cfg::getFunctionNameAndSize() const { |
| if (!BuildDefs::dump()) |
| return getFunctionName().toString(); |
| SizeT NodeCount = 0; |
| SizeT InstCount = 0; |
| for (CfgNode *Node : getNodes()) { |
| ++NodeCount; |
| // Note: deleted instructions are *not* ignored. |
| InstCount += Node->getPhis().size(); |
| for (Inst &I : Node->getInsts()) { |
| if (!llvm::isa<InstTarget>(&I)) |
| ++InstCount; |
| } |
| } |
| return getFunctionName() + "(i=" + std::to_string(InstCount) + |
| ":b=" + std::to_string(NodeCount) + ")"; |
| } |
| |
| void Cfg::setError(const std::string &Message) { |
| HasError = true; |
| ErrorMessage = Message; |
| } |
| |
| CfgNode *Cfg::makeNode() { |
| SizeT LabelIndex = Nodes.size(); |
| auto *Node = CfgNode::create(this, LabelIndex); |
| Nodes.push_back(Node); |
| return Node; |
| } |
| |
| void Cfg::swapNodes(NodeList &NewNodes) { |
| assert(Nodes.size() == NewNodes.size()); |
| Nodes.swap(NewNodes); |
| for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I) |
| Nodes[I]->resetIndex(I); |
| } |
| |
| template <> Variable *Cfg::makeVariable<Variable>(Type Ty) { |
| SizeT Index = Variables.size(); |
| Variable *Var; |
| if (Target->shouldSplitToVariableVecOn32(Ty)) { |
| Var = VariableVecOn32::create(this, Ty, Index); |
| } else if (Target->shouldSplitToVariable64On32(Ty)) { |
| Var = Variable64On32::create(this, Ty, Index); |
| } else { |
| Var = Variable::create(this, Ty, Index); |
| } |
| Variables.push_back(Var); |
| return Var; |
| } |
| |
| void Cfg::addArg(Variable *Arg) { |
| Arg->setIsArg(); |
| Args.push_back(Arg); |
| } |
| |
| void Cfg::addImplicitArg(Variable *Arg) { |
| Arg->setIsImplicitArg(); |
| ImplicitArgs.push_back(Arg); |
| } |
| |
| // Returns whether the stack frame layout has been computed yet. This is used |
| // for dumping the stack frame location of Variables. |
| bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); } |
| |
| namespace { |
| constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$"; |
| constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$"; |
| } // end of anonymous namespace |
| |
| void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) { |
| auto *Var = VariableDeclaration::create(GlobalInits.get()); |
| Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName); |
| Var->setIsConstant(true); |
| Var->addInitializer(VariableDeclaration::DataInitializer::create( |
| GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1)); |
| const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64); |
| Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes. |
| GlobalInits->push_back(Var); |
| } |
| |
| void Cfg::createBlockProfilingInfoDeclaration( |
| const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) { |
| auto *Var = VariableDeclaration::create(GlobalInits.get()); |
| Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName); |
| const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64); |
| Var->addInitializer(VariableDeclaration::ZeroInitializer::create( |
| GlobalInits.get(), Int64ByteSize)); |
| |
| const RelocOffsetT NodeNameDeclarationOffset = 0; |
| Var->addInitializer(VariableDeclaration::RelocInitializer::create( |
| GlobalInits.get(), NodeNameDeclaration, |
| {RelocOffset::create(Ctx, NodeNameDeclarationOffset)})); |
| Var->setAlignment(Int64ByteSize); |
| GlobalInits->push_back(Var); |
| } |
| |
| void Cfg::translate() { |
| if (hasError()) |
| return; |
| // Cache the possibly-overridden optimization level once translation begins. |
| // It would be nicer to do this in the constructor, but we need to wait until |
| // after setFunctionName() has a chance to be called. |
| OptimizationLevel = |
| getFlags().matchForceO2(getFunctionName(), getSequenceNumber()) |
| ? Opt_2 |
| : getFlags().getOptLevel(); |
| if (BuildDefs::timers()) { |
| if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) { |
| setFocusedTiming(); |
| getContext()->resetTimer(GlobalContext::TSK_Default); |
| } |
| } |
| if (BuildDefs::dump()) { |
| if (isVerbose(IceV_Status) && |
| getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) { |
| getContext()->getStrDump() |
| << ">>>Translating " << getFunctionNameAndSize() |
| << " seq=" << getSequenceNumber() << "\n"; |
| } |
| } |
| TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty()); |
| TimerMarker T(TimerStack::TT_translate, this); |
| |
| dump("Initial CFG"); |
| |
| // Create the Hi and Lo variables where a split was needed |
| for (Variable *Var : Variables) { |
| if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) { |
| Var64On32->initHiLo(this); |
| } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) { |
| VarVecOn32->initVecElement(this); |
| } |
| } |
| |
| // Instrument the Cfg, e.g. with AddressSanitizer |
| if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) { |
| getContext()->instrumentFunc(this); |
| dump("Instrumented CFG"); |
| } |
| |
| // The set of translation passes and their order are determined by the |
| // target. |
| getTarget()->translate(); |
| |
| dump("Final output"); |
| if (getFocusedTiming()) { |
| getContext()->dumpLocalTimers(getFunctionName().toString()); |
| } |
| } |
| |
| void Cfg::fixPhiNodes() { |
| for (auto *Node : Nodes) { |
| // Fix all the phi edges since WASM can't tell how to make them correctly at |
| // the beginning. |
| assert(Node); |
| const auto &InEdges = Node->getInEdges(); |
| for (auto &Instr : Node->getPhis()) { |
| auto *Phi = llvm::cast<InstPhi>(&Instr); |
| assert(Phi); |
| for (SizeT i = 0; i < InEdges.size(); ++i) { |
| Phi->setLabel(i, InEdges[i]); |
| } |
| } |
| } |
| } |
| |
| void Cfg::computeInOutEdges() { |
| // Compute the out-edges. |
| for (CfgNode *Node : Nodes) { |
| Node->computeSuccessors(); |
| } |
| |
| // Prune any unreachable nodes before computing in-edges. |
| SizeT NumNodes = getNumNodes(); |
| BitVector Reachable(NumNodes); |
| BitVector Pending(NumNodes); |
| Pending.set(getEntryNode()->getIndex()); |
| while (true) { |
| int Index = Pending.find_first(); |
| if (Index == -1) |
| break; |
| Pending.reset(Index); |
| Reachable.set(Index); |
| CfgNode *Node = Nodes[Index]; |
| assert(Node->getIndex() == (SizeT)Index); |
| for (CfgNode *Succ : Node->getOutEdges()) { |
| SizeT SuccIndex = Succ->getIndex(); |
| if (!Reachable.test(SuccIndex)) |
| Pending.set(SuccIndex); |
| } |
| } |
| SizeT Dest = 0; |
| for (SizeT Source = 0; Source < NumNodes; ++Source) { |
| if (Reachable.test(Source)) { |
| Nodes[Dest] = Nodes[Source]; |
| Nodes[Dest]->resetIndex(Dest); |
| // Compute the in-edges. |
| Nodes[Dest]->computePredecessors(); |
| ++Dest; |
| } |
| } |
| Nodes.resize(Dest); |
| |
| TimerMarker T(TimerStack::TT_phiValidation, this); |
| for (CfgNode *Node : Nodes) |
| Node->enforcePhiConsistency(); |
| } |
| |
| void Cfg::renumberInstructions() { |
| TimerMarker T(TimerStack::TT_renumberInstructions, this); |
| NextInstNumber = Inst::NumberInitial; |
| for (CfgNode *Node : Nodes) |
| Node->renumberInstructions(); |
| // Make sure the entry node is the first node and therefore got the lowest |
| // instruction numbers, to facilitate live range computation of function |
| // arguments. We want to model function arguments as being live on entry to |
| // the function, otherwise an argument whose only use is in the first |
| // instruction will be assigned a trivial live range and the register |
| // allocator will not recognize its live range as overlapping another |
| // variable's live range. |
| assert(Nodes.empty() || (*Nodes.begin() == getEntryNode())); |
| } |
| |
| // placePhiLoads() must be called before placePhiStores(). |
| void Cfg::placePhiLoads() { |
| TimerMarker T(TimerStack::TT_placePhiLoads, this); |
| for (CfgNode *Node : Nodes) |
| Node->placePhiLoads(); |
| } |
| |
| // placePhiStores() must be called after placePhiLoads(). |
| void Cfg::placePhiStores() { |
| TimerMarker T(TimerStack::TT_placePhiStores, this); |
| for (CfgNode *Node : Nodes) |
| Node->placePhiStores(); |
| } |
| |
| void Cfg::deletePhis() { |
| TimerMarker T(TimerStack::TT_deletePhis, this); |
| for (CfgNode *Node : Nodes) |
| Node->deletePhis(); |
| } |
| |
| void Cfg::advancedPhiLowering() { |
| TimerMarker T(TimerStack::TT_advancedPhiLowering, this); |
| // Clear all previously computed live ranges (but not live-in/live-out bit |
| // vectors or last-use markers), because the follow-on register allocation is |
| // only concerned with live ranges across the newly created blocks. |
| for (Variable *Var : Variables) { |
| Var->getLiveRange().reset(); |
| } |
| // This splits edges and appends new nodes to the end of the node list. This |
| // can invalidate iterators, so don't use an iterator. |
| SizeT NumNodes = getNumNodes(); |
| SizeT NumVars = getNumVariables(); |
| for (SizeT I = 0; I < NumNodes; ++I) |
| Nodes[I]->advancedPhiLowering(); |
| |
| TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this); |
| if (true) { |
| // The following code does an in-place update of liveness and live ranges |
| // as a result of adding the new phi edge split nodes. |
| getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes, |
| Variables.begin() + NumVars); |
| TimerMarker TTT(TimerStack::TT_liveness, this); |
| // Iterate over the newly added nodes to add their liveness info. |
| for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) { |
| InstNumberT FirstInstNum = getNextInstNumber(); |
| (*I)->renumberInstructions(); |
| InstNumberT LastInstNum = getNextInstNumber() - 1; |
| (*I)->liveness(getLiveness()); |
| (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum); |
| } |
| } else { |
| // The following code does a brute-force recalculation of live ranges as a |
| // result of adding the new phi edge split nodes. The liveness calculation |
| // is particularly expensive because the new nodes are not yet in a proper |
| // topological order and so convergence is slower. |
| // |
| // This code is kept here for reference and can be temporarily enabled in |
| // case the incremental code is under suspicion. |
| renumberInstructions(); |
| liveness(Liveness_Intervals); |
| getVMetadata()->init(VMK_All); |
| } |
| Target->regAlloc(RAK_Phi); |
| } |
| |
| // Find a reasonable placement for nodes that have not yet been placed, while |
| // maintaining the same relative ordering among already placed nodes. |
| void Cfg::reorderNodes() { |
| // TODO(ascull): it would be nice if the switch tests were always followed by |
| // the default case to allow for fall through. |
| using PlacedList = CfgList<CfgNode *>; |
| PlacedList Placed; // Nodes with relative placement locked down |
| PlacedList Unreachable; // Unreachable nodes |
| PlacedList::iterator NoPlace = Placed.end(); |
| // Keep track of where each node has been tentatively placed so that we can |
| // manage insertions into the middle. |
| CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace); |
| for (CfgNode *Node : Nodes) { |
| // The "do ... while(0);" construct is to factor out the --PlaceIndex and |
| // assert() statements before moving to the next node. |
| do { |
| if (Node != getEntryNode() && Node->getInEdges().empty()) { |
| // The node has essentially been deleted since it is not a successor of |
| // any other node. |
| Unreachable.push_back(Node); |
| PlaceIndex[Node->getIndex()] = Unreachable.end(); |
| Node->setNeedsPlacement(false); |
| continue; |
| } |
| if (!Node->needsPlacement()) { |
| // Add to the end of the Placed list. |
| Placed.push_back(Node); |
| PlaceIndex[Node->getIndex()] = Placed.end(); |
| continue; |
| } |
| Node->setNeedsPlacement(false); |
| // Assume for now that the unplaced node is from edge-splitting and |
| // therefore has 1 in-edge and 1 out-edge (actually, possibly more than 1 |
| // in-edge if the predecessor node was contracted). If this changes in |
| // the future, rethink the strategy. |
| assert(Node->getInEdges().size() >= 1); |
| assert(Node->hasSingleOutEdge()); |
| |
| // If it's a (non-critical) edge where the successor has a single |
| // in-edge, then place it before the successor. |
| CfgNode *Succ = Node->getOutEdges().front(); |
| if (Succ->getInEdges().size() == 1 && |
| PlaceIndex[Succ->getIndex()] != NoPlace) { |
| Placed.insert(PlaceIndex[Succ->getIndex()], Node); |
| PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()]; |
| continue; |
| } |
| |
| // Otherwise, place it after the (first) predecessor. |
| CfgNode *Pred = Node->getInEdges().front(); |
| auto PredPosition = PlaceIndex[Pred->getIndex()]; |
| // It shouldn't be the case that PredPosition==NoPlace, but if that |
| // somehow turns out to be true, we just insert Node before |
| // PredPosition=NoPlace=Placed.end() . |
| if (PredPosition != NoPlace) |
| ++PredPosition; |
| Placed.insert(PredPosition, Node); |
| PlaceIndex[Node->getIndex()] = PredPosition; |
| } while (0); |
| |
| --PlaceIndex[Node->getIndex()]; |
| assert(*PlaceIndex[Node->getIndex()] == Node); |
| } |
| |
| // Reorder Nodes according to the built-up lists. |
| NodeList Reordered; |
| Reordered.reserve(Placed.size() + Unreachable.size()); |
| for (CfgNode *Node : Placed) |
| Reordered.push_back(Node); |
| for (CfgNode *Node : Unreachable) |
| Reordered.push_back(Node); |
| assert(getNumNodes() == Reordered.size()); |
| swapNodes(Reordered); |
| } |
| |
| void Cfg::localCSE(bool AssumeSSA) { |
| // Performs basic-block local common-subexpression elimination |
| // If we have |
| // t1 = op b c |
| // t2 = op b c |
| // This pass will replace future references to t2 in a basic block by t1 |
| // Points to note: |
| // 1. Assumes SSA by default. To change this, use -lcse=no-ssa |
| // This is needed if this pass is moved to a point later in the pipeline. |
| // If variables have a single definition (in the node), CSE can work just |
| // on the basis of an equality compare on instructions (sans Dest). When |
| // variables can be updated (hence, non-SSA) the result of a previous |
| // instruction which used that variable as an operand can not be reused. |
| // 2. Leaves removal of instructions to DCE. |
| // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected |
| // to take care of cases not arising from GEP simplification. |
| // 4. By default, a single pass is made over each basic block. Control this |
| // with -lcse-max-iters=N |
| |
| TimerMarker T(TimerStack::TT_localCse, this); |
| struct VariableHash { |
| size_t operator()(const Variable *Var) const { return Var->hashValue(); } |
| }; |
| |
| struct InstHash { |
| size_t operator()(const Inst *Instr) const { |
| auto Kind = Instr->getKind(); |
| auto Result = |
| std::hash<typename std::underlying_type<Inst::InstKind>::type>()( |
| Kind); |
| for (SizeT i = 0; i < Instr->getSrcSize(); ++i) { |
| Result ^= Instr->getSrc(i)->hashValue(); |
| } |
| return Result; |
| } |
| }; |
| struct InstEq { |
| bool srcEq(const Operand *A, const Operand *B) const { |
| if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A)) |
| return (A == B); |
| return false; |
| } |
| bool operator()(const Inst *InstrA, const Inst *InstrB) const { |
| if ((InstrA->getKind() != InstrB->getKind()) || |
| (InstrA->getSrcSize() != InstrB->getSrcSize())) |
| return false; |
| |
| if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) { |
| auto *B = llvm::cast<InstArithmetic>(InstrB); |
| // A, B are guaranteed to be of the same 'kind' at this point |
| // So, dyn_cast is not needed |
| if (A->getOp() != B->getOp()) |
| return false; |
| } |
| // Does not enter loop if different kind or number of operands |
| for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) { |
| if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i))) |
| return false; |
| } |
| return true; |
| } |
| }; |
| |
| for (CfgNode *Node : getNodes()) { |
| CfgUnorderedSet<Inst *, InstHash, InstEq> Seen; |
| // Stores currently available instructions. |
| |
| CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements; |
| // Combining the above two into a single data structure might consume less |
| // memory but will be slower i.e map of Instruction -> Set of Variables |
| |
| CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency; |
| // Maps a variable to the Instructions that depend on it. |
| // a = op1 b c |
| // x = op2 c d |
| // Will result in the map : b -> {a}, c -> {a, x}, d -> {x} |
| // Not necessary for SSA as dependencies will never be invalidated, and the |
| // container will use minimal memory when left unused. |
| |
| auto IterCount = getFlags().getLocalCseMaxIterations(); |
| |
| for (uint32_t i = 0; i < IterCount; ++i) { |
| // TODO(manasijm): Stats on IterCount -> performance |
| for (Inst &Instr : Node->getInsts()) { |
| if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr)) |
| continue; |
| if (!AssumeSSA) { |
| // Invalidate replacements |
| auto Iter = Replacements.find(Instr.getDest()); |
| if (Iter != Replacements.end()) { |
| Replacements.erase(Iter); |
| } |
| |
| // Invalidate 'seen' instructions whose operands were just updated. |
| auto DepIter = Dependency.find(Instr.getDest()); |
| if (DepIter != Dependency.end()) { |
| for (auto *DepInst : DepIter->second) { |
| Seen.erase(DepInst); |
| } |
| } |
| } |
| |
| // Replace - doing this before checking for repetitions might enable |
| // more optimizations |
| for (SizeT i = 0; i < Instr.getSrcSize(); ++i) { |
| auto *Opnd = Instr.getSrc(i); |
| if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) { |
| if (Replacements.find(Var) != Replacements.end()) { |
| Instr.replaceSource(i, Replacements[Var]); |
| } |
| } |
| } |
| |
| // Check for repetitions |
| auto SeenIter = Seen.find(&Instr); |
| if (SeenIter != Seen.end()) { // seen before |
| const Inst *Found = *SeenIter; |
| Replacements[Instr.getDest()] = Found->getDest(); |
| } else { // new |
| Seen.insert(&Instr); |
| |
| if (!AssumeSSA) { |
| // Update dependencies |
| for (SizeT i = 0; i < Instr.getSrcSize(); ++i) { |
| auto *Opnd = Instr.getSrc(i); |
| if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) { |
| Dependency[Var].push_back(&Instr); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| void Cfg::loopInvariantCodeMotion() { |
| TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this); |
| // Does not introduce new nodes as of now. |
| for (auto &Loop : LoopInfo) { |
| CfgNode *Header = Loop.Header; |
| assert(Header); |
| if (Header->getLoopNestDepth() < 1) |
| return; |
| CfgNode *PreHeader = Loop.PreHeader; |
| if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) { |
| return; // try next loop |
| } |
| |
| auto &Insts = PreHeader->getInsts(); |
| auto &LastInst = Insts.back(); |
| Insts.pop_back(); |
| |
| for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) { |
| PreHeader->appendInst(Inst); |
| } |
| PreHeader->appendInst(&LastInst); |
| } |
| } |
| |
| CfgVector<Inst *> |
| Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) { |
| CfgUnorderedSet<Inst *> InvariantInsts; |
| CfgUnorderedSet<Variable *> InvariantVars; |
| for (auto *Var : getArgs()) { |
| InvariantVars.insert(Var); |
| } |
| bool Changed = false; |
| do { |
| Changed = false; |
| for (auto NodeIndex : Body) { |
| auto *Node = Nodes[NodeIndex]; |
| CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(), |
| Node->getInsts().end()); |
| |
| for (auto &InstRef : Insts) { |
| auto &Inst = InstRef.get(); |
| if (Inst.isDeleted() || |
| InvariantInsts.find(&Inst) != InvariantInsts.end()) |
| continue; |
| switch (Inst.getKind()) { |
| case Inst::InstKind::Alloca: |
| case Inst::InstKind::Br: |
| case Inst::InstKind::Ret: |
| case Inst::InstKind::Phi: |
| case Inst::InstKind::Call: |
| case Inst::InstKind::Intrinsic: |
| case Inst::InstKind::Load: |
| case Inst::InstKind::Store: |
| case Inst::InstKind::Switch: |
| continue; |
| default: |
| break; |
| } |
| |
| bool IsInvariant = true; |
| for (SizeT i = 0; i < Inst.getSrcSize(); ++i) { |
| if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) { |
| if (InvariantVars.find(Var) == InvariantVars.end()) { |
| IsInvariant = false; |
| } |
| } |
| } |
| if (IsInvariant) { |
| Changed = true; |
| InvariantInsts.insert(&Inst); |
| Node->getInsts().remove(Inst); |
| if (Inst.getDest() != nullptr) { |
| InvariantVars.insert(Inst.getDest()); |
| } |
| } |
| } |
| } |
| } while (Changed); |
| |
| CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end()); |
| std::sort(InstVector.begin(), InstVector.end(), |
| [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); }); |
| return InstVector; |
| } |
| |
| void Cfg::shortCircuitJumps() { |
| // Split Nodes whenever an early jump is possible. |
| // __N : |
| // a = <something> |
| // Instruction 1 without side effect |
| // ... b = <something> ... |
| // Instruction N without side effect |
| // t1 = or a b |
| // br t1 __X __Y |
| // |
| // is transformed into: |
| // __N : |
| // a = <something> |
| // br a __X __N_ext |
| // |
| // __N_ext : |
| // Instruction 1 without side effect |
| // ... b = <something> ... |
| // Instruction N without side effect |
| // br b __X __Y |
| // (Similar logic for AND, jump to false instead of true target.) |
| |
| TimerMarker T(TimerStack::TT_shortCircuit, this); |
| getVMetadata()->init(VMK_Uses); |
| auto NodeStack = this->getNodes(); |
| CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits; |
| while (!NodeStack.empty()) { |
| auto *Node = NodeStack.back(); |
| NodeStack.pop_back(); |
| auto NewNode = Node->shortCircuit(); |
| if (NewNode) { |
| NodeStack.push_back(NewNode); |
| NodeStack.push_back(Node); |
| Splits[Node->getIndex()].push_back(NewNode); |
| } |
| } |
| |
| // Insert nodes in the right place |
| NodeList NewList; |
| NewList.reserve(Nodes.size()); |
| CfgUnorderedSet<SizeT> Inserted; |
| for (auto *Node : Nodes) { |
| if (Inserted.find(Node->getIndex()) != Inserted.end()) |
| continue; // already inserted |
| NodeList Stack{Node}; |
| while (!Stack.empty()) { |
| auto *Current = Stack.back(); |
| Stack.pop_back(); |
| Inserted.insert(Current->getIndex()); |
| NewList.push_back(Current); |
| for (auto *Next : Splits[Current->getIndex()]) { |
| Stack.push_back(Next); |
| } |
| } |
| } |
| |
| SizeT NodeIndex = 0; |
| for (auto *Node : NewList) { |
| Node->resetIndex(NodeIndex++); |
| } |
| Nodes = NewList; |
| } |
| |
| void Cfg::floatConstantCSE() { |
| // Load multiple uses of a floating point constant (between two call |
| // instructions or block start/end) into a variable before its first use. |
| // t1 = b + 1.0 |
| // t2 = c + 1.0 |
| // Gets transformed to: |
| // t0 = 1.0 |
| // t0_1 = t0 |
| // t1 = b + t0_1 |
| // t2 = c + t0_1 |
| // Call instructions reset the procedure, but use the same variable, just in |
| // case it got a register. We are assuming floating point registers are not |
| // callee saved in general. Example, continuing from before: |
| // result = call <some function> |
| // t3 = d + 1.0 |
| // Gets transformed to: |
| // result = call <some function> |
| // t0_2 = t0 |
| // t3 = d + t0_2 |
| // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot of |
| // 1.0. When t0 does not get a register, introducing an extra assignment |
| // statement does not make sense. The relevant portion is marked below. |
| |
| TimerMarker _(TimerStack::TT_floatConstantCse, this); |
| for (CfgNode *Node : getNodes()) { |
| |
| CfgUnorderedMap<Constant *, Variable *> ConstCache; |
| auto Current = Node->getInsts().begin(); |
| auto End = Node->getInsts().end(); |
| while (Current != End) { |
| CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses; |
| if (llvm::isa<InstCall>(iteratorToInst(Current))) { |
| ++Current; |
| assert(Current != End); |
| // Block should not end with a call |
| } |
| while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) { |
| if (!Current->isDeleted()) { |
| for (SizeT i = 0; i < Current->getSrcSize(); ++i) { |
| if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) { |
| if (Const->getType() == IceType_f32 || |
| Const->getType() == IceType_f64) { |
| FloatUses[Const].push_back(Current); |
| } |
| } |
| } |
| } |
| ++Current; |
| } |
| for (auto &Pair : FloatUses) { |
| static constexpr SizeT MinUseThreshold = 3; |
| if (Pair.second.size() < MinUseThreshold) |
| continue; |
| // Only consider constants with at least `MinUseThreshold` uses |
| auto &Insts = Node->getInsts(); |
| |
| if (ConstCache.find(Pair.first) == ConstCache.end()) { |
| // Saw a constant (which is used at least twice) for the first time |
| auto *NewVar = makeVariable(Pair.first->getType()); |
| // NewVar->setLinkedTo(Pair.first); |
| // TODO(manasijm): Plumbing for linking to an Operand. |
| auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first); |
| Insts.insert(Pair.second[0], Assign); |
| ConstCache[Pair.first] = NewVar; |
| } |
| |
| auto *NewVar = makeVariable(Pair.first->getType()); |
| NewVar->setLinkedTo(ConstCache[Pair.first]); |
| auto *Assign = |
| InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]); |
| |
| Insts.insert(Pair.second[0], Assign); |
| for (auto InstUse : Pair.second) { |
| for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) { |
| if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) { |
| if (Const == Pair.first) { |
| InstUse->replaceSource(i, NewVar); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| void Cfg::doArgLowering() { |
| TimerMarker T(TimerStack::TT_doArgLowering, this); |
| getTarget()->lowerArguments(); |
| } |
| |
| void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas, |
| uint32_t CombinedAlignment, InstList &Insts, |
| AllocaBaseVariableType BaseVariableType) { |
| if (Allocas.empty()) |
| return; |
| // Sort by decreasing alignment. |
| std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) { |
| uint32_t Align1 = A1->getAlignInBytes(); |
| uint32_t Align2 = A2->getAlignInBytes(); |
| if (Align1 == Align2) |
| return A1->getNumber() < A2->getNumber(); |
| else |
| return Align1 > Align2; |
| }); |
| // Process the allocas in order of decreasing stack alignment. This allows |
| // us to pack less-aligned pieces after more-aligned ones, resulting in less |
| // stack growth. It also allows there to be at most one stack alignment "and" |
| // instruction for a whole list of allocas. |
| uint32_t CurrentOffset = 0; |
| CfgVector<int32_t> Offsets; |
| for (Inst *Instr : Allocas) { |
| auto *Alloca = llvm::cast<InstAlloca>(Instr); |
| // Adjust the size of the allocation up to the next multiple of the |
| // object's alignment. |
| uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u); |
| auto *ConstSize = |
| llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes()); |
| uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment); |
| if (BaseVariableType == BVT_FramePointer) { |
| // Addressing is relative to the frame pointer. Subtract the offset after |
| // adding the size of the alloca, because it grows downwards from the |
| // frame pointer. |
| Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size)); |
| } else { |
| // Addressing is relative to the stack pointer or to a user pointer. Add |
| // the offset before adding the size of the object, because it grows |
| // upwards from the stack pointer. In addition, if the addressing is |
| // relative to the stack pointer, we need to add the pre-computed max out |
| // args size bytes. |
| const uint32_t OutArgsOffsetOrZero = |
| (BaseVariableType == BVT_StackPointer) |
| ? getTarget()->maxOutArgsSizeBytes() |
| : 0; |
| Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero); |
| } |
| // Update the running offset of the fused alloca region. |
| CurrentOffset += Size; |
| } |
| // Round the offset up to the alignment granularity to use as the size. |
| uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment); |
| // Ensure every alloca was assigned an offset. |
| assert(Allocas.size() == Offsets.size()); |
| |
| switch (BaseVariableType) { |
| case BVT_UserPointer: { |
| Variable *BaseVariable = makeVariable(IceType_i32); |
| for (SizeT i = 0; i < Allocas.size(); ++i) { |
| auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]); |
| // Emit a new addition operation to replace the alloca. |
| Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]); |
| InstArithmetic *Add = |
| InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(), |
| BaseVariable, AllocaOffset); |
| Insts.push_front(Add); |
| Alloca->setDeleted(); |
| } |
| Operand *AllocaSize = Ctx->getConstantInt32(TotalSize); |
| InstAlloca *CombinedAlloca = |
| InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment); |
| CombinedAlloca->setKnownFrameOffset(); |
| Insts.push_front(CombinedAlloca); |
| } break; |
| case BVT_StackPointer: |
| case BVT_FramePointer: { |
| for (SizeT i = 0; i < Allocas.size(); ++i) { |
| auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]); |
| // Emit a fake definition of the rematerializable variable. |
| Variable *Dest = Alloca->getDest(); |
| auto *Def = InstFakeDef::create(this, Dest); |
| if (BaseVariableType == BVT_StackPointer) |
| Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]); |
| else |
| Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]); |
| Insts.push_front(Def); |
| Alloca->setDeleted(); |
| } |
| // Allocate the fixed area in the function prolog. |
| getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment); |
| } break; |
| } |
| } |
| |
| void Cfg::processAllocas(bool SortAndCombine) { |
| TimerMarker _(TimerStack::TT_alloca, this); |
| const uint32_t StackAlignment = getTarget()->getStackAlignment(); |
| CfgNode *EntryNode = getEntryNode(); |
| assert(EntryNode); |
| // LLVM enforces power of 2 alignment. |
| assert(llvm::isPowerOf2_32(StackAlignment)); |
| // If the ABI's stack alignment is smaller than the vector size (16 bytes), |
| // conservatively use a frame pointer to allow for explicit alignment of the |
| // stack pointer. This needs to happen before register allocation so the frame |
| // pointer can be reserved. |
| if (getTarget()->needsStackPointerAlignment()) { |
| getTarget()->setHasFramePointer(); |
| } |
| // Determine if there are large alignment allocations in the entry block or |
| // dynamic allocations (variable size in the entry block). |
| bool HasLargeAlignment = false; |
| bool HasDynamicAllocation = false; |
| for (Inst &Instr : EntryNode->getInsts()) { |
| if (Instr.isDeleted()) |
| continue; |
| if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) { |
| uint32_t AlignmentParam = Alloca->getAlignInBytes(); |
| if (AlignmentParam > StackAlignment) |
| HasLargeAlignment = true; |
| if (llvm::isa<Constant>(Alloca->getSizeInBytes())) |
| Alloca->setKnownFrameOffset(); |
| else { |
| HasDynamicAllocation = true; |
| // If Allocas are not sorted, the first dynamic allocation causes |
| // later Allocas to be at unknown offsets relative to the stack/frame. |
| if (!SortAndCombine) |
| break; |
| } |
| } |
| } |
| // Don't do the heavyweight sorting and layout for low optimization levels. |
| if (!SortAndCombine) |
| return; |
| // Any alloca outside the entry block is a dynamic allocation. |
| for (CfgNode *Node : Nodes) { |
| if (Node == EntryNode) |
| continue; |
| for (Inst &Instr : Node->getInsts()) { |
| if (Instr.isDeleted()) |
| continue; |
| if (llvm::isa<InstAlloca>(&Instr)) { |
| // Allocations outside the entry block require a frame pointer. |
| HasDynamicAllocation = true; |
| break; |
| } |
| } |
| if (HasDynamicAllocation) |
| break; |
| } |
| // Mark the target as requiring a frame pointer. |
| if (HasLargeAlignment || HasDynamicAllocation) |
| getTarget()->setHasFramePointer(); |
| // Collect the Allocas into the two vectors. |
| // Allocas in the entry block that have constant size and alignment less |
| // than or equal to the function's stack alignment. |
| CfgVector<InstAlloca *> FixedAllocas; |
| // Allocas in the entry block that have constant size and alignment greater |
| // than the function's stack alignment. |
| CfgVector<InstAlloca *> AlignedAllocas; |
| // Maximum alignment used by any alloca. |
| uint32_t MaxAlignment = StackAlignment; |
| for (Inst &Instr : EntryNode->getInsts()) { |
| if (Instr.isDeleted()) |
| continue; |
| if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) { |
| if (!llvm::isa<Constant>(Alloca->getSizeInBytes())) |
| continue; |
| uint32_t AlignmentParam = Alloca->getAlignInBytes(); |
| // For default align=0, set it to the real value 1, to avoid any |
| // bit-manipulation problems below. |
| AlignmentParam = std::max(AlignmentParam, 1u); |
| assert(llvm::isPowerOf2_32(AlignmentParam)); |
| if (HasDynamicAllocation && AlignmentParam > StackAlignment) { |
| // If we have both dynamic allocations and large stack alignments, |
| // high-alignment allocations are pulled out with their own base. |
| AlignedAllocas.push_back(Alloca); |
| } else { |
| FixedAllocas.push_back(Alloca); |
| } |
| MaxAlignment = std::max(AlignmentParam, MaxAlignment); |
| } |
| } |
| // Add instructions to the head of the entry block in reverse order. |
| InstList &Insts = getEntryNode()->getInsts(); |
| if (HasDynamicAllocation && HasLargeAlignment) { |
| // We are using a frame pointer, but fixed large-alignment alloca addresses |
| // do not have a known offset from either the stack or frame pointer. |
| // They grow up from a user pointer from an alloca. |
| sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer); |
| // Fixed size allocas are addressed relative to the frame pointer. |
| sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts, |
| BVT_FramePointer); |
| } else { |
| // Otherwise, fixed size allocas are addressed relative to the stack unless |
| // there are dynamic allocas. |
| const AllocaBaseVariableType BasePointerType = |
| (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer); |
| sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType); |
| } |
| if (!FixedAllocas.empty() || !AlignedAllocas.empty()) |
| // No use calling findRematerializable() unless there is some |
| // rematerializable alloca instruction to seed it. |
| findRematerializable(); |
| } |
| |
| namespace { |
| |
| // Helpers for findRematerializable(). For each of them, if a suitable |
| // rematerialization is found, the instruction's Dest variable is set to be |
| // rematerializable and it returns true, otherwise it returns false. |
| |
| bool rematerializeArithmetic(const Inst *Instr) { |
| // Check that it's an Arithmetic instruction with an Add operation. |
| auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr); |
| if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add) |
| return false; |
| // Check that Src(0) is rematerializable. |
| auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0)); |
| if (Src0Var == nullptr || !Src0Var->isRematerializable()) |
| return false; |
| // Check that Src(1) is an immediate. |
| auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1)); |
| if (Src1Imm == nullptr) |
| return false; |
| Arith->getDest()->setRematerializable( |
| Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue()); |
| return true; |
| } |
| |
| bool rematerializeAssign(const Inst *Instr) { |
| // An InstAssign only originates from an inttoptr or ptrtoint instruction, |
| // which never occurs in a MINIMAL build. |
| if (BuildDefs::minimal()) |
| return false; |
| // Check that it's an Assign instruction. |
| if (!llvm::isa<InstAssign>(Instr)) |
| return false; |
| // Check that Src(0) is rematerializable. |
| auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0)); |
| if (Src0Var == nullptr || !Src0Var->isRematerializable()) |
| return false; |
| Instr->getDest()->setRematerializable(Src0Var->getRegNum(), |
| Src0Var->getStackOffset()); |
| return true; |
| } |
| |
| bool rematerializeCast(const Inst *Instr) { |
| // An pointer-type bitcast never occurs in a MINIMAL build. |
| if (BuildDefs::minimal()) |
| return false; |
| // Check that it's a Cast instruction with a Bitcast operation. |
| auto *Cast = llvm::dyn_cast<InstCast>(Instr); |
| if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast) |
| return false; |
| // Check that Src(0) is rematerializable. |
| auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0)); |
| if (Src0Var == nullptr || !Src0Var->isRematerializable()) |
| return false; |
| // Check that Dest and Src(0) have the same type. |
| Variable *Dest = Cast->getDest(); |
| if (Dest->getType() != Src0Var->getType()) |
| return false; |
| Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset()); |
| return true; |
| } |
| |
| } // end of anonymous namespace |
| |
| /// Scan the function to find additional rematerializable variables. This is |
| /// possible when the source operand of an InstAssignment is a rematerializable |
| /// variable, or the same for a pointer-type InstCast::Bitcast, or when an |
| /// InstArithmetic is an add of a rematerializable variable and an immediate. |
| /// Note that InstAssignment instructions and pointer-type InstCast::Bitcast |
| /// instructions generally only come about from the IceConverter's treatment of |
| /// inttoptr, ptrtoint, and bitcast instructions. TODO(stichnot): Consider |
| /// other possibilities, however unlikely, such as InstArithmetic::Sub, or |
| /// commutativity. |
| void Cfg::findRematerializable() { |
| // Scan the instructions in order, and repeat until no new opportunities are |
| // found. It may take more than one iteration because a variable's defining |
| // block may happen to come after a block where it is used, depending on the |
| // CfgNode linearization order. |
| bool FoundNewAssignment; |
| do { |
| FoundNewAssignment = false; |
| for (CfgNode *Node : getNodes()) { |
| // No need to process Phi instructions. |
| for (Inst &Instr : Node->getInsts()) { |
| if (Instr.isDeleted()) |
| continue; |
| Variable *Dest = Instr.getDest(); |
| if (Dest == nullptr || Dest->isRematerializable()) |
| continue; |
| if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) || |
| rematerializeCast(&Instr)) { |
| FoundNewAssignment = true; |
| } |
| } |
| } |
| } while (FoundNewAssignment); |
| } |
| |
| void Cfg::doAddressOpt() { |
| TimerMarker T(TimerStack::TT_doAddressOpt, this); |
| for (CfgNode *Node : Nodes) |
| Node->doAddressOpt(); |
| } |
| |
| namespace { |
| // ShuffleVectorUtils implements helper functions for rematerializing |
| // shufflevector instructions from a sequence of extractelement/insertelement |
| // instructions. It looks for the following pattern: |
| // |
| // %t0 = extractelement A, %n0 |
| // %t1 = extractelement B, %n1 |
| // %t2 = extractelement C, %n2 |
| // ... |
| // %tN = extractelement N, %nN |
| // %d0 = insertelement undef, %t0, 0 |
| // %d1 = insertelement %d0, %t1, 1 |
| // %d2 = insertelement %d1, %t2, 2 |
| // ... |
| // %dest = insertelement %d_N-1, %tN, N |
| // |
| // where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two |
| // distinct variables. |
| namespace ShuffleVectorUtils { |
| // findAllInserts is used when searching for all the insertelements that are |
| // used in a shufflevector operation. This function works recursively, when |
| // invoked with I = i, the function assumes Insts[i] is the last found |
| // insertelement in the chain. The next insertelement insertruction is saved in |
| // Insts[i+1]. |
| bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM, |
| CfgVector<const Inst *> *Insts, SizeT I = 0) { |
| const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat); |
| |
| if (I > Insts->size()) { |
| if (Verbose) { |
| Ctx->getStrDump() << "\tToo many inserts.\n"; |
| } |
| return false; |
| } |
| |
| const auto *LastInsert = Insts->at(I); |
| assert(llvm::isa<InstInsertElement>(LastInsert)); |
| |
| if (I == Insts->size() - 1) { |
| // Matching against undef is not really needed because the value in Src(0) |
| // will be totally overwritten. We still enforce it anyways because the |
| // PNaCl toolchain generates the bitcode with it. |
| if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) { |
| if (Verbose) { |
| Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " " |
| << Insts->size(); |
| LastInsert->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| // The following loop ensures that the insertelements are sorted. In theory, |
| // we could relax this restriction and allow any order. As long as each |
| // index appears exactly once, this chain is still a candidate for becoming |
| // a shufflevector. The Insts vector is traversed backwards because the |
| // instructions are "enqueued" in reverse order. |
| int32_t ExpectedElement = 0; |
| for (const auto *I : reverse_range(*Insts)) { |
| if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() != |
| ExpectedElement) { |
| return false; |
| } |
| ++ExpectedElement; |
| } |
| return true; |
| } |
| |
| const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0)); |
| const auto *Def = VM->getSingleDefinition(Src0V); |
| |
| // Only optimize if the first operand in |
| // |
| // Dest = insertelement A, B, 10 |
| // |
| // is singly-def'ed. |
| if (Def == nullptr) { |
| if (Verbose) { |
| Ctx->getStrDump() << "\tmulti-def: "; |
| (*Insts)[I]->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| // We also require the (single) definition to come from an insertelement |
| // instruction. |
| if (!llvm::isa<InstInsertElement>(Def)) { |
| if (Verbose) { |
| Ctx->getStrDump() << "\tnot insert element: "; |
| Def->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| // Everything seems fine, so we save Def in Insts, and delegate the decision |
| // to findAllInserts. |
| (*Insts)[I + 1] = Def; |
| |
| return findAllInserts(Func, Ctx, VM, Insts, I + 1); |
| } |
| |
| // insertsLastElement returns true if Insert is inserting an element in the last |
| // position of a vector. |
| bool insertsLastElement(const Inst &Insert) { |
| const Type DestTy = Insert.getDest()->getType(); |
| assert(isVectorType(DestTy)); |
| const SizeT Elem = |
| llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue(); |
| return Elem == typeNumElements(DestTy) - 1; |
| } |
| |
| // findAllExtracts goes over all the insertelement instructions that are |
| // candidates to be replaced by a shufflevector, and searches for all the |
| // definitions of the elements being inserted. If all of the elements are the |
| // result of an extractelement instruction, and all of the extractelements |
| // operate on at most two different sources, than the instructions can be |
| // replaced by a shufflevector. |
| bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM, |
| const CfgVector<const Inst *> &Insts, Variable **Src0, |
| Variable **Src1, CfgVector<const Inst *> *Extracts) { |
| const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat); |
| |
| *Src0 = nullptr; |
| *Src1 = nullptr; |
| assert(Insts.size() > 0); |
| for (SizeT I = 0; I < Insts.size(); ++I) { |
| const auto *Insert = Insts.at(I); |
| const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1)); |
| if (Src1V == nullptr) { |
| if (Verbose) { |
| Ctx->getStrDump() << "src(1) is not a variable: "; |
| Insert->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| const auto *Def = VM->getSingleDefinition(Src1V); |
| if (Def == nullptr) { |
| if (Verbose) { |
| Ctx->getStrDump() << "multi-def src(1): "; |
| Insert->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| if (!llvm::isa<InstExtractElement>(Def)) { |
| if (Verbose) { |
| Ctx->getStrDump() << "not extractelement: "; |
| Def->dump(Func); |
| Ctx->getStrDump() << "\n"; |
| } |
| return false; |
| } |
| |
| auto *Src = llvm::cast<Variable>(Def->getSrc(0)); |
| if (*Src0 == nullptr) { |
| // No sources yet. Save Src to Src0. |
| *Src0 = Src; |
| } else if (*Src1 == nullptr) { |
| // We already have a source, so we might save Src in Src1 -- but only if |
| // Src0 is not Src. |
| if (*Src0 != Src) { |
| *Src1 = Src; |
| } |
| } else if (Src != *Src0 && Src != *Src1) { |
| // More than two sources, so we can't rematerialize the shufflevector |
| // instruction. |
| if (Verbose) { |
| Ctx->getStrDump() << "Can't shuffle more than two sources.\n"; |
| } |
| return false; |
| } |
| |
| (*Extracts)[I] = Def; |
| } |
| |
| // We should have seen at least one source operand. |
| assert(*Src0 != nullptr); |
| |
| // If a second source was not seen, then we just make Src1 = Src0 to simplify |
| // things down stream. This should not matter, as all of the indexes in the |
| // shufflevector instruction will point to Src0. |
| if (*Src1 == nullptr) { |
| *Src1 = *Src0; |
| } |
| |
| return true; |
| } |
| |
| } // end of namespace ShuffleVectorUtils |
| } // end of anonymous namespace |
| |
| void Cfg::materializeVectorShuffles() { |
| const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat); |
| |
| std::unique_ptr<OstreamLocker> L; |
| if (Verbose) { |
| L.reset(new OstreamLocker(getContext())); |
| getContext()->getStrDump() << "\nShuffle materialization:\n"; |
| } |
| |
| // MaxVectorElements is the maximum number of elements in the vector types |
| // handled by Subzero. We use it to create the Inserts and Extracts vectors |
| // with the appropriate size, thus avoiding resize() calls. |
| const SizeT MaxVectorElements = typeNumElements(IceType_v16i8); |
| CfgVector<const Inst *> Inserts(MaxVectorElements); |
| CfgVector<const Inst *> Extracts(MaxVectorElements); |
| |
| TimerMarker T(TimerStack::TT_materializeVectorShuffles, this); |
| for (CfgNode *Node : Nodes) { |
| for (auto &Instr : Node->getInsts()) { |
| if (!llvm::isa<InstInsertElement>(Instr)) { |
| continue; |
| } |
| if (!ShuffleVectorUtils::insertsLastElement(Instr)) { |
| // To avoid wasting time, we only start the pattern match at the last |
| // insertelement instruction -- and go backwards from there. |
| continue; |
| } |
| if (Verbose) { |
| getContext()->getStrDump() << "\tCandidate: "; |
| Instr.dump(this); |
| getContext()->getStrDump() << "\n"; |
| } |
| Inserts.resize(typeNumElements(Instr.getDest()->getType())); |
| Inserts[0] = &Instr; |
| if (!ShuffleVectorUtils::findAllInserts(this, getContext(), |
| VMetadata.get(), &Inserts)) { |
| // If we fail to find a sequence of insertelements, we stop the |
| // optimization. |
| if (Verbose) { |
| getContext()->getStrDump() << "\tFalse alarm.\n"; |
| } |
| continue; |
| } |
| if (Verbose) { |
| getContext()->getStrDump() << "\tFound the following insertelement: \n"; |
| for (auto *I : reverse_range(Inserts)) { |
| getContext()->getStrDump() << "\t\t"; |
| I->dump(this); |
| getContext()->getStrDump() << "\n"; |
| } |
| } |
| Extracts.resize(Inserts.size()); |
| Variable *Src0; |
| Variable *Src1; |
| if (!ShuffleVectorUtils::findAllExtracts(this, getContext(), |
| VMetadata.get(), Inserts, &Src0, |
| &Src1, &Extracts)) { |
| // If we fail to match the definitions of the insertelements' sources |
| // with extractelement instructions -- or if those instructions operate |
| // on more than two different variables -- we stop the optimization. |
| if (Verbose) { |
| getContext()->getStrDump() << "\tFailed to match extractelements.\n"; |
| } |
| continue; |
| } |
| if (Verbose) { |
| getContext()->getStrDump() |
| << "\tFound the following insert/extract element pairs: \n"; |
| for (SizeT I = 0; I < Inserts.size(); ++I) { |
| const SizeT Pos = Inserts.size() - I - 1; |
| getContext()->getStrDump() << "\t\tInsert : "; |
| Inserts[Pos]->dump(this); |
| getContext()->getStrDump() << "\n\t\tExtract: "; |
| Extracts[Pos]->dump(this); |
| getContext()->getStrDump() << "\n"; |
| } |
| } |
| |
| assert(Src0 != nullptr); |
| assert(Src1 != nullptr); |
| |
| auto *ShuffleVector = |
| InstShuffleVector::create(this, Instr.getDest(), Src0, Src1); |
| assert(ShuffleVector->getSrc(0) == Src0); |
| assert(ShuffleVector->getSrc(1) == Src1); |
| for (SizeT I = 0; I < Extracts.size(); ++I) { |
| const SizeT Pos = Extracts.size() - I - 1; |
| auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1)); |
| if (Src0 == Extracts[Pos]->getSrc(0)) { |
| ShuffleVector->addIndex(Index); |
| } else { |
| ShuffleVector->addIndex(llvm::cast<ConstantInteger32>( |
| Ctx->getConstantInt32(Index->getValue() + Extracts.size()))); |
| } |
| } |
| |
| if (Verbose) { |
| getContext()->getStrDump() << "Created: "; |
| ShuffleVector->dump(this); |
| getContext()->getStrDump() << "\n"; |
| } |
| |
| Instr.setDeleted(); |
| auto &LoweringContext = getTarget()->getContext(); |
| LoweringContext.setInsertPoint(instToIterator(&Instr)); |
| LoweringContext.insert(ShuffleVector); |
| } |
| } |
| } |
| |
| void Cfg::genCode() { |
| TimerMarker T(TimerStack::TT_genCode, this); |
| for (CfgNode *Node : Nodes) |
| Node->genCode(); |
| } |
| |
| // Compute the stack frame layout. |
| void Cfg::genFrame() { |
| TimerMarker T(TimerStack::TT_genFrame, this); |
| getTarget()->addProlog(Entry); |
| for (CfgNode *Node : Nodes) |
| if (Node->getHasReturn()) |
| getTarget()->addEpilog(Node); |
| } |
| |
| void Cfg::generateLoopInfo() { |
| TimerMarker T(TimerStack::TT_computeLoopNestDepth, this); |
| LoopInfo = ComputeLoopInfo(this); |
| } |
| |
| // This is a lightweight version of live-range-end calculation. Marks the last |
| // use of only those variables whose definition and uses are completely with a |
| // single block. It is a quick single pass and doesn't need to iterate until |
| // convergence. |
| void Cfg::livenessLightweight() { |
| TimerMarker T(TimerStack::TT_livenessLightweight, this); |
| getVMetadata()->init(VMK_Uses); |
| for (CfgNode *Node : Nodes) |
| Node->livenessLightweight(); |
| } |
| |
| void Cfg::liveness(LivenessMode Mode) { |
| TimerMarker T(TimerStack::TT_liveness, this); |
| // Destroying the previous (if any) Liveness information clears the Liveness |
| // allocator TLS pointer. |
| Live = nullptr; |
| Live = Liveness::create(this, Mode); |
| |
| getVMetadata()->init(VMK_Uses); |
| Live->init(); |
| |
| // Initialize with all nodes needing to be processed. |
| BitVector NeedToProcess(Nodes.size(), true); |
| while (NeedToProcess.any()) { |
| // Iterate in reverse topological order to speed up convergence. |
| for (CfgNode *Node : reverse_range(Nodes)) { |
| if (NeedToProcess[Node->getIndex()]) { |
| NeedToProcess[Node->getIndex()] = false; |
| bool Changed = Node->liveness(getLiveness()); |
| if (Changed) { |
| // If the beginning-of-block liveness changed since the last |
| // iteration, mark all in-edges as needing to be processed. |
| for (CfgNode *Pred : Node->getInEdges()) |
| NeedToProcess[Pred->getIndex()] = true; |
| } |
| } |
| } |
| } |
| if (Mode == Liveness_Intervals) { |
| // Reset each variable's live range. |
| for (Variable *Var : Variables) |
| Var->resetLiveRange(); |
| } |
| // Make a final pass over each node to delete dead instructions, collect the |
| // first and last instruction numbers, and add live range segments for that |
| // node. |
| for (CfgNode *Node : Nodes) { |
| InstNumberT FirstInstNum = Inst::NumberSentinel; |
| InstNumberT LastInstNum = Inst::NumberSentinel; |
| for (Inst &I : Node->getPhis()) { |
| I.deleteIfDead(); |
| if (Mode == Liveness_Intervals && !I.isDeleted()) { |
| if (FirstInstNum == Inst::NumberSentinel) |
| FirstInstNum = I.getNumber(); |
| assert(I.getNumber() > LastInstNum); |
| LastInstNum = I.getNumber(); |
| } |
| } |
| for (Inst &I : Node->getInsts()) { |
| I.deleteIfDead(); |
| if (Mode == Liveness_Intervals && !I.isDeleted()) { |
| if (FirstInstNum == Inst::NumberSentinel) |
| FirstInstNum = I.getNumber(); |
| assert(I.getNumber() > LastInstNum); |
| LastInstNum = I.getNumber(); |
| } |
| } |
| if (Mode == Liveness_Intervals) { |
| // Special treatment for live in-args. Their liveness needs to extend |
| // beyond the beginning of the function, otherwise an arg whose only use |
| // is in the first instruction will end up having the trivial live range |
| // [2,2) and will *not* interfere with other arguments. So if the first |
| // instruction of the method is "r=arg1+arg2", both args may be assigned |
| // the same register. This is accomplished by extending the entry block's |
| // instruction range from [2,n) to [1,n) which will transform the |
| // problematic [2,2) live ranges into [1,2). This extension works because |
| // the entry node is guaranteed to have the lowest instruction numbers. |
| if (Node == getEntryNode()) { |
| FirstInstNum = Inst::NumberExtended; |
| // Just in case the entry node somehow contains no instructions... |
| if (LastInstNum == Inst::NumberSentinel) |
| LastInstNum = FirstInstNum; |
| } |
| // If this node somehow contains no instructions, don't bother trying to |
| // add liveness intervals for it, because variables that are live-in and |
| // live-out will have a bogus interval added. |
| if (FirstInstNum != Inst::NumberSentinel) |
| Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum); |
| } |
| } |
| } |
| |
| // Traverse every Variable of every Inst and verify that it appears within the |
| // Variable's computed live range. |
| bool Cfg::validateLiveness() const { |
| TimerMarker T(TimerStack::TT_validateLiveness, this); |
| bool Valid = true; |
| OstreamLocker L(Ctx); |
| Ostream &Str = Ctx->getStrDump(); |
| for (CfgNode *Node : Nodes) { |
| Inst *FirstInst = nullptr; |
| for (Inst &Instr : Node->getInsts()) { |
| if (Instr.isDeleted()) |
| continue; |
| if (FirstInst == nullptr) |
| FirstInst = &Instr; |
| InstNumberT InstNumber = Instr.getNumber(); |
| if (Variable *Dest = Instr.getDest()) { |
| if (!Dest->getIgnoreLiveness()) { |
| bool Invalid = false; |
| constexpr bool IsDest = true; |
| if (!Dest->getLiveRange().containsValue(InstNumber, IsDest)) |
| Invalid = true; |
| // Check that this instruction actually *begins* Dest's live range, |
| // by checking that Dest is not live in the previous instruction. As |
| // a special exception, we don't check this for the first instruction |
| // of the block, because a Phi temporary may be live at the end of |
| // the previous block, and if it is also assigned in the first |
| // instruction of this block, the adjacent live ranges get merged. |
| if (&Instr != FirstInst && !Instr.isDestRedefined() && |
| Dest->getLiveRange().containsValue(InstNumber - 1, IsDest)) |
| Invalid = true; |
| if (Invalid) { |
| Valid = false; |
| Str << "Liveness error: inst " << Instr.getNumber() << " dest "; |
| Dest->dump(this); |
| Str << " live range " << Dest->getLiveRange() << "\n"; |
| } |
| } |
| } |
| FOREACH_VAR_IN_INST(Var, Instr) { |
| static constexpr bool IsDest = false; |
| if (!Var->getIgnoreLiveness() && |
| !Var->getLiveRange().containsValue(InstNumber, IsDest)) { |
| Valid = false; |
| Str << "Liveness error: inst " << Instr.getNumber() << " var "; |
| Var->dump(this); |
| Str << " live range " << Var->getLiveRange() << "\n"; |
| } |
| } |
| } |
| } |
| return Valid; |
| } |
| |
| void Cfg::contractEmptyNodes() { |
| // If we're decorating the asm output with register liveness info, this |
| // information may become corrupted or incorrect after contracting nodes that |
| // contain only redundant assignments. As such, we disable this pass when |
| // DecorateAsm is specified. This may make the resulting code look more |
| // branchy, but it should have no effect on the register assignments. |
| if (getFlags().getDecorateAsm()) |
| return; |
| for (CfgNode *Node : Nodes) { |
| Node->contractIfEmpty(); |
| } |
| } |
| |
| void Cfg::doBranchOpt() { |
| TimerMarker T(TimerStack::TT_doBranchOpt, this); |
| for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { |
| auto NextNode = I + 1; |
| (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode); |
| } |
| } |
| |
| // ======================== Dump routines ======================== // |
| |
| // emitTextHeader() is not target-specific (apart from what is abstracted by |
| // the Assembler), so it is defined here rather than in the target lowering |
| // class. |
| void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx, |
| const Assembler *Asm) { |
| if (!BuildDefs::dump()) |
| return; |
| Ostream &Str = Ctx->getStrEmit(); |
| Str << "\t.text\n"; |
| if (getFlags().getFunctionSections()) |
| Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n"; |
| if (!Asm->getInternal() || getFlags().getDisableInternal()) { |
| Str << "\t.globl\t" << Name << "\n"; |
| Str << "\t.type\t" << Name << ",%function\n"; |
| } |
| Str << "\t" << Asm->getAlignDirective() << " " |
| << Asm->getBundleAlignLog2Bytes() << ",0x"; |
| for (uint8_t I : Asm->getNonExecBundlePadding()) |
| Str.write_hex(I); |
| Str << "\n"; |
| Str << Name << ":\n"; |
| } |
| |
| void Cfg::emitJumpTables() { |
| switch (getFlags().getOutFileType()) { |
| case FT_Elf: |
| case FT_Iasm: { |
| // The emission needs to be delayed until the after the text section so |
| // save the offsets in the global context. |
| for (const InstJumpTable *JumpTable : JumpTables) { |
| Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler())); |
| } |
| } break; |
| case FT_Asm: { |
| // Emit the assembly directly so we don't need to hang on to all the names |
| for (const InstJumpTable *JumpTable : JumpTables) |
| getTarget()->emitJumpTable(this, JumpTable); |
| } break; |
| } |
| } |
| |
| void Cfg::emit() { |
| if (!BuildDefs::dump()) |
| return; |
| TimerMarker T(TimerStack::TT_emitAsm, this); |
| if (getFlags().getDecorateAsm()) { |
| renumberInstructions(); |
| getVMetadata()->init(VMK_Uses); |
| liveness(Liveness_Basic); |
| dump("After recomputing liveness for -decorate-asm"); |
| } |
| OstreamLocker L(Ctx); |
| Ostream &Str = Ctx->getStrEmit(); |
| const Assembler *Asm = getAssembler<>(); |
| |
| emitTextHeader(FunctionName, Ctx, Asm); |
| if (getFlags().getDecorateAsm()) { |
| for (Variable *Var : getVariables()) { |
| if (Var->hasKnownStackOffset() && !Var->isRematerializable()) { |
| Str << "\t" << Var->getSymbolicStackOffset() << " = " |
| << Var->getStackOffset() << "\n"; |
| } |
| } |
| } |
| for (CfgNode *Node : Nodes) { |
| Node->emit(this); |
| } |
| emitJumpTables(); |
| Str << "\n"; |
| } |
| |
| void Cfg::emitIAS() { |
| TimerMarker T(TimerStack::TT_emitAsm, this); |
| // The emitIAS() routines emit into the internal assembler buffer, so there's |
| // no need to lock the streams. |
| for (CfgNode *Node : Nodes) { |
| Node->emitIAS(this); |
| } |
| emitJumpTables(); |
| } |
| |
| size_t Cfg::getTotalMemoryMB() const { |
| constexpr size_t _1MB = 1024 * 1024; |
| assert(Allocator != nullptr); |
| assert(CfgAllocatorTraits::current() == Allocator.get()); |
| return Allocator->getTotalMemory() / _1MB; |
| } |
| |
| size_t Cfg::getLivenessMemoryMB() const { |
| constexpr size_t _1MB = 1024 * 1024; |
| if (Live == nullptr) { |
| return 0; |
| } |
| return Live->getAllocator()->getTotalMemory() / _1MB; |
| } |
| |
| // Dumps the IR with an optional introductory message. |
| void Cfg::dump(const char *Message) { |
| if (!BuildDefs::dump()) |
| return; |
| if (!isVerbose()) |
| return; |
| OstreamLocker L(Ctx); |
| Ostream &Str = Ctx->getStrDump(); |
| if (Message[0]) |
| Str << "================ " << Message << " ================\n"; |
| if (isVerbose(IceV_Mem)) { |
| Str << "Memory size = " << getTotalMemoryMB() << " MB\n"; |
| } |
| setCurrentNode(getEntryNode()); |
| // Print function name+args |
| if (isVerbose(IceV_Instructions)) { |
| Str << "define "; |
| if (getInternal() && !getFlags().getDisableInternal()) |
| Str << "internal "; |
| Str << ReturnType << " @" << getFunctionName() << "("; |
| for (SizeT i = 0; i < Args.size(); ++i) { |
| if (i > 0) |
| Str << ", "; |
| Str << Args[i]->getType() << " "; |
| Args[i]->dump(this); |
| } |
| // Append an extra copy of the function name here, in order to print its |
| // size stats but not mess up lit tests. |
| Str << ") { # " << getFunctionNameAndSize() << "\n"; |
| } |
| resetCurrentNode(); |
| if (isVerbose(IceV_Liveness)) { |
| // Print summary info about variables |
| for (Variable *Var : Variables) { |
| Str << "// multiblock="; |
| if (getVMetadata()->isTracked(Var)) |
| Str << getVMetadata()->isMultiBlock(Var); |
| else |
| Str << "?"; |
| Str << " defs="; |
| bool FirstPrint = true; |
| if (VMetadata->getKind() != VMK_Uses) { |
| if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) { |
| Str << FirstDef->getNumber(); |
| FirstPrint = false; |
| } |
| } |
| if (VMetadata->getKind() == VMK_All) { |
| for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) { |
| if (!FirstPrint) |
| Str << ","; |
| Str << Instr->getNumber(); |
| FirstPrint = false; |
| } |
| } |
| Str << " weight=" << Var->getWeight(this) << " "; |
| Var->dump(this); |
| Str << " LIVE=" << Var->getLiveRange() << "\n"; |
| } |
| } |
| // Print each basic block |
| for (CfgNode *Node : Nodes) |
| Node->dump(this); |
| if (isVerbose(IceV_Instructions)) |
| Str << "}\n"; |
| } |
| |
| } // end of namespace Ice |