third_party/subzero/src/IceCfg.cpp - SwiftShader - Git at Google

 //===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===//
 //
 //                        The Subzero Code Generator
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// \brief Implements the Cfg class.
 ///
 //===----------------------------------------------------------------------===//

 #include "IceCfg.h"

 #include "IceAssembler.h"
 #include "IceBitVector.h"
 #include "IceCfgNode.h"
 #include "IceClFlags.h"
 #include "IceDefs.h"
 #include "IceELFObjectWriter.h"
 #include "IceGlobalInits.h"
 #include "IceInst.h"
 #include "IceInstVarIter.h"
 #include "IceInstrumentation.h"
 #include "IceLiveness.h"
 #include "IceLoopAnalyzer.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"

 #include <memory>
 #include <utility>

 namespace Ice {

 Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
     : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
       VMask(getFlags().getVerbose()), FunctionName(),
       NextInstNumber(Inst::NumberInitial), Live(nullptr) {
   NodeStrings.reset(new StringPool);
   VarStrings.reset(new StringPool);
   CfgLocalAllocatorScope _(this);
   Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
   VMetadata.reset(new VariablesMetadata(this));
   TargetAssembler = Target->createAssembler();

   if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Randomize) {
     // If -randomize-pool-immediates=randomize, create a random number
     // generator to generate a cookie for constant blinding.
     RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_ConstantBlinding,
                               this->SequenceNumber);
     ConstantBlindingCookie =
         (uint32_t)RNG.next((uint64_t)std::numeric_limits<uint32_t>::max() + 1);
   }
 }

 Cfg::~Cfg() {
   assert(CfgAllocatorTraits::current() == nullptr);
   if (getFlags().getDumpStrings()) {
     OstreamLocker _(Ctx);
     Ostream &Str = Ctx->getStrDump();
     getNodeStrings()->dump(Str);
     getVarStrings()->dump(Str);
   }
 }

 // Called in the initalizer list of Cfg's constructor to create the Allocator
 // and set it as TLS before any other member fields are constructed, since they
 // may depend on it.
 ArenaAllocator *Cfg::createAllocator() {
   ArenaAllocator *Allocator = new ArenaAllocator();
   CfgAllocatorTraits::set_current(Allocator);
   return Allocator;
 }

 /// Create a string like "foo(i=123:b=9)" indicating the function name, number
 /// of high-level instructions, and number of basic blocks.  This string is only
 /// used for dumping and other diagnostics, and the idea is that given a set of
 /// functions to debug a problem on, it's easy to find the smallest or simplest
 /// function to attack.  Note that the counts may change somewhat depending on
 /// what point it is called during the translation passes.
 std::string Cfg::getFunctionNameAndSize() const {
   if (!BuildDefs::dump())
     return getFunctionName().toString();
   SizeT NodeCount = 0;
   SizeT InstCount = 0;
   for (CfgNode *Node : getNodes()) {
     ++NodeCount;
     // Note: deleted instructions are *not* ignored.
     InstCount += Node->getPhis().size();
     for (Inst &I : Node->getInsts()) {
       if (!llvm::isa<InstTarget>(&I))
         ++InstCount;
     }
   }
   return getFunctionName() + "(i=" + std::to_string(InstCount) + ":b=" +
          std::to_string(NodeCount) + ")";
 }

 void Cfg::setError(const std::string &Message) {
   HasError = true;
   ErrorMessage = Message;
 }

 CfgNode *Cfg::makeNode() {
   SizeT LabelIndex = Nodes.size();
   auto *Node = CfgNode::create(this, LabelIndex);
   Nodes.push_back(Node);
   return Node;
 }

 void Cfg::swapNodes(NodeList &NewNodes) {
   assert(Nodes.size() == NewNodes.size());
   Nodes.swap(NewNodes);
   for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
     Nodes[I]->resetIndex(I);
 }

 template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
   SizeT Index = Variables.size();
   Variable *Var;
   if (Target->shouldSplitToVariableVecOn32(Ty)) {
     Var = VariableVecOn32::create(this, Ty, Index);
   } else if (Target->shouldSplitToVariable64On32(Ty)) {
     Var = Variable64On32::create(this, Ty, Index);
   } else {
     Var = Variable::create(this, Ty, Index);
   }
   Variables.push_back(Var);
   return Var;
 }

 void Cfg::addArg(Variable *Arg) {
   Arg->setIsArg();
   Args.push_back(Arg);
 }

 void Cfg::addImplicitArg(Variable *Arg) {
   Arg->setIsImplicitArg();
   ImplicitArgs.push_back(Arg);
 }

 // Returns whether the stack frame layout has been computed yet. This is used
 // for dumping the stack frame location of Variables.
 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }

 namespace {
 constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
 constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
 } // end of anonymous namespace

 void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
   auto *Var = VariableDeclaration::create(GlobalInits.get());
   Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
   Var->setIsConstant(true);
   Var->addInitializer(VariableDeclaration::DataInitializer::create(
       GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
   Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
   GlobalInits->push_back(Var);
 }

 void Cfg::createBlockProfilingInfoDeclaration(
     const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
   auto *Var = VariableDeclaration::create(GlobalInits.get());
   Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
   Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
       GlobalInits.get(), Int64ByteSize));

   const RelocOffsetT NodeNameDeclarationOffset = 0;
   Var->addInitializer(VariableDeclaration::RelocInitializer::create(
       GlobalInits.get(), NodeNameDeclaration,
       {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
   Var->setAlignment(Int64ByteSize);
   GlobalInits->push_back(Var);
 }

 void Cfg::profileBlocks() {
   if (GlobalInits == nullptr)
     GlobalInits.reset(new VariableDeclarationList());

   for (CfgNode *Node : Nodes) {
     const std::string NodeAsmName = Node->getAsmName();
     createNodeNameDeclaration(NodeAsmName);
     createBlockProfilingInfoDeclaration(NodeAsmName, GlobalInits->back());
     Node->profileExecutionCount(GlobalInits->back());
   }
 }

 bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
   if (!Var.getName().hasStdString())
     return false;
   return Var.getName().toString().find(BlockStatsGlobalPrefix) == 0;
 }

 void Cfg::addCallToProfileSummary() {
   // The call(s) to __Sz_profile_summary are added by the profiler in functions
   // that cause the program to exit. This function is defined in
   // runtime/szrt_profiler.c.
   Constant *ProfileSummarySym =
       Ctx->getConstantExternSym(Ctx->getGlobalString("__Sz_profile_summary"));
   constexpr SizeT NumArgs = 0;
   constexpr Variable *Void = nullptr;
   constexpr bool HasTailCall = false;
   auto *Call =
       InstCall::create(this, NumArgs, Void, ProfileSummarySym, HasTailCall);
   getEntryNode()->getInsts().push_front(Call);
 }

 void Cfg::translate() {
   if (hasError())
     return;
   // Cache the possibly-overridden optimization level once translation begins.
   // It would be nicer to do this in the constructor, but we need to wait until
   // after setFunctionName() has a chance to be called.
   OptimizationLevel =
       getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
           ? Opt_2
           : getFlags().getOptLevel();
   if (BuildDefs::timers()) {
     if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
       setFocusedTiming();
       getContext()->resetTimer(GlobalContext::TSK_Default);
     }
   }
   if (BuildDefs::dump()) {
     if (isVerbose(IceV_Status) &&
         getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
       getContext()->getStrDump() << ">>>Translating "
                                  << getFunctionNameAndSize()
                                  << " seq=" << getSequenceNumber() << "\n";
     }
   }
   TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
   TimerMarker T(TimerStack::TT_translate, this);

   dump("Initial CFG");

   if (getFlags().getEnableBlockProfile()) {
     profileBlocks();
     // TODO(jpp): this is fragile, at best. Figure out a better way of
     // detecting exit functions.
     if (getFunctionName().toStringOrEmpty() == "exit") {
       addCallToProfileSummary();
     }
     dump("Profiled CFG");
   }

   // Create the Hi and Lo variables where a split was needed
   for (Variable *Var : Variables) {
     if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
       Var64On32->initHiLo(this);
     } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
       VarVecOn32->initVecElement(this);
     }
   }

   // Instrument the Cfg, e.g. with AddressSanitizer
   if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
     getContext()->instrumentFunc(this);
     dump("Instrumented CFG");
   }

   // The set of translation passes and their order are determined by the
   // target.
   getTarget()->translate();

   dump("Final output");
   if (getFocusedTiming()) {
     getContext()->dumpLocalTimers(getFunctionName().toString());
   }
 }

 void Cfg::fixPhiNodes() {
   for (auto *Node : Nodes) {
     // Fix all the phi edges since WASM can't tell how to make them correctly at
     // the beginning.
     assert(Node);
     const auto &InEdges = Node->getInEdges();
     for (auto &Instr : Node->getPhis()) {
       auto *Phi = llvm::cast<InstPhi>(&Instr);
       assert(Phi);
       for (SizeT i = 0; i < InEdges.size(); ++i) {
         Phi->setLabel(i, InEdges[i]);
       }
     }
   }
 }

 void Cfg::computeInOutEdges() {
   // Compute the out-edges.
   for (CfgNode *Node : Nodes) {
     Node->computeSuccessors();
   }

   // Prune any unreachable nodes before computing in-edges.
   SizeT NumNodes = getNumNodes();
   BitVector Reachable(NumNodes);
   BitVector Pending(NumNodes);
   Pending.set(getEntryNode()->getIndex());
   while (true) {
     int Index = Pending.find_first();
     if (Index == -1)
       break;
     Pending.reset(Index);
     Reachable.set(Index);
     CfgNode *Node = Nodes[Index];
     assert(Node->getIndex() == (SizeT)Index);
     for (CfgNode *Succ : Node->getOutEdges()) {
       SizeT SuccIndex = Succ->getIndex();
       if (!Reachable.test(SuccIndex))
         Pending.set(SuccIndex);
     }
   }
   SizeT Dest = 0;
   for (SizeT Source = 0; Source < NumNodes; ++Source) {
     if (Reachable.test(Source)) {
       Nodes[Dest] = Nodes[Source];
       Nodes[Dest]->resetIndex(Dest);
       // Compute the in-edges.
       Nodes[Dest]->computePredecessors();
       ++Dest;
     }
   }
   Nodes.resize(Dest);

   TimerMarker T(TimerStack::TT_phiValidation, this);
   for (CfgNode *Node : Nodes)
     Node->enforcePhiConsistency();
 }

 void Cfg::renumberInstructions() {
   TimerMarker T(TimerStack::TT_renumberInstructions, this);
   NextInstNumber = Inst::NumberInitial;
   for (CfgNode *Node : Nodes)
     Node->renumberInstructions();
   // Make sure the entry node is the first node and therefore got the lowest
   // instruction numbers, to facilitate live range computation of function
   // arguments.  We want to model function arguments as being live on entry to
   // the function, otherwise an argument whose only use is in the first
   // instruction will be assigned a trivial live range and the register
   // allocator will not recognize its live range as overlapping another
   // variable's live range.
   assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
 }

 // placePhiLoads() must be called before placePhiStores().
 void Cfg::placePhiLoads() {
   TimerMarker T(TimerStack::TT_placePhiLoads, this);
   for (CfgNode *Node : Nodes)
     Node->placePhiLoads();
 }

 // placePhiStores() must be called after placePhiLoads().
 void Cfg::placePhiStores() {
   TimerMarker T(TimerStack::TT_placePhiStores, this);
   for (CfgNode *Node : Nodes)
     Node->placePhiStores();
 }

 void Cfg::deletePhis() {
   TimerMarker T(TimerStack::TT_deletePhis, this);
   for (CfgNode *Node : Nodes)
     Node->deletePhis();
 }

 void Cfg::advancedPhiLowering() {
   TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
   // Clear all previously computed live ranges (but not live-in/live-out bit
   // vectors or last-use markers), because the follow-on register allocation is
   // only concerned with live ranges across the newly created blocks.
   for (Variable *Var : Variables) {
     Var->getLiveRange().reset();
   }
   // This splits edges and appends new nodes to the end of the node list. This
   // can invalidate iterators, so don't use an iterator.
   SizeT NumNodes = getNumNodes();
   SizeT NumVars = getNumVariables();
   for (SizeT I = 0; I < NumNodes; ++I)
     Nodes[I]->advancedPhiLowering();

   TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
   if (true) {
     // The following code does an in-place update of liveness and live ranges
     // as a result of adding the new phi edge split nodes.
     getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
                                      Variables.begin() + NumVars);
     TimerMarker TTT(TimerStack::TT_liveness, this);
     // Iterate over the newly added nodes to add their liveness info.
     for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
       InstNumberT FirstInstNum = getNextInstNumber();
       (*I)->renumberInstructions();
       InstNumberT LastInstNum = getNextInstNumber() - 1;
       (*I)->liveness(getLiveness());
       (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
     }
   } else {
     // The following code does a brute-force recalculation of live ranges as a
     // result of adding the new phi edge split nodes. The liveness calculation
     // is particularly expensive because the new nodes are not yet in a proper
     // topological order and so convergence is slower.
     //
     // This code is kept here for reference and can be temporarily enabled in
     // case the incremental code is under suspicion.
     renumberInstructions();
     liveness(Liveness_Intervals);
     getVMetadata()->init(VMK_All);
   }
   Target->regAlloc(RAK_Phi);
 }

 // Find a reasonable placement for nodes that have not yet been placed, while
 // maintaining the same relative ordering among already placed nodes.
 void Cfg::reorderNodes() {
   // TODO(ascull): it would be nice if the switch tests were always followed by
   // the default case to allow for fall through.
   using PlacedList = CfgList<CfgNode *>;
   PlacedList Placed;      // Nodes with relative placement locked down
   PlacedList Unreachable; // Unreachable nodes
   PlacedList::iterator NoPlace = Placed.end();
   // Keep track of where each node has been tentatively placed so that we can
   // manage insertions into the middle.
   CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
   for (CfgNode *Node : Nodes) {
     // The "do ... while(0);" construct is to factor out the --PlaceIndex and
     // assert() statements before moving to the next node.
     do {
       if (Node != getEntryNode() && Node->getInEdges().empty()) {
         // The node has essentially been deleted since it is not a successor of
         // any other node.
         Unreachable.push_back(Node);
         PlaceIndex[Node->getIndex()] = Unreachable.end();
         Node->setNeedsPlacement(false);
         continue;
       }
       if (!Node->needsPlacement()) {
         // Add to the end of the Placed list.
         Placed.push_back(Node);
         PlaceIndex[Node->getIndex()] = Placed.end();
         continue;
       }
       Node->setNeedsPlacement(false);
       // Assume for now that the unplaced node is from edge-splitting and
       // therefore has 1 in-edge and 1 out-edge (actually, possibly more than 1
       // in-edge if the predecessor node was contracted). If this changes in
       // the future, rethink the strategy.
       assert(Node->getInEdges().size() >= 1);
       assert(Node->hasSingleOutEdge());

       // If it's a (non-critical) edge where the successor has a single
       // in-edge, then place it before the successor.
       CfgNode *Succ = Node->getOutEdges().front();
       if (Succ->getInEdges().size() == 1 &&
           PlaceIndex[Succ->getIndex()] != NoPlace) {
         Placed.insert(PlaceIndex[Succ->getIndex()], Node);
         PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
         continue;
       }

       // Otherwise, place it after the (first) predecessor.
       CfgNode *Pred = Node->getInEdges().front();
       auto PredPosition = PlaceIndex[Pred->getIndex()];
       // It shouldn't be the case that PredPosition==NoPlace, but if that
       // somehow turns out to be true, we just insert Node before
       // PredPosition=NoPlace=Placed.end() .
       if (PredPosition != NoPlace)
         ++PredPosition;
       Placed.insert(PredPosition, Node);
       PlaceIndex[Node->getIndex()] = PredPosition;
     } while (0);

     --PlaceIndex[Node->getIndex()];
     assert(*PlaceIndex[Node->getIndex()] == Node);
   }

   // Reorder Nodes according to the built-up lists.
   NodeList Reordered;
   Reordered.reserve(Placed.size() + Unreachable.size());
   for (CfgNode *Node : Placed)
     Reordered.push_back(Node);
   for (CfgNode *Node : Unreachable)
     Reordered.push_back(Node);
   assert(getNumNodes() == Reordered.size());
   swapNodes(Reordered);
 }

 namespace {
 void getRandomPostOrder(CfgNode *Node, BitVector &ToVisit,
                         Ice::NodeList &PostOrder,
                         Ice::RandomNumberGenerator *RNG) {
   assert(ToVisit[Node->getIndex()]);
   ToVisit[Node->getIndex()] = false;
   NodeList Outs = Node->getOutEdges();
   Ice::RandomShuffle(Outs.begin(), Outs.end(),
                      [RNG](int N) { return RNG->next(N); });
   for (CfgNode *Next : Outs) {
     if (ToVisit[Next->getIndex()])
       getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
   }
   PostOrder.push_back(Node);
 }
 } // end of anonymous namespace

 void Cfg::shuffleNodes() {
   if (!getFlags().getReorderBasicBlocks())
     return;

   NodeList ReversedReachable;
   NodeList Unreachable;
   BitVector ToVisit(Nodes.size(), true);
   // Create Random number generator for function reordering
   RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                             RPE_BasicBlockReordering, SequenceNumber);
   // Traverse from entry node.
   getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable, &RNG);
   // Collect the unreachable nodes.
   for (CfgNode *Node : Nodes)
     if (ToVisit[Node->getIndex()])
       Unreachable.push_back(Node);
   // Copy the layout list to the Nodes.
   NodeList Shuffled;
   Shuffled.reserve(ReversedReachable.size() + Unreachable.size());
   for (CfgNode *Node : reverse_range(ReversedReachable))
     Shuffled.push_back(Node);
   for (CfgNode *Node : Unreachable)
     Shuffled.push_back(Node);
   assert(Nodes.size() == Shuffled.size());
   swapNodes(Shuffled);

   dump("After basic block shuffling");
 }

 void Cfg::localCSE(bool AssumeSSA) {
   // Performs basic-block local common-subexpression elimination
   // If we have
   // t1 = op b c
   // t2 = op b c
   // This pass will replace future references to t2 in a basic block by t1
   // Points to note:
   // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
   //      This is needed if this pass is moved to a point later in the pipeline.
   //      If variables have a single definition (in the node), CSE can work just
   //      on the basis of an equality compare on instructions (sans Dest). When
   //      variables can be updated (hence, non-SSA) the result of a previous
   //      instruction which used that variable as an operand can not be reused.
   // 2. Leaves removal of instructions to DCE.
   // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
   //    to take care of cases not arising from GEP simplification.
   // 4. By default, a single pass is made over each basic block. Control this
   //    with -lcse-max-iters=N

   TimerMarker T(TimerStack::TT_localCse, this);
   struct VariableHash {
     size_t operator()(const Variable *Var) const { return Var->hashValue(); }
   };

   struct InstHash {
     size_t operator()(const Inst *Instr) const {
       auto Kind = Instr->getKind();
       auto Result =
           std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
               Kind);
       for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
         Result ^= Instr->getSrc(i)->hashValue();
       }
       return Result;
     }
   };
   struct InstEq {
     bool srcEq(const Operand *A, const Operand *B) const {
       if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
         return (A == B);
       return false;
     }
     bool operator()(const Inst *InstrA, const Inst *InstrB) const {
       if ((InstrA->getKind() != InstrB->getKind()) ||
           (InstrA->getSrcSize() != InstrB->getSrcSize()))
         return false;

       if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
         auto *B = llvm::cast<InstArithmetic>(InstrB);
         // A, B are guaranteed to be of the same 'kind' at this point
         // So, dyn_cast is not needed
         if (A->getOp() != B->getOp())
           return false;
       }
       // Does not enter loop if different kind or number of operands
       for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
         if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
           return false;
       }
       return true;
     }
   };

   for (CfgNode *Node : getNodes()) {
     CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
     // Stores currently available instructions.

     CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
     // Combining the above two into a single data structure might consume less
     // memory but will be slower i.e map of Instruction -> Set of Variables

     CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
     // Maps a variable to the Instructions that depend on it.
     // a = op1 b c
     // x = op2 c d
     // Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
     // Not necessary for SSA as dependencies will never be invalidated, and the
     // container will use minimal memory when left unused.

     auto IterCount = getFlags().getLocalCseMaxIterations();

     for (uint32_t i = 0; i < IterCount; ++i) {
       // TODO(manasijm): Stats on IterCount -> performance
       for (Inst &Instr : Node->getInsts()) {
         if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
           continue;
         if (!AssumeSSA) {
           // Invalidate replacements
           auto Iter = Replacements.find(Instr.getDest());
           if (Iter != Replacements.end()) {
             Replacements.erase(Iter);
           }

           // Invalidate 'seen' instructions whose operands were just updated.
           auto DepIter = Dependency.find(Instr.getDest());
           if (DepIter != Dependency.end()) {
             for (auto *DepInst : DepIter->second) {
               Seen.erase(DepInst);
             }
           }
         }

         // Replace - doing this before checking for repetitions might enable
         // more optimizations
         for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
           auto *Opnd = Instr.getSrc(i);
           if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
             if (Replacements.find(Var) != Replacements.end()) {
               Instr.replaceSource(i, Replacements[Var]);
             }
           }
         }

         // Check for repetitions
         auto SeenIter = Seen.find(&Instr);
         if (SeenIter != Seen.end()) { // seen before
           const Inst *Found = *SeenIter;
           Replacements[Instr.getDest()] = Found->getDest();
         } else { // new
           Seen.insert(&Instr);

           if (!AssumeSSA) {
             // Update dependencies
             for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
               auto *Opnd = Instr.getSrc(i);
               if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
                 Dependency[Var].push_back(&Instr);
               }
             }
           }
         }
       }
     }
   }
 }

 void Cfg::loopInvariantCodeMotion() {
   TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
   // Does not introduce new nodes as of now.
   for (auto &Loop : LoopInfo) {
     CfgNode *Header = Loop.Header;
     assert(Header);
     if (Header->getLoopNestDepth() < 1)
       return;
     CfgNode *PreHeader = Loop.PreHeader;
     if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
       return; // try next loop
     }

     auto &Insts = PreHeader->getInsts();
     auto &LastInst = Insts.back();
     Insts.pop_back();

     for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
       PreHeader->appendInst(Inst);
     }
     PreHeader->appendInst(&LastInst);
   }
 }

 CfgVector<Inst *>
 Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
   CfgUnorderedSet<Inst *> InvariantInsts;
   CfgUnorderedSet<Variable *> InvariantVars;
   for (auto *Var : getArgs()) {
     InvariantVars.insert(Var);
   }
   bool Changed = false;
   do {
     Changed = false;
     for (auto NodeIndex : Body) {
       auto *Node = Nodes[NodeIndex];
       CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
                                                     Node->getInsts().end());

       for (auto &InstRef : Insts) {
         auto &Inst = InstRef.get();
         if (Inst.isDeleted() ||
             InvariantInsts.find(&Inst) != InvariantInsts.end())
           continue;
         switch (Inst.getKind()) {
         case Inst::InstKind::Alloca:
         case Inst::InstKind::Br:
         case Inst::InstKind::Ret:
         case Inst::InstKind::Phi:
         case Inst::InstKind::Call:
         case Inst::InstKind::IntrinsicCall:
         case Inst::InstKind::Load:
         case Inst::InstKind::Store:
         case Inst::InstKind::Switch:
           continue;
         default:
           break;
         }

         bool IsInvariant = true;
         for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
           if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
             if (InvariantVars.find(Var) == InvariantVars.end()) {
               IsInvariant = false;
             }
           }
         }
         if (IsInvariant) {
           Changed = true;
           InvariantInsts.insert(&Inst);
           Node->getInsts().remove(Inst);
           if (Inst.getDest() != nullptr) {
             InvariantVars.insert(Inst.getDest());
           }
         }
       }
     }
   } while (Changed);

   CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
   std::sort(InstVector.begin(), InstVector.end(),
             [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
   return InstVector;
 }

 void Cfg::shortCircuitJumps() {
   // Split Nodes whenever an early jump is possible.
   // __N :
   //   a = <something>
   //   Instruction 1 without side effect
   //   ... b = <something> ...
   //   Instruction N without side effect
   //   t1 = or a b
   //   br t1 __X __Y
   //
   // is transformed into:
   // __N :
   //   a = <something>
   //   br a __X __N_ext
   //
   // __N_ext :
   //   Instruction 1 without side effect
   //   ... b = <something> ...
   //   Instruction N without side effect
   //   br b __X __Y
   // (Similar logic for AND, jump to false instead of true target.)

   TimerMarker T(TimerStack::TT_shortCircuit, this);
   getVMetadata()->init(VMK_Uses);
   auto NodeStack = this->getNodes();
   CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
   while (!NodeStack.empty()) {
     auto *Node = NodeStack.back();
     NodeStack.pop_back();
     auto NewNode = Node->shortCircuit();
     if (NewNode) {
       NodeStack.push_back(NewNode);
       NodeStack.push_back(Node);
       Splits[Node->getIndex()].push_back(NewNode);
     }
   }

   // Insert nodes in the right place
   NodeList NewList;
   NewList.reserve(Nodes.size());
   CfgUnorderedSet<SizeT> Inserted;
   for (auto *Node : Nodes) {
     if (Inserted.find(Node->getIndex()) != Inserted.end())
       continue; // already inserted
     NodeList Stack{Node};
     while (!Stack.empty()) {
       auto *Current = Stack.back();
       Stack.pop_back();
       Inserted.insert(Current->getIndex());
       NewList.push_back(Current);
       for (auto *Next : Splits[Current->getIndex()]) {
         Stack.push_back(Next);
       }
     }
   }

   SizeT NodeIndex = 0;
   for (auto *Node : NewList) {
     Node->resetIndex(NodeIndex++);
   }
   Nodes = NewList;
 }

 void Cfg::floatConstantCSE() {
   // Load multiple uses of a floating point constant (between two call
   // instructions or block start/end) into a variable before its first use.
   //   t1 = b + 1.0
   //   t2 = c + 1.0
   // Gets transformed to:
   //   t0 = 1.0
   //   t0_1 = t0
   //   t1 = b + t0_1
   //   t2 = c + t0_1
   // Call instructions reset the procedure, but use the same variable, just in
   // case it got a register. We are assuming floating point registers are not
   // callee saved in general. Example, continuing from before:
   //   result = call <some function>
   //   t3 = d + 1.0
   // Gets transformed to:
   //   result = call <some function>
   //   t0_2 = t0
   //   t3 = d + t0_2
   // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot of
   // 1.0. When t0 does not get a register, introducing an extra assignment
   // statement does not make sense. The relevant portion is marked below.

   TimerMarker _(TimerStack::TT_floatConstantCse, this);
   for (CfgNode *Node : getNodes()) {

     CfgUnorderedMap<Constant *, Variable *> ConstCache;
     auto Current = Node->getInsts().begin();
     auto End = Node->getInsts().end();
     while (Current != End) {
       CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
       if (llvm::isa<InstCall>(iteratorToInst(Current))) {
         ++Current;
         assert(Current != End);
         // Block should not end with a call
       }
       while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
         if (!Current->isDeleted()) {
           for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
             if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
               if (Const->getType() == IceType_f32 ||
                   Const->getType() == IceType_f64) {
                 FloatUses[Const].push_back(Current);
               }
             }
           }
         }
         ++Current;
       }
       for (auto &Pair : FloatUses) {
         static constexpr SizeT MinUseThreshold = 3;
         if (Pair.second.size() < MinUseThreshold)
           continue;
         // Only consider constants with at least `MinUseThreshold` uses
         auto &Insts = Node->getInsts();

         if (ConstCache.find(Pair.first) == ConstCache.end()) {
           // Saw a constant (which is used at least twice) for the first time
           auto *NewVar = makeVariable(Pair.first->getType());
           // NewVar->setLinkedTo(Pair.first);
           // TODO(manasijm): Plumbing for linking to an Operand.
           auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
           Insts.insert(Pair.second[0], Assign);
           ConstCache[Pair.first] = NewVar;
         }

         auto *NewVar = makeVariable(Pair.first->getType());
         NewVar->setLinkedTo(ConstCache[Pair.first]);
         auto *Assign =
             InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);

         Insts.insert(Pair.second[0], Assign);
         for (auto InstUse : Pair.second) {
           for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
             if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
               if (Const == Pair.first) {
                 InstUse->replaceSource(i, NewVar);
               }
             }
           }
         }
       }
     }
   }
 }

 void Cfg::doArgLowering() {
   TimerMarker T(TimerStack::TT_doArgLowering, this);
   getTarget()->lowerArguments();
 }

 void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                                 uint32_t CombinedAlignment, InstList &Insts,
                                 AllocaBaseVariableType BaseVariableType) {
   if (Allocas.empty())
     return;
   // Sort by decreasing alignment.
   std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) {
     uint32_t Align1 = A1->getAlignInBytes();
     uint32_t Align2 = A2->getAlignInBytes();
     if (Align1 == Align2)
       return A1->getNumber() < A2->getNumber();
     else
       return Align1 > Align2;
   });
   // Process the allocas in order of decreasing stack alignment.  This allows
   // us to pack less-aligned pieces after more-aligned ones, resulting in less
   // stack growth.  It also allows there to be at most one stack alignment "and"
   // instruction for a whole list of allocas.
   uint32_t CurrentOffset = 0;
   CfgVector<int32_t> Offsets;
   for (Inst *Instr : Allocas) {
     auto *Alloca = llvm::cast<InstAlloca>(Instr);
     // Adjust the size of the allocation up to the next multiple of the
     // object's alignment.
     uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
     auto *ConstSize =
         llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
     uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
     if (BaseVariableType == BVT_FramePointer) {
       // Addressing is relative to the frame pointer.  Subtract the offset after
       // adding the size of the alloca, because it grows downwards from the
       // frame pointer.
       Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
     } else {
       // Addressing is relative to the stack pointer or to a user pointer.  Add
       // the offset before adding the size of the object, because it grows
       // upwards from the stack pointer. In addition, if the addressing is
       // relative to the stack pointer, we need to add the pre-computed max out
       // args size bytes.
       const uint32_t OutArgsOffsetOrZero =
           (BaseVariableType == BVT_StackPointer)
               ? getTarget()->maxOutArgsSizeBytes()
               : 0;
       Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
     }
     // Update the running offset of the fused alloca region.
     CurrentOffset += Size;
   }
   // Round the offset up to the alignment granularity to use as the size.
   uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
   // Ensure every alloca was assigned an offset.
   assert(Allocas.size() == Offsets.size());

   switch (BaseVariableType) {
   case BVT_UserPointer: {
     Variable *BaseVariable = makeVariable(IceType_i32);
     for (SizeT i = 0; i < Allocas.size(); ++i) {
       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
       // Emit a new addition operation to replace the alloca.
       Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
       InstArithmetic *Add =
           InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
                                  BaseVariable, AllocaOffset);
       Insts.push_front(Add);
       Alloca->setDeleted();
     }
     Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
     InstAlloca *CombinedAlloca =
         InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
     CombinedAlloca->setKnownFrameOffset();
     Insts.push_front(CombinedAlloca);
   } break;
   case BVT_StackPointer:
   case BVT_FramePointer: {
     for (SizeT i = 0; i < Allocas.size(); ++i) {
       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
       // Emit a fake definition of the rematerializable variable.
       Variable *Dest = Alloca->getDest();
       auto *Def = InstFakeDef::create(this, Dest);
       if (BaseVariableType == BVT_StackPointer)
         Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
       else
         Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
       Insts.push_front(Def);
       Alloca->setDeleted();
     }
     // Allocate the fixed area in the function prolog.
     getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
   } break;
   }
 }

 void Cfg::processAllocas(bool SortAndCombine) {
   TimerMarker _(TimerStack::TT_alloca, this);
   const uint32_t StackAlignment = getTarget()->getStackAlignment();
   CfgNode *EntryNode = getEntryNode();
   assert(EntryNode);
   // LLVM enforces power of 2 alignment.
   assert(llvm::isPowerOf2_32(StackAlignment));
   // If the ABI's stack alignment is smaller than the vector size (16 bytes),
   // conservatively use a frame pointer to allow for explicit alignment of the
   // stack pointer. This needs to happen before register allocation so the frame
   // pointer can be reserved.
   if (getTarget()->needsStackPointerAlignment()) {
     getTarget()->setHasFramePointer();
   }
   // Determine if there are large alignment allocations in the entry block or
   // dynamic allocations (variable size in the entry block).
   bool HasLargeAlignment = false;
   bool HasDynamicAllocation = false;
   for (Inst &Instr : EntryNode->getInsts()) {
     if (Instr.isDeleted())
       continue;
     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
       uint32_t AlignmentParam = Alloca->getAlignInBytes();
       if (AlignmentParam > StackAlignment)
         HasLargeAlignment = true;
       if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
         Alloca->setKnownFrameOffset();
       else {
         HasDynamicAllocation = true;
         // If Allocas are not sorted, the first dynamic allocation causes
         // later Allocas to be at unknown offsets relative to the stack/frame.
         if (!SortAndCombine)
           break;
       }
     }
   }
   // Don't do the heavyweight sorting and layout for low optimization levels.
   if (!SortAndCombine)
     return;
   // Any alloca outside the entry block is a dynamic allocation.
   for (CfgNode *Node : Nodes) {
     if (Node == EntryNode)
       continue;
     for (Inst &Instr : Node->getInsts()) {
       if (Instr.isDeleted())
         continue;
       if (llvm::isa<InstAlloca>(&Instr)) {
         // Allocations outside the entry block require a frame pointer.
         HasDynamicAllocation = true;
         break;
       }
     }
     if (HasDynamicAllocation)
       break;
   }
   // Mark the target as requiring a frame pointer.
   if (HasLargeAlignment || HasDynamicAllocation)
     getTarget()->setHasFramePointer();
   // Collect the Allocas into the two vectors.
   // Allocas in the entry block that have constant size and alignment less
   // than or equal to the function's stack alignment.
   CfgVector<InstAlloca *> FixedAllocas;
   // Allocas in the entry block that have constant size and alignment greater
   // than the function's stack alignment.
   CfgVector<InstAlloca *> AlignedAllocas;
   // Maximum alignment used by any alloca.
   uint32_t MaxAlignment = StackAlignment;
   for (Inst &Instr : EntryNode->getInsts()) {
     if (Instr.isDeleted())
       continue;
     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
       if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
         continue;
       uint32_t AlignmentParam = Alloca->getAlignInBytes();
       // For default align=0, set it to the real value 1, to avoid any
       // bit-manipulation problems below.
       AlignmentParam = std::max(AlignmentParam, 1u);
       assert(llvm::isPowerOf2_32(AlignmentParam));
       if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
         // If we have both dynamic allocations and large stack alignments,
         // high-alignment allocations are pulled out with their own base.
         AlignedAllocas.push_back(Alloca);
       } else {
         FixedAllocas.push_back(Alloca);
       }
       MaxAlignment = std::max(AlignmentParam, MaxAlignment);
     }
   }
   // Add instructions to the head of the entry block in reverse order.
   InstList &Insts = getEntryNode()->getInsts();
   if (HasDynamicAllocation && HasLargeAlignment) {
     // We are using a frame pointer, but fixed large-alignment alloca addresses
     // do not have a known offset from either the stack or frame pointer.
     // They grow up from a user pointer from an alloca.
     sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
     // Fixed size allocas are addressed relative to the frame pointer.
     sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
                           BVT_FramePointer);
   } else {
     // Otherwise, fixed size allocas are addressed relative to the stack unless
     // there are dynamic allocas.
     const AllocaBaseVariableType BasePointerType =
         (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
     sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
   }
   if (!FixedAllocas.empty() || !AlignedAllocas.empty())
     // No use calling findRematerializable() unless there is some
     // rematerializable alloca instruction to seed it.
     findRematerializable();
 }

 namespace {

 // Helpers for findRematerializable().  For each of them, if a suitable
 // rematerialization is found, the instruction's Dest variable is set to be
 // rematerializable and it returns true, otherwise it returns false.

 bool rematerializeArithmetic(const Inst *Instr) {
   // Check that it's an Arithmetic instruction with an Add operation.
   auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
   if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
     return false;
   // Check that Src(0) is rematerializable.
   auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
   if (Src0Var == nullptr || !Src0Var->isRematerializable())
     return false;
   // Check that Src(1) is an immediate.
   auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
   if (Src1Imm == nullptr)
     return false;
   Arith->getDest()->setRematerializable(
       Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
   return true;
 }

 bool rematerializeAssign(const Inst *Instr) {
   // An InstAssign only originates from an inttoptr or ptrtoint instruction,
   // which never occurs in a MINIMAL build.
   if (BuildDefs::minimal())
     return false;
   // Check that it's an Assign instruction.
   if (!llvm::isa<InstAssign>(Instr))
     return false;
   // Check that Src(0) is rematerializable.
   auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
   if (Src0Var == nullptr || !Src0Var->isRematerializable())
     return false;
   Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
                                         Src0Var->getStackOffset());
   return true;
 }

 bool rematerializeCast(const Inst *Instr) {
   // An pointer-type bitcast never occurs in a MINIMAL build.
   if (BuildDefs::minimal())
     return false;
   // Check that it's a Cast instruction with a Bitcast operation.
   auto *Cast = llvm::dyn_cast<InstCast>(Instr);
   if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
     return false;
   // Check that Src(0) is rematerializable.
   auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
   if (Src0Var == nullptr || !Src0Var->isRematerializable())
     return false;
   // Check that Dest and Src(0) have the same type.
   Variable *Dest = Cast->getDest();
   if (Dest->getType() != Src0Var->getType())
     return false;
   Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
   return true;
 }

 } // end of anonymous namespace

 /// Scan the function to find additional rematerializable variables.  This is
 /// possible when the source operand of an InstAssignment is a rematerializable
 /// variable, or the same for a pointer-type InstCast::Bitcast, or when an
 /// InstArithmetic is an add of a rematerializable variable and an immediate.
 /// Note that InstAssignment instructions and pointer-type InstCast::Bitcast
 /// instructions generally only come about from the IceConverter's treatment of
 /// inttoptr, ptrtoint, and bitcast instructions.  TODO(stichnot): Consider
 /// other possibilities, however unlikely, such as InstArithmetic::Sub, or
 /// commutativity.
 void Cfg::findRematerializable() {
   // Scan the instructions in order, and repeat until no new opportunities are
   // found.  It may take more than one iteration because a variable's defining
   // block may happen to come after a block where it is used, depending on the
   // CfgNode linearization order.
   bool FoundNewAssignment;
   do {
     FoundNewAssignment = false;
     for (CfgNode *Node : getNodes()) {
       // No need to process Phi instructions.
       for (Inst &Instr : Node->getInsts()) {
         if (Instr.isDeleted())
           continue;
         Variable *Dest = Instr.getDest();
         if (Dest == nullptr || Dest->isRematerializable())
           continue;
         if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
             rematerializeCast(&Instr)) {
           FoundNewAssignment = true;
         }
       }
     }
   } while (FoundNewAssignment);
 }

 void Cfg::doAddressOpt() {
   TimerMarker T(TimerStack::TT_doAddressOpt, this);
   for (CfgNode *Node : Nodes)
     Node->doAddressOpt();
 }

 namespace {
 // ShuffleVectorUtils implements helper functions for rematerializing
 // shufflevector instructions from a sequence of extractelement/insertelement
 // instructions. It looks for the following pattern:
 //
 // %t0 = extractelement A, %n0
 // %t1 = extractelement B, %n1
 // %t2 = extractelement C, %n2
 // ...
 // %tN = extractelement N, %nN
 // %d0 = insertelement undef, %t0, 0
 // %d1 = insertelement %d0, %t1, 1
 // %d2 = insertelement %d1, %t2, 2
 // ...
 // %dest = insertelement %d_N-1, %tN, N
 //
 // where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
 // distinct variables.
 namespace ShuffleVectorUtils {
 // findAllInserts is used when searching for all the insertelements that are
 // used in a shufflevector operation. This function works recursively, when
 // invoked with I = i, the function assumes Insts[i] is the last found
 // insertelement in the chain. The next insertelement insertruction is saved in
 // Insts[i+1].
 bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                     CfgVector<const Inst *> *Insts, SizeT I = 0) {
   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

   if (I > Insts->size()) {
     if (Verbose) {
       Ctx->getStrDump() << "\tToo many inserts.\n";
     }
     return false;
   }

   const auto *LastInsert = Insts->at(I);
   assert(llvm::isa<InstInsertElement>(LastInsert));

   if (I == Insts->size() - 1) {
     // Matching against undef is not really needed because the value in Src(0)
     // will be totally overwritten. We still enforce it anyways because the
     // PNaCl toolchain generates the bitcode with it.
     if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
       if (Verbose) {
         Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
                           << Insts->size();
         LastInsert->dump(Func);
         Ctx->getStrDump() << "\n";
       }
       return false;
     }

     // The following loop ensures that the insertelements are sorted. In theory,
     // we could relax this restriction and allow any order. As long as each
     // index appears exactly once, this chain is still a candidate for becoming
     // a shufflevector. The Insts vector is traversed backwards because the
     // instructions are "enqueued" in reverse order.
     int32_t ExpectedElement = 0;
     for (const auto *I : reverse_range(*Insts)) {
       if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
           ExpectedElement) {
         return false;
       }
       ++ExpectedElement;
     }
     return true;
   }

   const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
   const auto *Def = VM->getSingleDefinition(Src0V);

   // Only optimize if the first operand in
   //
   //   Dest = insertelement A, B, 10
   //
   // is singly-def'ed.
   if (Def == nullptr) {
     if (Verbose) {
       Ctx->getStrDump() << "\tmulti-def: ";
       (*Insts)[I]->dump(Func);
       Ctx->getStrDump() << "\n";
     }
     return false;
   }

   // We also require the (single) definition to come from an insertelement
   // instruction.
   if (!llvm::isa<InstInsertElement>(Def)) {
     if (Verbose) {
       Ctx->getStrDump() << "\tnot insert element: ";
       Def->dump(Func);
       Ctx->getStrDump() << "\n";
     }
     return false;
   }

   // Everything seems fine, so we save Def in Insts, and delegate the decision
   // to findAllInserts.
   (*Insts)[I + 1] = Def;

   return findAllInserts(Func, Ctx, VM, Insts, I + 1);
 }

 // insertsLastElement returns true if Insert is inserting an element in the last
 // position of a vector.
 bool insertsLastElement(const Inst &Insert) {
   const Type DestTy = Insert.getDest()->getType();
   assert(isVectorType(DestTy));
   const SizeT Elem =
       llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
   return Elem == typeNumElements(DestTy) - 1;
 }

 // findAllExtracts goes over all the insertelement instructions that are
 // candidates to be replaced by a shufflevector, and searches for all the
 // definitions of the elements being inserted. If all of the elements are the
 // result of an extractelement instruction, and all of the extractelements
 // operate on at most two different sources, than the instructions can be
 // replaced by a shufflevector.
 bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                      const CfgVector<const Inst *> &Insts, Variable **Src0,
                      Variable **Src1, CfgVector<const Inst *> *Extracts) {
   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

   *Src0 = nullptr;
   *Src1 = nullptr;
   assert(Insts.size() > 0);
   for (SizeT I = 0; I < Insts.size(); ++I) {
     const auto *Insert = Insts.at(I);
     const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
     if (Src1V == nullptr) {
       if (Verbose) {
         Ctx->getStrDump() << "src(1) is not a variable: ";
         Insert->dump(Func);
         Ctx->getStrDump() << "\n";
       }
       return false;
     }

     const auto *Def = VM->getSingleDefinition(Src1V);
     if (Def == nullptr) {
       if (Verbose) {
         Ctx->getStrDump() << "multi-def src(1): ";
         Insert->dump(Func);
         Ctx->getStrDump() << "\n";
       }
       return false;
     }

     if (!llvm::isa<InstExtractElement>(Def)) {
       if (Verbose) {
         Ctx->getStrDump() << "not extractelement: ";
         Def->dump(Func);
         Ctx->getStrDump() << "\n";
       }
       return false;
     }

     auto *Src = llvm::cast<Variable>(Def->getSrc(0));
     if (*Src0 == nullptr) {
       // No sources yet. Save Src to Src0.
       *Src0 = Src;
     } else if (*Src1 == nullptr) {
       // We already have a source, so we might save Src in Src1 -- but only if
       // Src0 is not Src.
       if (*Src0 != Src) {
         *Src1 = Src;
       }
     } else if (Src != *Src0 && Src != *Src1) {
       // More than two sources, so we can't rematerialize the shufflevector
       // instruction.
       if (Verbose) {
         Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
       }
       return false;
     }

     (*Extracts)[I] = Def;
   }

   // We should have seen at least one source operand.
   assert(*Src0 != nullptr);

   // If a second source was not seen, then we just make Src1 = Src0 to simplify
   // things down stream. This should not matter, as all of the indexes in the
   // shufflevector instruction will point to Src0.
   if (*Src1 == nullptr) {
     *Src1 = *Src0;
   }

   return true;
 }

 } // end of namespace ShuffleVectorUtils
 } // end of anonymous namespace

 void Cfg::materializeVectorShuffles() {
   const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);

   std::unique_ptr<OstreamLocker> L;
   if (Verbose) {
     L.reset(new OstreamLocker(getContext()));
     getContext()->getStrDump() << "\nShuffle materialization:\n";
   }

   // MaxVectorElements is the maximum number of elements in the vector types
   // handled by Subzero. We use it to create the Inserts and Extracts vectors
   // with the appropriate size, thus avoiding resize() calls.
   const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
   CfgVector<const Inst *> Inserts(MaxVectorElements);
   CfgVector<const Inst *> Extracts(MaxVectorElements);

   TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
   for (CfgNode *Node : Nodes) {
     for (auto &Instr : Node->getInsts()) {
       if (!llvm::isa<InstInsertElement>(Instr)) {
         continue;
       }
       if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
         // To avoid wasting time, we only start the pattern match at the last
         // insertelement instruction -- and go backwards from there.
         continue;
       }
       if (Verbose) {
         getContext()->getStrDump() << "\tCandidate: ";
         Instr.dump(this);
         getContext()->getStrDump() << "\n";
       }
       Inserts.resize(typeNumElements(Instr.getDest()->getType()));
       Inserts[0] = &Instr;
       if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
                                               VMetadata.get(), &Inserts)) {
         // If we fail to find a sequence of insertelements, we stop the
         // optimization.
         if (Verbose) {
           getContext()->getStrDump() << "\tFalse alarm.\n";
         }
         continue;
       }
       if (Verbose) {
         getContext()->getStrDump() << "\tFound the following insertelement: \n";
         for (auto *I : reverse_range(Inserts)) {
           getContext()->getStrDump() << "\t\t";
           I->dump(this);
           getContext()->getStrDump() << "\n";
         }
       }
       Extracts.resize(Inserts.size());
       Variable *Src0;
       Variable *Src1;
       if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
                                                VMetadata.get(), Inserts, &Src0,
                                                &Src1, &Extracts)) {
         // If we fail to match the definitions of the insertelements' sources
         // with extractelement instructions -- or if those instructions operate
         // on more than two different variables -- we stop the optimization.
         if (Verbose) {
           getContext()->getStrDump() << "\tFailed to match extractelements.\n";
         }
         continue;
       }
       if (Verbose) {
         getContext()->getStrDump()
             << "\tFound the following insert/extract element pairs: \n";
         for (SizeT I = 0; I < Inserts.size(); ++I) {
           const SizeT Pos = Inserts.size() - I - 1;
           getContext()->getStrDump() << "\t\tInsert : ";
           Inserts[Pos]->dump(this);
           getContext()->getStrDump() << "\n\t\tExtract: ";
           Extracts[Pos]->dump(this);
           getContext()->getStrDump() << "\n";
         }
       }

       assert(Src0 != nullptr);
       assert(Src1 != nullptr);

       auto *ShuffleVector =
           InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
       assert(ShuffleVector->getSrc(0) == Src0);
       assert(ShuffleVector->getSrc(1) == Src1);
       for (SizeT I = 0; I < Extracts.size(); ++I) {
         const SizeT Pos = Extracts.size() - I - 1;
         auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
         if (Src0 == Extracts[Pos]->getSrc(0)) {
           ShuffleVector->addIndex(Index);
         } else {
           ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
               Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
         }
       }

       if (Verbose) {
         getContext()->getStrDump() << "Created: ";
         ShuffleVector->dump(this);
         getContext()->getStrDump() << "\n";
       }

       Instr.setDeleted();
       auto &LoweringContext = getTarget()->getContext();
       LoweringContext.setInsertPoint(instToIterator(&Instr));
       LoweringContext.insert(ShuffleVector);
     }
   }
 }

 void Cfg::doNopInsertion() {
   if (!getFlags().getShouldDoNopInsertion())
     return;
   TimerMarker T(TimerStack::TT_doNopInsertion, this);
   RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_NopInsertion,
                             SequenceNumber);
   for (CfgNode *Node : Nodes)
     Node->doNopInsertion(RNG);
 }

 void Cfg::genCode() {
   TimerMarker T(TimerStack::TT_genCode, this);
   for (CfgNode *Node : Nodes)
     Node->genCode();
 }

 // Compute the stack frame layout.
 void Cfg::genFrame() {
   TimerMarker T(TimerStack::TT_genFrame, this);
   getTarget()->addProlog(Entry);
   for (CfgNode *Node : Nodes)
     if (Node->getHasReturn())
       getTarget()->addEpilog(Node);
 }

 void Cfg::generateLoopInfo() {
   TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
   LoopInfo = ComputeLoopInfo(this);
 }

 // This is a lightweight version of live-range-end calculation. Marks the last
 // use of only those variables whose definition and uses are completely with a
 // single block. It is a quick single pass and doesn't need to iterate until
 // convergence.
 void Cfg::livenessLightweight() {
   TimerMarker T(TimerStack::TT_livenessLightweight, this);
   getVMetadata()->init(VMK_Uses);
   for (CfgNode *Node : Nodes)
     Node->livenessLightweight();
 }

 void Cfg::liveness(LivenessMode Mode) {
   TimerMarker T(TimerStack::TT_liveness, this);
   // Destroying the previous (if any) Liveness information clears the Liveness
   // allocator TLS pointer.
   Live = nullptr;
   Live = Liveness::create(this, Mode);

   getVMetadata()->init(VMK_Uses);
   Live->init();

   // Initialize with all nodes needing to be processed.
   BitVector NeedToProcess(Nodes.size(), true);
   while (NeedToProcess.any()) {
     // Iterate in reverse topological order to speed up convergence.
     for (CfgNode *Node : reverse_range(Nodes)) {
       if (NeedToProcess[Node->getIndex()]) {
         NeedToProcess[Node->getIndex()] = false;
         bool Changed = Node->liveness(getLiveness());
         if (Changed) {
           // If the beginning-of-block liveness changed since the last
           // iteration, mark all in-edges as needing to be processed.
           for (CfgNode *Pred : Node->getInEdges())
             NeedToProcess[Pred->getIndex()] = true;
         }
       }
     }
   }
   if (Mode == Liveness_Intervals) {
     // Reset each variable's live range.
     for (Variable *Var : Variables)
       Var->resetLiveRange();
   }
   // Make a final pass over each node to delete dead instructions, collect the
   // first and last instruction numbers, and add live range segments for that
   // node.
   for (CfgNode *Node : Nodes) {
     InstNumberT FirstInstNum = Inst::NumberSentinel;
     InstNumberT LastInstNum = Inst::NumberSentinel;
     for (Inst &I : Node->getPhis()) {
       I.deleteIfDead();
       if (Mode == Liveness_Intervals && !I.isDeleted()) {
         if (FirstInstNum == Inst::NumberSentinel)
           FirstInstNum = I.getNumber();
         assert(I.getNumber() > LastInstNum);
         LastInstNum = I.getNumber();
       }
     }
     for (Inst &I : Node->getInsts()) {
       I.deleteIfDead();
       if (Mode == Liveness_Intervals && !I.isDeleted()) {
         if (FirstInstNum == Inst::NumberSentinel)
           FirstInstNum = I.getNumber();
         assert(I.getNumber() > LastInstNum);
         LastInstNum = I.getNumber();
       }
     }
     if (Mode == Liveness_Intervals) {
       // Special treatment for live in-args. Their liveness needs to extend
       // beyond the beginning of the function, otherwise an arg whose only use
       // is in the first instruction will end up having the trivial live range
       // [2,2) and will *not* interfere with other arguments. So if the first
       // instruction of the method is "r=arg1+arg2", both args may be assigned
       // the same register. This is accomplished by extending the entry block's
       // instruction range from [2,n) to [1,n) which will transform the
       // problematic [2,2) live ranges into [1,2).  This extension works because
       // the entry node is guaranteed to have the lowest instruction numbers.
       if (Node == getEntryNode()) {
         FirstInstNum = Inst::NumberExtended;
         // Just in case the entry node somehow contains no instructions...
         if (LastInstNum == Inst::NumberSentinel)
           LastInstNum = FirstInstNum;
       }
       // If this node somehow contains no instructions, don't bother trying to
       // add liveness intervals for it, because variables that are live-in and
       // live-out will have a bogus interval added.
       if (FirstInstNum != Inst::NumberSentinel)
         Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
     }
   }
 }

 // Traverse every Variable of every Inst and verify that it appears within the
 // Variable's computed live range.
 bool Cfg::validateLiveness() const {
   TimerMarker T(TimerStack::TT_validateLiveness, this);
   bool Valid = true;
   OstreamLocker L(Ctx);
   Ostream &Str = Ctx->getStrDump();
   for (CfgNode *Node : Nodes) {
     Inst *FirstInst = nullptr;
     for (Inst &Instr : Node->getInsts()) {
       if (Instr.isDeleted())
         continue;
       if (FirstInst == nullptr)
         FirstInst = &Instr;
       InstNumberT InstNumber = Instr.getNumber();
       if (Variable *Dest = Instr.getDest()) {
         if (!Dest->getIgnoreLiveness()) {
           bool Invalid = false;
           constexpr bool IsDest = true;
           if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
             Invalid = true;
           // Check that this instruction actually *begins* Dest's live range,
           // by checking that Dest is not live in the previous instruction. As
           // a special exception, we don't check this for the first instruction
           // of the block, because a Phi temporary may be live at the end of
           // the previous block, and if it is also assigned in the first
           // instruction of this block, the adjacent live ranges get merged.
           if (&Instr != FirstInst && !Instr.isDestRedefined() &&
               Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
             Invalid = true;
           if (Invalid) {
             Valid = false;
             Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
             Dest->dump(this);
             Str << " live range " << Dest->getLiveRange() << "\n";
           }
         }
       }
       FOREACH_VAR_IN_INST(Var, Instr) {
         static constexpr bool IsDest = false;
         if (!Var->getIgnoreLiveness() &&
             !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
           Valid = false;
           Str << "Liveness error: inst " << Instr.getNumber() << " var ";
           Var->dump(this);
           Str << " live range " << Var->getLiveRange() << "\n";
         }
       }
     }
   }
   return Valid;
 }

 void Cfg::contractEmptyNodes() {
   // If we're decorating the asm output with register liveness info, this
   // information may become corrupted or incorrect after contracting nodes that
   // contain only redundant assignments. As such, we disable this pass when
   // DecorateAsm is specified. This may make the resulting code look more
   // branchy, but it should have no effect on the register assignments.
   if (getFlags().getDecorateAsm())
     return;
   for (CfgNode *Node : Nodes) {
     Node->contractIfEmpty();
   }
 }

 void Cfg::doBranchOpt() {
   TimerMarker T(TimerStack::TT_doBranchOpt, this);
   for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     auto NextNode = I + 1;
     (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
   }
 }

 void Cfg::markNodesForSandboxing() {
   for (const InstJumpTable *JT : JumpTables)
     for (SizeT I = 0; I < JT->getNumTargets(); ++I)
       JT->getTarget(I)->setNeedsAlignment();
 }

 // ======================== Dump routines ======================== //

 // emitTextHeader() is not target-specific (apart from what is abstracted by
 // the Assembler), so it is defined here rather than in the target lowering
 // class.
 void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx,
                          const Assembler *Asm) {
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Ctx->getStrEmit();
   Str << "\t.text\n";
   if (getFlags().getFunctionSections())
     Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n";
   if (!Asm->getInternal() || getFlags().getDisableInternal()) {
     Str << "\t.globl\t" << Name << "\n";
     Str << "\t.type\t" << Name << ",%function\n";
   }
   Str << "\t" << Asm->getAlignDirective() << " "
       << Asm->getBundleAlignLog2Bytes() << ",0x";
   for (uint8_t I : Asm->getNonExecBundlePadding())
     Str.write_hex(I);
   Str << "\n";
   Str << Name << ":\n";
 }

 void Cfg::emitJumpTables() {
   switch (getFlags().getOutFileType()) {
   case FT_Elf:
   case FT_Iasm: {
     // The emission needs to be delayed until the after the text section so
     // save the offsets in the global context.
     for (const InstJumpTable *JumpTable : JumpTables) {
       Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler()));
     }
   } break;
   case FT_Asm: {
     // Emit the assembly directly so we don't need to hang on to all the names
     for (const InstJumpTable *JumpTable : JumpTables)
       getTarget()->emitJumpTable(this, JumpTable);
   } break;
   }
 }

 void Cfg::emit() {
   if (!BuildDefs::dump())
     return;
   TimerMarker T(TimerStack::TT_emitAsm, this);
   if (getFlags().getDecorateAsm()) {
     renumberInstructions();
     getVMetadata()->init(VMK_Uses);
     liveness(Liveness_Basic);
     dump("After recomputing liveness for -decorate-asm");
   }
   OstreamLocker L(Ctx);
   Ostream &Str = Ctx->getStrEmit();
   const Assembler *Asm = getAssembler<>();
   const bool NeedSandboxing = getFlags().getUseSandboxing();

   emitTextHeader(FunctionName, Ctx, Asm);
   if (getFlags().getDecorateAsm()) {
     for (Variable *Var : getVariables()) {
       if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
         Str << "\t" << Var->getSymbolicStackOffset() << " = "
             << Var->getStackOffset() << "\n";
       }
     }
   }
   for (CfgNode *Node : Nodes) {
     if (NeedSandboxing && Node->needsAlignment()) {
       Str << "\t" << Asm->getAlignDirective() << " "
           << Asm->getBundleAlignLog2Bytes() << "\n";
     }
     Node->emit(this);
   }
   emitJumpTables();
   Str << "\n";
 }

 void Cfg::emitIAS() {
   TimerMarker T(TimerStack::TT_emitAsm, this);
   // The emitIAS() routines emit into the internal assembler buffer, so there's
   // no need to lock the streams.
   const bool NeedSandboxing = getFlags().getUseSandboxing();
   for (CfgNode *Node : Nodes) {
     if (NeedSandboxing && Node->needsAlignment())
       getAssembler()->alignCfgNode();
     Node->emitIAS(this);
   }
   emitJumpTables();
 }

 size_t Cfg::getTotalMemoryMB() const {
   constexpr size_t _1MB = 1024 * 1024;
   assert(Allocator != nullptr);
   assert(CfgAllocatorTraits::current() == Allocator.get());
   return Allocator->getTotalMemory() / _1MB;
 }

 size_t Cfg::getLivenessMemoryMB() const {
   constexpr size_t _1MB = 1024 * 1024;
   if (Live == nullptr) {
     return 0;
   }
   return Live->getAllocator()->getTotalMemory() / _1MB;
 }

 // Dumps the IR with an optional introductory message.
 void Cfg::dump(const char *Message) {
   if (!BuildDefs::dump())
     return;
   if (!isVerbose())
     return;
   OstreamLocker L(Ctx);
   Ostream &Str = Ctx->getStrDump();
   if (Message[0])
     Str << "================ " << Message << " ================\n";
   if (isVerbose(IceV_Mem)) {
     Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
   }
   setCurrentNode(getEntryNode());
   // Print function name+args
   if (isVerbose(IceV_Instructions)) {
     Str << "define ";
     if (getInternal() && !getFlags().getDisableInternal())
       Str << "internal ";
     Str << ReturnType << " @" << getFunctionName() << "(";
     for (SizeT i = 0; i < Args.size(); ++i) {
       if (i > 0)
         Str << ", ";
       Str << Args[i]->getType() << " ";
       Args[i]->dump(this);
     }
     // Append an extra copy of the function name here, in order to print its
     // size stats but not mess up lit tests.
     Str << ") { # " << getFunctionNameAndSize() << "\n";
   }
   resetCurrentNode();
   if (isVerbose(IceV_Liveness)) {
     // Print summary info about variables
     for (Variable *Var : Variables) {
       Str << "// multiblock=";
       if (getVMetadata()->isTracked(Var))
         Str << getVMetadata()->isMultiBlock(Var);
       else
         Str << "?";
       Str << " defs=";
       bool FirstPrint = true;
       if (VMetadata->getKind() != VMK_Uses) {
         if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
           Str << FirstDef->getNumber();
           FirstPrint = false;
         }
       }
       if (VMetadata->getKind() == VMK_All) {
         for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
           if (!FirstPrint)
             Str << ",";
           Str << Instr->getNumber();
           FirstPrint = false;
         }
       }
       Str << " weight=" << Var->getWeight(this) << " ";
       Var->dump(this);
       Str << " LIVE=" << Var->getLiveRange() << "\n";
     }
   }
   // Print each basic block
   for (CfgNode *Node : Nodes)
     Node->dump(this);
   if (isVerbose(IceV_Instructions))
     Str << "}\n";
 }

 } // end of namespace Ice