This improves the variable use weight by taking into account use in loops. It
further improves spec2k performance and fixes the regression in ammp. Loops are
identified using an extension to Tarjan's algorithm.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1318553003.
diff --git a/Makefile.standalone b/Makefile.standalone
index 18ecc9a..0210ed4 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -195,6 +195,7 @@
 	IceInstX8664.cpp \
 	IceIntrinsics.cpp \
 	IceLiveness.cpp \
+	IceLoopAnalyzer.cpp \
 	IceOperand.cpp \
 	IceRegAlloc.cpp \
 	IceRNG.cpp \
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 416376b..0b2aa94 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -24,6 +24,7 @@
 #include "IceInst.h"
 #include "IceInstVarIter.h"
 #include "IceLiveness.h"
+#include "IceLoopAnalyzer.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
 
@@ -463,10 +464,16 @@
       getTarget()->addEpilog(Node);
 }
 
-// This is a lightweight version of live-range-end calculation.  Marks
-// the last use of only those variables whose definition and uses are
-// completely with a single block.  It is a quick single pass and
-// doesn't need to iterate until convergence.
+void Cfg::computeLoopNestDepth() {
+  TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
+  LoopAnalyzer LA(this);
+  LA.computeLoopNestDepth();
+}
+
+// This is a lightweight version of live-range-end calculation.  Marks the last
+// use of only those variables whose definition and uses are completely with a
+// single block.  It is a quick single pass and doesn't need to iterate until
+// convergence.
 void Cfg::livenessLightweight() {
   TimerMarker T(TimerStack::TT_livenessLightweight, this);
   getVMetadata()->init(VMK_Uses);
@@ -602,12 +609,11 @@
 }
 
 void Cfg::contractEmptyNodes() {
-  // If we're decorating the asm output with register liveness info,
-  // this information may become corrupted or incorrect after
-  // contracting nodes that contain only redundant assignments.  As
-  // such, we disable this pass when DecorateAsm is specified.  This
-  // may make the resulting code look more branchy, but it should have
-  // no effect on the register assignments.
+  // If we're decorating the asm output with register liveness info, this
+  // information may become corrupted or incorrect after contracting nodes that
+  // contain only redundant assignments. As such, we disable this pass when
+  // DecorateAsm is specified. This may make the resulting code look more
+  // branchy, but it should have no effect on the register assignments.
   if (Ctx->getFlags().getDecorateAsm())
     return;
   for (CfgNode *Node : Nodes) {
diff --git a/src/IceCfg.h b/src/IceCfg.h
index e6c5a4a..3c8972d 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -86,7 +86,9 @@
   /// @{
   void setEntryNode(CfgNode *EntryNode) { Entry = EntryNode; }
   CfgNode *getEntryNode() const { return Entry; }
-  /// Create a node and append it to the end of the linearized list.
+  /// Create a node and append it to the end of the linearized list. The loop
+  /// nest depth of the new node may not be valid if it is created after
+  /// computeLoopNestDepth.
   CfgNode *makeNode();
   SizeT getNumNodes() const { return Nodes.size(); }
   const NodeList &getNodes() const { return Nodes; }
@@ -189,6 +191,7 @@
   void doNopInsertion();
   void genCode();
   void genFrame();
+  void computeLoopNestDepth();
   void livenessLightweight();
   void liveness(LivenessMode Mode);
   bool validateLiveness() const;
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 4199c41..095994e 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -219,6 +219,11 @@
 // not contain duplicates.
 CfgNode *CfgNode::splitIncomingEdge(CfgNode *Pred, SizeT EdgeIndex) {
   CfgNode *NewNode = Func->makeNode();
+  // Depth is the minimum as it works if both are the same, but if one is
+  // outside the loop and the other is inside, the new node should be placed
+  // outside and not be executed multiple times within the loop.
+  NewNode->setLoopNestDepth(
+      std::min(getLoopNestDepth(), Pred->getLoopNestDepth()));
   if (BuildDefs::dump())
     NewNode->setName("split_" + Pred->getName() + "_" + getName() + "_" +
                      std::to_string(EdgeIndex));
@@ -1175,9 +1180,11 @@
   Func->setCurrentNode(this);
   Ostream &Str = Func->getContext()->getStrDump();
   Liveness *Liveness = Func->getLiveness();
-  if (Func->isVerbose(IceV_Instructions)) {
+  if (Func->isVerbose(IceV_Instructions) || Func->isVerbose(IceV_Loop))
     Str << getName() << ":\n";
-  }
+  // Dump the loop nest depth
+  if (Func->isVerbose(IceV_Loop))
+    Str << "    // LoopNestDepth = " << getLoopNestDepth() << "\n";
   // Dump list of predecessor nodes.
   if (Func->isVerbose(IceV_Preds) && !InEdges.empty()) {
     Str << "    // preds = ";
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index 963638d..bbfdae2 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -46,6 +46,10 @@
     return ".L" + Func->getFunctionName() + "$" + getName();
   }
 
+  void incrementLoopNestDepth() { ++LoopNestDepth; }
+  void setLoopNestDepth(SizeT NewDepth) { LoopNestDepth = NewDepth; }
+  SizeT getLoopNestDepth() const { return LoopNestDepth; }
+
   /// The HasReturn flag indicates that this node contains a return
   /// instruction and therefore needs an epilog.
   void setHasReturn() { HasReturn = true; }
@@ -111,6 +115,7 @@
   SizeT Number; /// label index
   Cfg::IdentifierIndexType NameIndex =
       Cfg::IdentifierIndexInvalid; /// index into Cfg::NodeNames table
+  SizeT LoopNestDepth = 0;         /// the loop nest depth of this node
   bool HasReturn = false;          /// does this block need an epilog?
   bool NeedsPlacement = false;
   bool NeedsAlignment = false;       /// is sandboxing required?
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index fea785d..60117fd 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -234,6 +234,7 @@
         clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
         clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
         clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"),
+        clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"),
         clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
         clEnumValN(Ice::IceV_Most, "most",
                    "Use all verbose options except 'regalloc'"),
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 7ea34be..7bec8e0 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -225,6 +225,7 @@
   IceV_Random = 1 << 10,
   IceV_Folding = 1 << 11,
   IceV_RMW = 1 << 12,
+  IceV_Loop = 1 << 13,
   IceV_All = ~IceV_None,
   IceV_Most = IceV_All & ~IceV_LinearScan
 };
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index ae63ffa..b92e954 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -412,6 +412,10 @@
   addSource(Data);
 }
 
+Variable *InstStore::getRmwBeacon() const {
+  return llvm::dyn_cast<Variable>(getSrc(2));
+}
+
 void InstStore::setRmwBeacon(Variable *Beacon) {
   Dest = llvm::dyn_cast<Variable>(getData());
   Srcs[2] = Beacon;
diff --git a/src/IceInst.h b/src/IceInst.h
index 6bfd7af..360135a 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -693,7 +693,7 @@
   }
   Operand *getAddr() const { return getSrc(1); }
   Operand *getData() const { return getSrc(0); }
-  Variable *getRmwBeacon() const { return llvm::dyn_cast<Variable>(getSrc(2)); }
+  Variable *getRmwBeacon() const;
   void setRmwBeacon(Variable *Beacon);
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Store; }
diff --git a/src/IceLoopAnalyzer.cpp b/src/IceLoopAnalyzer.cpp
new file mode 100644
index 0000000..7da479f
--- /dev/null
+++ b/src/IceLoopAnalyzer.cpp
@@ -0,0 +1,142 @@
+//===- subzero/src/IceLoopAnalyzer.cpp - Loop Analysis --------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the loop analysis on the CFG.
+///
+//===----------------------------------------------------------------------===//
+#include "IceLoopAnalyzer.h"
+
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+
+namespace Ice {
+
+void LoopAnalyzer::LoopNode::reset() {
+  if (Deleted)
+    return;
+  Succ = BB->getOutEdges().begin();
+  Index = LowLink = UndefinedIndex;
+  OnStack = false;
+}
+
+NodeList::const_iterator LoopAnalyzer::LoopNode::successorsEnd() const {
+  return BB->getOutEdges().end();
+}
+
+void LoopAnalyzer::LoopNode::incrementLoopNestDepth() {
+  BB->incrementLoopNestDepth();
+}
+
+LoopAnalyzer::LoopAnalyzer(Cfg *Func) : Func(Func) {
+  const NodeList &Nodes = Func->getNodes();
+
+  // Allocate memory ahead of time. This is why a vector is used instead of a
+  // stack which doesn't support reserving (or bulk erasure used below).
+  AllNodes.reserve(Nodes.size());
+  WorkStack.reserve(Nodes.size());
+  LoopStack.reserve(Nodes.size());
+
+  // Create the LoopNodes from the function's CFG
+  for (CfgNode *Node : Nodes)
+    AllNodes.emplace_back(Node);
+}
+
+void LoopAnalyzer::computeLoopNestDepth() {
+  assert(AllNodes.size() == Func->getNodes().size());
+  assert(NextIndex == FirstDefinedIndex);
+  assert(NumDeletedNodes == 0);
+
+  while (NumDeletedNodes < AllNodes.size()) {
+    // Prepare to run Tarjan's
+    for (LoopNode &Node : AllNodes)
+      Node.reset();
+
+    assert(WorkStack.empty());
+    assert(LoopStack.empty());
+
+    for (LoopNode &Node : AllNodes) {
+      if (Node.isDeleted() || Node.isVisited())
+        continue;
+
+      WorkStack.push_back(&Node);
+
+      while (!WorkStack.empty()) {
+        LoopNode &WorkNode = *WorkStack.back();
+        if (LoopNode *Succ = processNode(WorkNode))
+          WorkStack.push_back(Succ);
+        else
+          WorkStack.pop_back();
+      }
+    }
+  }
+}
+
+LoopAnalyzer::LoopNode *
+LoopAnalyzer::processNode(LoopAnalyzer::LoopNode &Node) {
+  if (!Node.isVisited()) {
+    Node.visit(NextIndex++);
+    LoopStack.push_back(&Node);
+    Node.setOnStack();
+  } else {
+    // Returning to a node after having recursed into Succ so continue
+    // iterating through successors after using the Succ.LowLink value that was
+    // computed in the recursion.
+    LoopNode &Succ = AllNodes[(*Node.currentSuccessor())->getIndex()];
+    Node.tryLink(Succ.getLowLink());
+    Node.nextSuccessor();
+  }
+
+  // Visit the successors and recurse into unvisited nodes. The recursion could
+  // cause the iteration to be suspended but it will resume as the stack is
+  // unwound.
+  auto SuccEnd = Node.successorsEnd();
+  for (; Node.currentSuccessor() != SuccEnd; Node.nextSuccessor()) {
+    LoopNode &Succ = AllNodes[(*Node.currentSuccessor())->getIndex()];
+
+    if (Succ.isDeleted())
+      continue;
+
+    if (!Succ.isVisited())
+      return &Succ;
+    else if (Succ.isOnStack())
+      Node.tryLink(Succ.getIndex());
+  }
+
+  if (Node.getLowLink() != Node.getIndex())
+    return nullptr;
+
+  // Single node means no loop in the CFG
+  if (LoopStack.back() == &Node) {
+    LoopStack.back()->setOnStack(false);
+    LoopStack.back()->setDeleted();
+    ++NumDeletedNodes;
+    LoopStack.pop_back();
+    return nullptr;
+  }
+
+  // Reaching here means a loop has been found! It consists of the nodes on
+  // the top of the stack, down until the current node being processed, Node,
+  // is found.
+  for (auto It = LoopStack.rbegin(); It != LoopStack.rend(); ++It) {
+    (*It)->setOnStack(false);
+    (*It)->incrementLoopNestDepth();
+    // Remove the loop from the stack and delete the head node
+    if (*It == &Node) {
+      (*It)->setDeleted();
+      ++NumDeletedNodes;
+      LoopStack.erase(It.base() - 1, LoopStack.end());
+      break;
+    }
+  }
+
+  return nullptr;
+}
+
+} // end of namespace Ice
diff --git a/src/IceLoopAnalyzer.h b/src/IceLoopAnalyzer.h
new file mode 100644
index 0000000..5991798
--- /dev/null
+++ b/src/IceLoopAnalyzer.h
@@ -0,0 +1,113 @@
+//===- subzero/src/IceLoopAnalyzer.h - Loop Analysis ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This analysis identifies loops in the CFG.
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICELOOPANALYZER_H
+#define SUBZERO_SRC_ICELOOPANALYZER_H
+
+#include "IceDefs.h"
+
+namespace Ice {
+
+/// Analyze a function's CFG for loops. The CFG must not change during the
+/// lifetime of this object.
+class LoopAnalyzer {
+  LoopAnalyzer() = delete;
+  LoopAnalyzer(const LoopAnalyzer &) = delete;
+  LoopAnalyzer &operator=(const LoopAnalyzer &) = delete;
+
+public:
+  explicit LoopAnalyzer(Cfg *Func);
+
+  /// Use Tarjan's strongly connected components algorithm to identify outermost
+  /// to innermost loops. By deleting the head of the loop from the graph, inner
+  /// loops can be found. This assumes that the head node is not shared between
+  /// loops but instead all paths to the head come from 'continue' constructs.
+  ///
+  /// This only computes the loop nest depth within the function and does not
+  /// take into account whether the function was called from within a loop.
+  void computeLoopNestDepth();
+
+private:
+  using IndexT = uint32_t;
+  static constexpr IndexT UndefinedIndex = 0;
+  static constexpr IndexT FirstDefinedIndex = 1;
+
+  // TODO(ascull): classify the other fields
+  class LoopNode {
+    LoopNode() = delete;
+    LoopNode operator=(const LoopNode &) = delete;
+
+  public:
+    explicit LoopNode(CfgNode *BB) : BB(BB) { reset(); }
+    LoopNode(const LoopNode &) = default;
+
+    void reset();
+
+    NodeList::const_iterator successorsEnd() const;
+    NodeList::const_iterator currentSuccessor() const { return Succ; }
+    void nextSuccessor() { ++Succ; }
+
+    void visit(IndexT VisitIndex) { Index = LowLink = VisitIndex; }
+    bool isVisited() const { return Index != UndefinedIndex; }
+    IndexT getIndex() const { return Index; }
+
+    void tryLink(IndexT NewLink) {
+      if (NewLink < LowLink)
+        LowLink = NewLink;
+    }
+    IndexT getLowLink() const { return LowLink; }
+
+    void setOnStack(bool NewValue = true) { OnStack = NewValue; }
+    bool isOnStack() const { return OnStack; }
+
+    void setDeleted() { Deleted = true; }
+    bool isDeleted() const { return Deleted; }
+
+    void incrementLoopNestDepth();
+
+  private:
+    CfgNode *BB;
+    NodeList::const_iterator Succ;
+    IndexT Index;
+    IndexT LowLink;
+    bool OnStack;
+    bool Deleted = false;
+  };
+
+  using LoopNodeList = std::vector<LoopNode, CfgLocalAllocator<LoopNode>>;
+  using LoopNodePtrList =
+      std::vector<LoopNode *, CfgLocalAllocator<LoopNode *>>;
+
+  /// Process the node as part as part of Tarjan's algorithm and return either
+  /// a node to recurse into or nullptr when the node has been fully processed.
+  LoopNode *processNode(LoopNode &Node);
+
+  /// The fuction to analyze for loops.
+  Cfg *const Func;
+  /// A list of decorated nodes in the same order as Func->getNodes() which
+  /// means the node's index will also be valid in this list.
+  LoopNodeList AllNodes;
+  /// This is used as a replacement for the call stack.
+  LoopNodePtrList WorkStack;
+  /// Track which loop a node belongs to.
+  LoopNodePtrList LoopStack;
+  /// The index to assign to the next visited node.
+  IndexT NextIndex = FirstDefinedIndex;
+  /// The number of nodes which have been marked deleted. This is used to track
+  /// when the iteration should end.
+  LoopNodePtrList::size_type NumDeletedNodes = 0;
+};
+
+} // end of namespace Ice
+
+#endif //  SUBZERO_SRC_ICELOOPANALYZER_H
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index c10656f..2013dcf 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -148,18 +148,26 @@
 
 RegWeight Variable::getWeight(const Cfg *Func) const {
   VariablesMetadata *VMetadata = Func->getVMetadata();
-  return RegWeight(mustHaveReg()
-                       ? RegWeight::Inf
-                       : mustNotHaveReg() ? RegWeight::Zero
-                                          : VMetadata->getUseWeight(this));
+  return mustHaveReg() ? RegWeight(RegWeight::Inf)
+                       : mustNotHaveReg() ? RegWeight(RegWeight::Zero)
+                                          : VMetadata->getUseWeight(this);
 }
 
 void VariableTracking::markUse(MetadataKind TrackingKind, const Inst *Instr,
                                CfgNode *Node, bool IsImplicit) {
   (void)TrackingKind;
 
-  // TODO(ascull): get the loop nest depth from CfgNode
-  UseWeight += 1;
+  // Increment the use weight depending on the loop nest depth. The weight is
+  // exponential in the nest depth as inner loops are expected to be executed
+  // an exponentially greater number of times.
+  constexpr uint32_t LogLoopTripCountEstimate = 2; // 2^2 = 4
+  constexpr SizeT MaxShift = sizeof(uint32_t) * CHAR_BIT - 1;
+  constexpr SizeT MaxLoopNestDepth = MaxShift / LogLoopTripCountEstimate;
+  const uint32_t LoopNestDepth =
+      std::min(Node->getLoopNestDepth(), MaxLoopNestDepth);
+  const uint32_t ThisUseWeight = uint32_t(1)
+                                 << LoopNestDepth * LogLoopTripCountEstimate;
+  UseWeight.addWeight(ThisUseWeight);
 
   if (MultiBlock == MBS_MultiBlock)
     return;
@@ -391,9 +399,9 @@
   return Metadata[VarNum].getNode();
 }
 
-uint32_t VariablesMetadata::getUseWeight(const Variable *Var) const {
+RegWeight VariablesMetadata::getUseWeight(const Variable *Var) const {
   if (!isTracked(Var))
-    return 1; // conservative answer
+    return RegWeight(1); // conservative answer
   SizeT VarNum = Var->getIndex();
   return Metadata[VarNum].getUseWeight();
 }
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 946f47d..9638066 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -321,13 +321,15 @@
   explicit RegWeight(uint32_t Weight) : Weight(Weight) {}
   RegWeight(const RegWeight &) = default;
   RegWeight &operator=(const RegWeight &) = default;
-  const static uint32_t Inf = ~0; /// Force regalloc to give a register
-  const static uint32_t Zero = 0; /// Force regalloc NOT to give a register
+  const static uint32_t Inf = ~0;      /// Force regalloc to give a register
+  const static uint32_t Zero = 0;      /// Force regalloc NOT to give a register
+  const static uint32_t Max = Inf - 1; /// Max natural weight.
   void addWeight(uint32_t Delta) {
     if (Delta == Inf)
       Weight = Inf;
     else if (Weight != Inf)
-      Weight += Delta;
+      if (Utils::add_overflow(Weight, Delta, &Weight) || Weight == Inf)
+        Weight = Max;
   }
   void addWeight(const RegWeight &Other) { addWeight(Other.Weight); }
   void setWeight(uint32_t Val) { Weight = Val; }
@@ -579,7 +581,7 @@
   const Inst *getSingleDefinition() const;
   const InstDefList &getLatterDefinitions() const { return Definitions; }
   CfgNode *getNode() const { return SingleUseNode; }
-  uint32_t getUseWeight() const { return UseWeight; }
+  RegWeight getUseWeight() const { return UseWeight; }
   void markUse(MetadataKind TrackingKind, const Inst *Instr, CfgNode *Node,
                bool IsImplicit);
   void markDef(MetadataKind TrackingKind, const Inst *Instr, CfgNode *Node);
@@ -594,7 +596,7 @@
   InstDefList Definitions; /// Only used if Kind==VMK_All
   const Inst *FirstOrSingleDefinition =
       nullptr; /// Is a copy of Definitions[0] if Kind==VMK_All
-  uint32_t UseWeight = 0;
+  RegWeight UseWeight;
 };
 
 /// VariablesMetadata analyzes and summarizes the metadata for the complete set
@@ -649,7 +651,7 @@
 
   /// Returns the total use weight computed as the sum of uses multiplied by a
   /// loop nest depth factor for each use.
-  uint32_t getUseWeight(const Variable *Var) const;
+  RegWeight getUseWeight(const Variable *Var) const;
 
 private:
   const Cfg *Func;
diff --git a/src/IceSwitchLowering.cpp b/src/IceSwitchLowering.cpp
index 53f7890..6207495 100644
--- a/src/IceSwitchLowering.cpp
+++ b/src/IceSwitchLowering.cpp
@@ -1,4 +1,4 @@
-//===- subzero/src/IceSwitchLowering.cpp - Switch lowering -----------------==//
+//===- subzero/src/IceSwitchLowering.cpp - Switch lowering ----------------===//
 //
 //                        The Subzero Code Generator
 //
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index bb0ba86..c061b33 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -325,6 +325,13 @@
     Func->dump("After Phi lowering");
   }
 
+  // Run this early so it can be used to focus optimizations on potentially hot
+  // code.
+  // TODO(stichnot,ascull): currently only used for regalloc not expensive high
+  // level optimizations which could be focused on potentially hot code.
+  Func->computeLoopNestDepth();
+  Func->dump("After loop nest depth analysis");
+
   // Address mode optimization.
   Func->getVMetadata()->init(VMK_SingleDefs);
   Func->doAddressOpt();
@@ -365,8 +372,9 @@
     return;
   Func->dump("After x86 codegen");
 
-  // Register allocation.  This requires instruction renumbering and full
-  // liveness analysis.
+  // Register allocation. This requires instruction renumbering and full
+  // liveness analysis. Loops must be identified before liveness so variable
+  // use weights are correct.
   Func->renumberInstructions();
   if (Func->hasError())
     return;
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 663cc39..9bcc53c 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -20,6 +20,7 @@
   X(O2)                        \
   X(Om1)                       \
   X(advancedPhiLowering)       \
+  X(computeLoopNestDepth)      \
   X(convertToIce)              \
   X(deletePhis)                \
   X(doAddressOpt)              \
diff --git a/src/IceUtils.h b/src/IceUtils.h
index 6d65ade..f07a566 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -70,6 +70,18 @@
             (X < 0 && Y < 0 && (X < std::numeric_limits<T>::min() - Y)));
   }
 
+  /// Adds x to y and stores the result in sum. Returns true if the addition
+  /// overflowed.
+  static inline bool add_overflow(uint32_t x, uint32_t y, uint32_t *sum) {
+    static_assert(std::is_same<uint32_t, unsigned>::value, "Must match type");
+#if __has_builtin(__builtin_uadd_overflow)
+    return __builtin_uadd_overflow(x, y, sum);
+#else
+    *sum = x + y;
+    return WouldOverflowAdd(x, y);
+#endif
+  }
+
   /// Return true if X is already aligned by N, where N is a power of 2.
   template <typename T> static inline bool IsAligned(T X, intptr_t N) {
     assert(llvm::isPowerOf2_64(N));
diff --git a/tests_lit/llvm2ice_tests/loop-nest-depth.ll b/tests_lit/llvm2ice_tests/loop-nest-depth.ll
new file mode 100644
index 0000000..16a017e
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/loop-nest-depth.ll
@@ -0,0 +1,280 @@
+; Test the the loop nest depth is correctly calculated for basic blocks.
+
+; REQUIRES: allow_dump
+
+; Single threaded so that the dumps used for checking happen in order
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 --verbose=loop \
+; RUN: --threads=0 | FileCheck %s
+
+define void @test_single_loop(i32 %a32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  br label %loop0
+
+loop0:                               ; <-+
+  br label %loop1                    ;   |
+loop1:                               ;   |
+  br i1 %a, label %loop0, label %out ; --+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_single_loop_with_continue(i32 %a32, i32 %b32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  br label %loop0
+
+loop0:                                 ; <-+
+  br label %loop1                      ;   |
+loop1:                                 ;   |
+  br i1 %a, label %loop0, label %loop2 ; --+
+loop2:                                 ;   |
+  br i1 %b, label %loop0, label %out   ; --+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop2:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_multiple_exits(i32 %a32, i32 %b32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  br label %loop0
+
+loop0:                               ; <-+
+  br label %loop1                    ;   |
+loop1:                               ;   |
+  br i1 %a, label %loop2, label %out ; --+-+
+loop2:                               ;   | |
+  br i1 %b, label %loop0, label %out ; --+ |
+                                     ;     |
+out:                                 ; <---+
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop2:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_two_nested_loops(i32 %a32, i32 %b32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  br label %loop0_0
+
+loop0_0:                                   ; <---+
+  br label %loop1_0                        ;     |
+loop1_0:                                   ; <-+ |
+  br label %loop1_1                        ;   | |
+loop1_1:                                   ;   | |
+  br i1 %a, label %loop1_0, label %loop0_1 ; --+ |
+loop0_1:                                   ;     |
+  br i1 %b, label %loop0_0, label %out     ; ----+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0_0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1_0:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop1_1:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop0_1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_two_nested_loops_with_continue(i32 %a32, i32 %b32, i32 %c32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  %c = trunc i32 %c32 to i1
+  br label %loop0_0
+
+loop0_0:                                   ; <---+
+  br label %loop1_0                        ;     |
+loop1_0:                                   ; <-+ |
+  br label %loop1_1                        ;   | |
+loop1_1:                                   ;   | |
+  br i1 %a, label %loop1_0, label %loop1_2 ; --+ |
+loop1_2:                                   ;   | |
+  br i1 %a, label %loop1_0, label %loop0_1 ; --+ |
+loop0_1:                                   ;     |
+  br i1 %b, label %loop0_0, label %out     ; ----+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0_0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1_0:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop1_1:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop1_2:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop0_1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_multiple_nested_loops(i32 %a32, i32 %b32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  br label %loop0_0
+
+loop0_0:                                   ; <---+
+  br label %loop1_0                        ;     |
+loop1_0:                                   ; <-+ |
+  br label %loop1_1                        ;   | |
+loop1_1:                                   ;   | |
+  br i1 %a, label %loop1_0, label %loop0_1 ; --+ |
+loop0_1:                                   ;     |
+  br label %loop2_0                        ;     |
+loop2_0:                                   ; <-+ |
+  br label %loop2_1                        ;   | |
+loop2_1:                                   ;   | |
+  br i1 %a, label %loop2_0, label %loop0_2 ; --+ |
+loop0_2:                                   ;     |
+  br i1 %b, label %loop0_0, label %out     ; ----+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0_0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1_0:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop1_1:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop0_1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop2_0:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop2_1:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop0_2:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_three_nested_loops(i32 %a32, i32 %b32, i32 %c32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  %b = trunc i32 %b32 to i1
+  %c = trunc i32 %c32 to i1
+  br label %loop0_0
+
+loop0_0:                                   ; <-----+
+  br label %loop1_0                        ;       |
+loop1_0:                                   ; <---+ |
+  br label %loop2_0                        ;     | |
+loop2_0:                                   ; <-+ | |
+  br label %loop2_1                        ;   | | |
+loop2_1:                                   ;   | | |
+  br i1 %a, label %loop2_0, label %loop1_1 ; --+ | |
+loop1_1:                                   ;     | |
+  br i1 %b, label %loop1_0, label %loop0_1 ; ----+ |
+loop0_1:                                   ;       |
+  br i1 %c, label %loop0_0, label %out     ; ------+
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: loop0_0:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: loop1_0:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop2_0:
+; CHECK-NEXT: LoopNestDepth = 3
+; CHECK-NEXT: loop2_1:
+; CHECK-NEXT: LoopNestDepth = 3
+; CHECK-NEXT: loop1_1:
+; CHECK-NEXT: LoopNestDepth = 2
+; CHECK-NEXT: loop0_1:
+; CHECK-NEXT: LoopNestDepth = 1
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW
+
+define void @test_diamond(i32 %a32) {
+entry:
+  %a = trunc i32 %a32 to i1
+  br i1 %a, label %left, label %right
+
+left:
+  br label %out
+
+right:
+  br label %out
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: After loop nest depth analysis
+; CHECK-NEXT: entry:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: left:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: right:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-NEXT: out:
+; CHECK-NEXT: LoopNestDepth = 0
+; CHECK-LABEL: Before RMW