Subzero: Rewrite the pass timing infrastructure.

This makes it much more useful for individual analysis and long-term translation performance tracking.

1. Collect and report aggregated across the entire translation, instead of function-by-function.  If you really care about a single function, just extract it and translate it separately for analysis.

2. Remove "-verbose time" and just use -timing.

3. Collects two kinds of timings: cumulative and flat.  Cumulative measures the total time, even if a callee also times itself.  Flat only measures the currently active timer at the top of the stack.  The flat times should add up to 100%, but cumulative will usually add up to much more than 100%.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/610813002
diff --git a/Makefile.standalone b/Makefile.standalone
index a913497..175dd12 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -80,9 +80,10 @@
 	IceMemoryRegion.cpp \
 	IceOperand.cpp \
 	IceRegAlloc.cpp \
-        IceRNG.cpp \
+	IceRNG.cpp \
 	IceTargetLowering.cpp \
 	IceTargetLoweringX8632.cpp \
+	IceTimerTree.cpp \
 	IceTranslator.cpp \
 	IceTypeConverter.cpp \
 	IceTypes.cpp \
@@ -113,7 +114,7 @@
 $(OBJDIR):
 	@mkdir -p $@
 
-check-lit: llvm2ice
+check-lit: llvm2ice make_symlink
 	LLVM_BIN_PATH=$(LLVM_BIN_PATH) \
 	$(LLVM_SRC_PATH)/utils/lit/lit.py -sv tests_lit
 
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 8f51f43..d2c83ab 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -69,14 +69,14 @@
 void Cfg::translate() {
   if (hasError())
     return;
+  static TimerIdT IDtranslate = GlobalContext::getTimerID("translate");
+  TimerMarker T(IDtranslate, getContext());
 
   dump("Initial CFG");
 
-  Timer T_translate;
   // The set of translation passes and their order are determined by
   // the target.
   getTarget()->translate();
-  T_translate.printElapsedUs(getContext(), "translate()");
 
   dump("Final output");
 }
@@ -88,6 +88,9 @@
 }
 
 void Cfg::renumberInstructions() {
+  static TimerIdT IDrenumberInstructions =
+      GlobalContext::getTimerID("renumberInstructions");
+  TimerMarker T(IDrenumberInstructions, getContext());
   NextInstNumber = 1;
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->renumberInstructions();
@@ -96,6 +99,8 @@
 
 // placePhiLoads() must be called before placePhiStores().
 void Cfg::placePhiLoads() {
+  static TimerIdT IDplacePhiLoads = GlobalContext::getTimerID("placePhiLoads");
+  TimerMarker T(IDplacePhiLoads, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->placePhiLoads();
   }
@@ -103,34 +108,48 @@
 
 // placePhiStores() must be called after placePhiLoads().
 void Cfg::placePhiStores() {
+  static TimerIdT IDplacePhiStores =
+      GlobalContext::getTimerID("placePhiStores");
+  TimerMarker T(IDplacePhiStores, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->placePhiStores();
   }
 }
 
 void Cfg::deletePhis() {
+  static TimerIdT IDdeletePhis = GlobalContext::getTimerID("deletePhis");
+  TimerMarker T(IDdeletePhis, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->deletePhis();
   }
 }
 
 void Cfg::doArgLowering() {
+  static TimerIdT IDdoArgLowering = GlobalContext::getTimerID("doArgLowering");
+  TimerMarker T(IDdoArgLowering, getContext());
   getTarget()->lowerArguments();
 }
 
 void Cfg::doAddressOpt() {
+  static TimerIdT IDdoAddressOpt = GlobalContext::getTimerID("doAddressOpt");
+  TimerMarker T(IDdoAddressOpt, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->doAddressOpt();
   }
 }
 
 void Cfg::doNopInsertion() {
+  static TimerIdT IDdoNopInsertion =
+      GlobalContext::getTimerID("doNopInsertion");
+  TimerMarker T(IDdoNopInsertion, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->doNopInsertion();
   }
 }
 
 void Cfg::genCode() {
+  static TimerIdT IDgenCode = GlobalContext::getTimerID("genCode");
+  TimerMarker T(IDgenCode, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->genCode();
   }
@@ -138,6 +157,8 @@
 
 // Compute the stack frame layout.
 void Cfg::genFrame() {
+  static TimerIdT IDgenFrame = GlobalContext::getTimerID("genFrame");
+  TimerMarker T(IDgenFrame, getContext());
   getTarget()->addProlog(Entry);
   // TODO: Consider folding epilog generation into the final
   // emission/assembly pass to avoid an extra iteration over the node
@@ -154,6 +175,9 @@
 // completely with a single block.  It is a quick single pass and
 // doesn't need to iterate until convergence.
 void Cfg::livenessLightweight() {
+  static TimerIdT IDlivenessLightweight =
+      GlobalContext::getTimerID("livenessLightweight");
+  TimerMarker T(IDlivenessLightweight, getContext());
   getVMetadata()->init();
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->livenessLightweight();
@@ -161,6 +185,8 @@
 }
 
 void Cfg::liveness(LivenessMode Mode) {
+  static TimerIdT IDliveness = GlobalContext::getTimerID("liveness");
+  TimerMarker T(IDliveness, getContext());
   Live.reset(new Liveness(this, Mode));
   getVMetadata()->init();
   Live->init();
@@ -199,9 +225,10 @@
   // Collect timing for just the portion that constructs the live
   // range intervals based on the end-of-live-range computation, for a
   // finer breakdown of the cost.
-  Timer T_liveRange;
   // Make a final pass over instructions to delete dead instructions
   // and build each Variable's live range.
+  static TimerIdT IDliveRange = GlobalContext::getTimerID("liveRange");
+  TimerMarker T1(IDliveRange, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->livenessPostprocess(Mode, getLiveness());
   }
@@ -241,7 +268,6 @@
       if (Var->getWeight().isInf())
         Var->setLiveRangeInfiniteWeight();
     }
-    T_liveRange.printElapsedUs(getContext(), "live range construction");
     dump();
   }
 }
@@ -249,6 +275,9 @@
 // Traverse every Variable of every Inst and verify that it
 // appears within the Variable's computed live range.
 bool Cfg::validateLiveness() const {
+  static TimerIdT IDvalidateLiveness =
+      GlobalContext::getTimerID("validateLiveness");
+  TimerMarker T(IDvalidateLiveness, getContext());
   bool Valid = true;
   Ostream &Str = Ctx->getStrDump();
   for (NodeList::const_iterator I1 = Nodes.begin(), E1 = Nodes.end(); I1 != E1;
@@ -296,18 +325,21 @@
 }
 
 void Cfg::doBranchOpt() {
+  static TimerIdT IDdoBranchOpt = GlobalContext::getTimerID("doBranchOpt");
+  TimerMarker T(IDdoBranchOpt, getContext());
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     NodeList::iterator NextNode = I;
     ++NextNode;
-    (*I)->doBranchOpt(*NextNode);
+    (*I)->doBranchOpt(NextNode == E ? NULL : *NextNode);
   }
 }
 
 // ======================== Dump routines ======================== //
 
 void Cfg::emit() {
+  static TimerIdT IDemit = GlobalContext::getTimerID("emit");
+  TimerMarker T(IDemit, getContext());
   Ostream &Str = Ctx->getStrEmit();
-  Timer T_emit;
   if (!Ctx->testAndSetHasEmittedFirstMethod()) {
     // Print a helpful command for assembling the output.
     // TODO: have the Target emit the header
@@ -339,7 +371,6 @@
     (*I)->emit(this);
   }
   Str << "\n";
-  T_emit.printElapsedUs(Ctx, "emit()");
 }
 
 // Dumps the IR with an optional introductory message.
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index 4888e8f..9910f06 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -11,11 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "IceConverter.h"
-
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceClFlags.h"
+#include "IceConverter.h"
 #include "IceDefs.h"
 #include "IceGlobalContext.h"
 #include "IceInst.h"
@@ -60,6 +59,9 @@
 
   // Caller is expected to delete the returned Ice::Cfg object.
   Ice::Cfg *convertFunction(const Function *F) {
+    static Ice::TimerIdT IDllvmConvert =
+        Ice::GlobalContext::getTimerID("llvmConvert");
+    Ice::TimerMarker T(IDllvmConvert, Ctx);
     VarMap.clear();
     NodeMap.clear();
     Func = new Ice::Cfg(Ctx);
@@ -621,6 +623,8 @@
 namespace Ice {
 
 void Converter::convertToIce() {
+  static TimerIdT IDconvertToIce = GlobalContext::getTimerID("convertToIce");
+  TimerMarker T(IDconvertToIce, Ctx);
   nameUnnamedGlobalAddresses(Mod);
   if (!Ctx->getFlags().DisableGlobals)
     convertGlobals(Mod);
@@ -635,11 +639,6 @@
 
     Timer TConvert;
     Cfg *Fcn = FunctionConverter.convertFunction(I);
-    if (Ctx->getFlags().SubzeroTimingEnabled) {
-      std::cerr << "[Subzero timing] Convert function "
-                << Fcn->getFunctionName() << ": " << TConvert.getElapsedSec()
-                << " sec\n";
-    }
     translateFcn(Fcn);
   }
 
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 4aa99e2..68f30c7 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -78,6 +78,8 @@
 // numbers are used for representing Variable live ranges.
 typedef int32_t InstNumberT;
 
+typedef uint32_t TimerIdT;
+
 enum LivenessMode {
   // Basic version of live-range-end calculation.  Marks the last uses
   // of variables based on dataflow analysis.  Records the set of
@@ -103,34 +105,14 @@
   IceV_RegOrigins = 1 << 7,
   IceV_LinearScan = 1 << 8,
   IceV_Frame = 1 << 9,
-  IceV_Timing = 1 << 10,
-  IceV_AddrOpt = 1 << 11,
+  IceV_AddrOpt = 1 << 10,
   IceV_All = ~IceV_None,
-  IceV_Most = IceV_All & ~(IceV_Timing | IceV_LinearScan)
+  IceV_Most = IceV_All & ~IceV_LinearScan
 };
 typedef uint32_t VerboseMask;
 
 typedef llvm::raw_ostream Ostream;
 
-// TODO: Implement in terms of std::chrono after switching to C++11.
-class Timer {
-public:
-  Timer() : Start(llvm::TimeRecord::getCurrentTime(false)) {}
-  uint64_t getElapsedNs() const { return getElapsedSec() * 1000 * 1000 * 1000; }
-  uint64_t getElapsedUs() const { return getElapsedSec() * 1000 * 1000; }
-  uint64_t getElapsedMs() const { return getElapsedSec() * 1000; }
-  double getElapsedSec() const {
-    llvm::TimeRecord End = llvm::TimeRecord::getCurrentTime(false);
-    return End.getWallTime() - Start.getWallTime();
-  }
-  void printElapsedUs(GlobalContext *Ctx, const IceString &Tag) const;
-
-private:
-  const llvm::TimeRecord Start;
-  Timer(const Timer &) LLVM_DELETED_FUNCTION;
-  Timer &operator=(const Timer &) LLVM_DELETED_FUNCTION;
-};
-
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEDEFS_H
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 319e18a..4cca6c9 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -1,4 +1,4 @@
-//===- subzero/src/IceGlobalContext.cpp - Global context defs ---*- C++ -*-===//
+//===- subzero/src/IceGlobalContext.cpp - Global context defs -------------===//
 //
 //                        The Subzero Code Generator
 //
@@ -22,6 +22,7 @@
 #include "IceGlobalContext.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
+#include "IceTimerTree.h"
 
 namespace Ice {
 
@@ -122,7 +123,7 @@
     : StrDump(OsDump), StrEmit(OsEmit), VMask(Mask),
       ConstPool(new ConstantPool()), Arch(Arch), Opt(Opt),
       TestPrefix(TestPrefix), Flags(Flags), HasEmittedFirstMethod(false),
-      RNG("") {}
+      RNG(""), Timers(new TimerStack("main")) {}
 
 // Scan a string for S[0-9A-Z]*_ patterns and replace them with
 // S<num>_ where <num> is the next base-36 value.  If a type name
@@ -384,6 +385,14 @@
   llvm_unreachable("Unknown type");
 }
 
+TimerIdT GlobalContext::getTimerID(const IceString &Name) {
+  return TimerStack::getTimerID(Name);
+}
+
+void GlobalContext::pushTimer(TimerIdT ID) { Timers->push(ID); }
+
+void GlobalContext::popTimer(TimerIdT ID) { Timers->pop(ID); }
+
 void GlobalContext::dumpStats(const IceString &Name, bool Final) {
   if (Flags.DumpStats) {
     if (Final) {
@@ -395,12 +404,6 @@
   }
 }
 
-void Timer::printElapsedUs(GlobalContext *Ctx, const IceString &Tag) const {
-  if (Ctx->isVerbose(IceV_Timing)) {
-    // Prefixing with '#' allows timing strings to be included
-    // without error in textual assembly output.
-    Ctx->getStrDump() << "# " << getElapsedUs() << " usec " << Tag << "\n";
-  }
-}
+void GlobalContext::dumpTimers() { Timers->dump(getStrDump()); }
 
 } // end of namespace Ice
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index f997b65..3c53773 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -69,12 +69,8 @@
 
   // Returns true if any of the specified options in the verbose mask
   // are set.  If the argument is omitted, it checks if any verbose
-  // options at all are set.  IceV_Timing is treated specially, so
-  // that running with just IceV_Timing verbosity doesn't trigger an
-  // avalanche of extra output.
-  bool isVerbose(VerboseMask Mask = (IceV_All & ~IceV_Timing)) const {
-    return VMask & Mask;
-  }
+  // options at all are set.
+  bool isVerbose(VerboseMask Mask = IceV_All) const { return VMask & Mask; }
   void setVerbose(VerboseMask Mask) { VMask = Mask; }
   void addVerbose(VerboseMask Mask) { VMask |= Mask; }
   void subVerbose(VerboseMask Mask) { VMask &= ~Mask; }
@@ -154,6 +150,11 @@
     StatsCumulative.updateFills();
   }
 
+  static TimerIdT getTimerID(const IceString &Name);
+  void pushTimer(TimerIdT ID);
+  void popTimer(TimerIdT ID);
+  void dumpTimers();
+
 private:
   Ostream *StrDump; // Stream for dumping / diagnostics
   Ostream *StrEmit; // Stream for code emission
@@ -170,6 +171,7 @@
   RandomNumberGenerator RNG;
   CodeStats StatsFunction;
   CodeStats StatsCumulative;
+  llvm::OwningPtr<class TimerStack> Timers;
   GlobalContext(const GlobalContext &) LLVM_DELETED_FUNCTION;
   GlobalContext &operator=(const GlobalContext &) LLVM_DELETED_FUNCTION;
 
@@ -178,6 +180,30 @@
   void incrementSubstitutions(ManglerVector &OldName) const;
 };
 
+// Helper class to push and pop a timer marker.  The constructor
+// pushes a marker, and the destructor pops it.  This is for
+// convenient timing of regions of code.
+class TimerMarker {
+  TimerMarker(const TimerMarker &) LLVM_DELETED_FUNCTION;
+  TimerMarker &operator=(const TimerMarker &) LLVM_DELETED_FUNCTION;
+
+public:
+  TimerMarker(TimerIdT ID, GlobalContext *Ctx)
+      : ID(ID), Ctx(Ctx), Active(Ctx->getFlags().SubzeroTimingEnabled) {
+    if (Active)
+      Ctx->pushTimer(ID);
+  }
+  ~TimerMarker() {
+    if (Active)
+      Ctx->popTimer(ID);
+  }
+
+private:
+  TimerIdT ID;
+  GlobalContext *const Ctx;
+  const bool Active;
+};
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEGLOBALCONTEXT_H
diff --git a/src/IceInst.h b/src/IceInst.h
index 5c50d2f..3727e6e 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -509,9 +509,9 @@
 class InstLoad : public InstHighLevel {
 public:
   static InstLoad *create(Cfg *Func, Variable *Dest, Operand *SourceAddr,
-                          uint32_t align = 1) {
+                          uint32_t Align = 1) {
     // TODO(kschimpf) Stop ignoring alignment specification.
-    (void)align;
+    (void)Align;
     return new (Func->allocateInst<InstLoad>())
         InstLoad(Func, Dest, SourceAddr);
   }
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index d1854f4..3614725 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -122,9 +122,26 @@
 }
 
 bool LiveRange::overlaps(InstNumberT OtherBegin) const {
+  bool Result = false;
+  for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+       ++I) {
+    if (OtherBegin < I->first) {
+      Result = false;
+      break;
+    }
+    if (OtherBegin < I->second) {
+      Result = true;
+      break;
+    }
+  }
+#if 0
+  // An equivalent but less inefficient implementation:
   LiveRange Temp;
   Temp.addSegment(OtherBegin, OtherBegin + 1);
-  return overlaps(Temp);
+  bool Validation = overlaps(Temp);
+  assert(Result == Validation);
+#endif
+  return Result;
 }
 
 // Returns true if the live range contains the given instruction
@@ -259,6 +276,8 @@
 }
 
 void VariablesMetadata::init() {
+  static TimerIdT IDvmetadata = GlobalContext::getTimerID("vmetadata");
+  TimerMarker T(IDvmetadata, Func->getContext());
   Metadata.clear();
   Metadata.resize(Func->getNumVariables());
 
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 36ada12..e072085 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -64,12 +64,15 @@
 // preparation.  Results are assigned to Variable::RegNum for each
 // Variable.
 void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
+  static TimerIdT IDscan = GlobalContext::getTimerID("linearScan");
+  TimerMarker T(IDscan, Func->getContext());
   assert(RegMaskFull.any()); // Sanity check
   Unhandled.clear();
   Handled.clear();
   Inactive.clear();
   Active.clear();
   Ostream &Str = Func->getContext()->getStrDump();
+  bool Verbose = Func->getContext()->isVerbose(IceV_LinearScan);
   Func->resetCurrentNode();
   VariablesMetadata *VMetadata = Func->getVMetadata();
 
@@ -81,26 +84,32 @@
   // a result, it may be useful to design a better data structure for
   // storing Func->getVariables().
   const VarList &Vars = Func->getVariables();
-  for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E; ++I) {
-    Variable *Var = *I;
-    // Explicitly don't consider zero-weight variables, which are
-    // meant to be spill slots.
-    if (Var->getWeight() == RegWeight::Zero)
-      continue;
-    // Don't bother if the variable has a null live range, which means
-    // it was never referenced.
-    if (Var->getLiveRange().isEmpty())
-      continue;
-    Unhandled.insert(LiveRangeWrapper(Var));
-    if (Var->hasReg()) {
-      Var->setRegNumTmp(Var->getRegNum());
-      Var->setLiveRangeInfiniteWeight();
+  {
+    static TimerIdT IDinitUnhandled =
+        GlobalContext::getTimerID("initUnhandled");
+    TimerMarker T(IDinitUnhandled, Func->getContext());
+    for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E;
+         ++I) {
+      Variable *Var = *I;
+      // Explicitly don't consider zero-weight variables, which are
+      // meant to be spill slots.
+      if (Var->getWeight() == RegWeight::Zero)
+        continue;
+      // Don't bother if the variable has a null live range, which means
+      // it was never referenced.
+      if (Var->getLiveRange().isEmpty())
+        continue;
+      Unhandled.insert(LiveRangeWrapper(Var));
+      if (Var->hasReg()) {
+        Var->setRegNumTmp(Var->getRegNum());
+        Var->setLiveRangeInfiniteWeight();
+      }
     }
   }
 
   // RegUses[I] is the number of live ranges (variables) that register
   // I is currently assigned to.  It can be greater than 1 as a result
-  // of Variable::AllowRegisterOverlap.
+  // of AllowOverlap inference below.
   std::vector<int> RegUses(RegMaskFull.size());
   // Unhandled is already set to all ranges in increasing order of
   // start points.
@@ -112,7 +121,7 @@
   while (!Unhandled.empty()) {
     LiveRangeWrapper Cur = *Unhandled.begin();
     Unhandled.erase(Unhandled.begin());
-    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+    if (Verbose) {
       Str << "\nConsidering  ";
       Cur.dump(Func);
       Str << "\n";
@@ -130,7 +139,7 @@
       int32_t RegNum = Cur.Var->getRegNum();
       // RegNumTmp should have already been set above.
       assert(Cur.Var->getRegNumTmp() == RegNum);
-      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      if (Verbose) {
         Str << "Precoloring  ";
         Cur.dump(Func);
         Str << "\n";
@@ -150,7 +159,7 @@
       bool Moved = false;
       if (Item.endsBefore(Cur)) {
         // Move Item from Active to Handled list.
-        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        if (Verbose) {
           Str << "Expiring     ";
           Item.dump(Func);
           Str << "\n";
@@ -160,7 +169,7 @@
         Moved = true;
       } else if (!Item.overlapsStart(Cur)) {
         // Move Item from Active to Inactive list.
-        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        if (Verbose) {
           Str << "Inactivating ";
           Item.dump(Func);
           Str << "\n";
@@ -186,7 +195,7 @@
       LiveRangeWrapper Item = *I;
       if (Item.endsBefore(Cur)) {
         // Move Item from Inactive to Handled list.
-        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        if (Verbose) {
           Str << "Expiring     ";
           Item.dump(Func);
           Str << "\n";
@@ -195,7 +204,7 @@
         Handled.push_back(Item);
       } else if (Item.overlapsStart(Cur)) {
         // Move Item from Inactive to Active list.
-        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        if (Verbose) {
           Str << "Reactivating ";
           Item.dump(Func);
           Str << "\n";
@@ -261,7 +270,7 @@
         }
       }
     }
-    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+    if (Verbose) {
       if (Prefer) {
         Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
             << " LIVE=" << Prefer->getLiveRange() << " Overlap=" << AllowOverlap
@@ -279,8 +288,7 @@
         int32_t RegNum = Item.Var->getRegNumTmp();
         // Don't assert(Free[RegNum]) because in theory (though
         // probably never in practice) there could be two inactive
-        // variables that were allowed marked with
-        // AllowRegisterOverlap.
+        // variables that were marked with AllowOverlap.
         Free[RegNum] = false;
         // Disable AllowOverlap if an Inactive variable, which is not
         // Prefer, shares Prefer's register, and has a definition
@@ -331,7 +339,7 @@
     }
 
     // Print info about physical register availability.
-    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+    if (Verbose) {
       for (SizeT i = 0; i < RegMask.size(); ++i) {
         if (RegMask[i]) {
           Str << Func->getTarget()->getRegName(i, IceType_i32)
@@ -346,7 +354,7 @@
       // First choice: a preferred register that is either free or is
       // allowed to overlap with its linked variable.
       Cur.Var->setRegNumTmp(PreferReg);
-      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      if (Verbose) {
         Str << "Preferring   ";
         Cur.dump(Func);
         Str << "\n";
@@ -360,7 +368,7 @@
       // picking the lowest-numbered available register?
       int32_t RegNum = Free.find_first();
       Cur.Var->setRegNumTmp(RegNum);
-      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      if (Verbose) {
         Str << "Allocating   ";
         Cur.dump(Func);
         Str << "\n";
@@ -434,7 +442,7 @@
           ++Next;
           LiveRangeWrapper Item = *I;
           if (Item.Var->getRegNumTmp() == MinWeightIndex) {
-            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+            if (Verbose) {
               Str << "Evicting     ";
               Item.dump(Func);
               Str << "\n";
@@ -463,7 +471,7 @@
           // instructions.
           if (Item.Var->getRegNumTmp() == MinWeightIndex &&
               Item.overlaps(Cur)) {
-            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+            if (Verbose) {
               Str << "Evicting     ";
               Item.dump(Func);
               Str << "\n";
@@ -478,7 +486,7 @@
         assert(RegUses[MinWeightIndex] >= 0);
         ++RegUses[MinWeightIndex];
         Active.push_back(Cur);
-        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        if (Verbose) {
           Str << "Allocating   ";
           Cur.dump(Func);
           Str << "\n";
@@ -509,7 +517,7 @@
        I != E; ++I) {
     LiveRangeWrapper Item = *I;
     int32_t RegNum = Item.Var->getRegNumTmp();
-    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+    if (Verbose) {
       if (!Item.Var->hasRegTmp()) {
         Str << "Not assigning ";
         Item.Var->dump(Func);
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index f663155..76d3473 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -229,6 +229,8 @@
 // registers could potentially be parameterized if we want to restrict
 // registers e.g. for performance testing.
 void TargetLowering::regAlloc() {
+  static TimerIdT IDregAlloc = GlobalContext::getTimerID("regAlloc");
+  TimerMarker T(IDregAlloc, Ctx);
   LinearScan LinearScan(Func);
   RegSetMask RegInclude = RegSet_None;
   RegSetMask RegExclude = RegSet_None;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 96cc57b..b9d3170 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -302,35 +302,27 @@
 
 void TargetX8632::translateO2() {
   GlobalContext *Context = Func->getContext();
+  static TimerIdT IDO2 = GlobalContext::getTimerID("O2");
+  TimerMarker T(IDO2, Context);
 
   // Lower Phi instructions.
-  Timer T_placePhiLoads;
   Func->placePhiLoads();
   if (Func->hasError())
     return;
-  T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
-  Timer T_placePhiStores;
   Func->placePhiStores();
   if (Func->hasError())
     return;
-  T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
-  Timer T_deletePhis;
   Func->deletePhis();
   if (Func->hasError())
     return;
-  T_deletePhis.printElapsedUs(Context, "deletePhis()");
   Func->dump("After Phi lowering");
 
   // Address mode optimization.
-  Timer T_doAddressOpt;
   Func->getVMetadata()->init();
   Func->doAddressOpt();
-  T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
 
   // Argument lowering
-  Timer T_argLowering;
   Func->doArgLowering();
-  T_argLowering.printElapsedUs(Context, "lowerArguments()");
 
   // Target lowering.  This requires liveness analysis for some parts
   // of the lowering decisions, such as compare/branch fusing.  If
@@ -338,40 +330,30 @@
   // to be renumbered first.  TODO: This renumbering should only be
   // necessary if we're actually calculating live intervals, which we
   // only do for register allocation.
-  Timer T_renumber1;
   Func->renumberInstructions();
   if (Func->hasError())
     return;
-  T_renumber1.printElapsedUs(Context, "renumberInstructions()");
 
   // TODO: It should be sufficient to use the fastest liveness
   // calculation, i.e. livenessLightweight().  However, for some
   // reason that slows down the rest of the translation.  Investigate.
-  Timer T_liveness1;
   Func->liveness(Liveness_Basic);
   if (Func->hasError())
     return;
-  T_liveness1.printElapsedUs(Context, "liveness()");
   Func->dump("After x86 address mode opt");
 
-  Timer T_genCode;
   Func->genCode();
   if (Func->hasError())
     return;
-  T_genCode.printElapsedUs(Context, "genCode()");
 
   // Register allocation.  This requires instruction renumbering and
   // full liveness analysis.
-  Timer T_renumber2;
   Func->renumberInstructions();
   if (Func->hasError())
     return;
-  T_renumber2.printElapsedUs(Context, "renumberInstructions()");
-  Timer T_liveness2;
   Func->liveness(Liveness_Intervals);
   if (Func->hasError())
     return;
-  T_liveness2.printElapsedUs(Context, "liveness()");
   // Validate the live range computations.  Do it outside the timing
   // code.  TODO: Put this under a flag.
   bool ValidLiveness = Func->validateLiveness();
@@ -381,20 +363,16 @@
   // The post-codegen dump is done here, after liveness analysis and
   // associated cleanup, to make the dump cleaner and more useful.
   Func->dump("After initial x8632 codegen");
-  Timer T_regAlloc;
   Func->getVMetadata()->init();
   regAlloc();
   if (Func->hasError())
     return;
-  T_regAlloc.printElapsedUs(Context, "regAlloc()");
   Func->dump("After linear scan regalloc");
 
   // Stack frame mapping.
-  Timer T_genFrame;
   Func->genFrame();
   if (Func->hasError())
     return;
-  T_genFrame.printElapsedUs(Context, "genFrame()");
   Func->dump("After stack frame mapping");
 
   // Branch optimization.  This needs to be done just before code
@@ -413,39 +391,29 @@
 
 void TargetX8632::translateOm1() {
   GlobalContext *Context = Func->getContext();
-  Timer T_placePhiLoads;
+  static TimerIdT IDOm1 = GlobalContext::getTimerID("Om1");
+  TimerMarker T(IDOm1, Context);
   Func->placePhiLoads();
   if (Func->hasError())
     return;
-  T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
-  Timer T_placePhiStores;
   Func->placePhiStores();
   if (Func->hasError())
     return;
-  T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
-  Timer T_deletePhis;
   Func->deletePhis();
   if (Func->hasError())
     return;
-  T_deletePhis.printElapsedUs(Context, "deletePhis()");
   Func->dump("After Phi lowering");
 
-  Timer T_argLowering;
   Func->doArgLowering();
-  T_argLowering.printElapsedUs(Context, "lowerArguments()");
 
-  Timer T_genCode;
   Func->genCode();
   if (Func->hasError())
     return;
-  T_genCode.printElapsedUs(Context, "genCode()");
   Func->dump("After initial x8632 codegen");
 
-  Timer T_genFrame;
   Func->genFrame();
   if (Func->hasError())
     return;
-  T_genFrame.printElapsedUs(Context, "genFrame()");
   Func->dump("After stack frame mapping");
 
   // Nop insertion
diff --git a/src/IceTimerTree.cpp b/src/IceTimerTree.cpp
new file mode 100644
index 0000000..4bdb80a
--- /dev/null
+++ b/src/IceTimerTree.cpp
@@ -0,0 +1,142 @@
+//===- subzero/src/IceTimerTree.cpp - Pass timer defs ---------------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TimerTree class, which tracks flat and
+// cumulative execution time collection of call chains.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceDefs.h"
+#include "IceTimerTree.h"
+
+namespace Ice {
+
+std::vector<IceString> TimerStack::IDs;
+
+TimerStack::TimerStack(const IceString &TopLevelName)
+    : FirstTimestamp(timestamp()), LastTimestamp(FirstTimestamp),
+      StateChangeCount(0), StackTop(0) {
+  Nodes.resize(1); // Reserve Nodes[0] for the root node.
+  push(getTimerID(TopLevelName));
+}
+
+// Returns the unique timer ID for the given Name, creating a new ID
+// if needed.  For performance reasons, it's best to make only one
+// call per Name and cache the result, e.g. via a static initializer.
+TimerIdT TimerStack::getTimerID(const IceString &Name) {
+  TimerIdT Size = IDs.size();
+  for (TimerIdT i = 0; i < Size; ++i) {
+    if (IDs[i] == Name)
+      return i;
+  }
+  IDs.push_back(Name);
+  return Size;
+}
+
+// Pushes a new marker onto the timer stack.
+void TimerStack::push(TimerIdT ID) {
+  update();
+  if (Nodes[StackTop].Children.size() <= ID)
+    Nodes[StackTop].Children.resize(ID + 1);
+  if (Nodes[StackTop].Children[ID] == 0) {
+    TTindex Size = Nodes.size();
+    Nodes[StackTop].Children[ID] = Size;
+    Nodes.resize(Size + 1);
+    Nodes[Size].Parent = StackTop;
+    Nodes[Size].Interior = ID;
+  }
+  StackTop = Nodes[StackTop].Children[ID];
+}
+
+// Pop the top marker from the timer stack.  Validates via assert()
+// that the expected marker is popped.
+void TimerStack::pop(TimerIdT ID) {
+  update();
+  assert(StackTop);
+  assert(Nodes[StackTop].Parent < StackTop);
+  // Verify that the expected ID is being popped.
+  assert(Nodes[StackTop].Interior == ID);
+  (void)ID;
+  // Verify that the parent's child points to the current stack top.
+  assert(Nodes[Nodes[StackTop].Parent].Children[ID] == StackTop);
+  StackTop = Nodes[StackTop].Parent;
+}
+
+// At a state change (e.g. push or pop), updates the flat and
+// cumulative timings for everything on the timer stack.
+void TimerStack::update() {
+  ++StateChangeCount;
+  // Whenever the stack is about to change, we grab the time delta
+  // since the last change and add it to all active cumulative
+  // elements and to the flat element for the top of the stack.
+  double Current = timestamp();
+  double Delta = Current - LastTimestamp;
+  LastTimestamp = Current;
+  if (StackTop) {
+    TimerIdT Leaf = Nodes[StackTop].Interior;
+    if (Leaf >= LeafTimes.size())
+      LeafTimes.resize(Leaf + 1);
+    LeafTimes[Leaf] += Delta;
+  }
+  TTindex Prefix = StackTop;
+  while (Prefix) {
+    Nodes[Prefix].Time += Delta;
+    TTindex Next = Nodes[Prefix].Parent;
+    assert(Next < Prefix);
+    Prefix = Next;
+  }
+}
+
+namespace {
+
+typedef std::multimap<double, IceString> DumpMapType;
+
+// Dump the Map items in reverse order of their time contribution.
+void dumpHelper(Ostream &Str, const DumpMapType &Map, double TotalTime) {
+  for (DumpMapType::const_reverse_iterator I = Map.rbegin(), E = Map.rend();
+       I != E; ++I) {
+    char buf[80];
+    snprintf(buf, llvm::array_lengthof(buf), "  %10.6f (%4.1f%%): ", I->first,
+             I->first * 100 / TotalTime);
+    Str << buf << I->second << "\n";
+  }
+}
+
+} // end of anonymous namespace
+
+void TimerStack::dump(Ostream &Str) {
+  update();
+  double TotalTime = LastTimestamp - FirstTimestamp;
+  assert(TotalTime);
+  Str << "Cumulative function times:\n";
+  DumpMapType CumulativeMap;
+  for (TTindex i = 1; i < Nodes.size(); ++i) {
+    TTindex Prefix = i;
+    IceString Suffix = "";
+    while (Prefix) {
+      if (Suffix.empty())
+        Suffix = IDs[Nodes[Prefix].Interior];
+      else
+        Suffix = IDs[Nodes[Prefix].Interior] + "." + Suffix;
+      assert(Nodes[Prefix].Parent < Prefix);
+      Prefix = Nodes[Prefix].Parent;
+    }
+    CumulativeMap.insert(std::make_pair(Nodes[i].Time, Suffix));
+  }
+  dumpHelper(Str, CumulativeMap, TotalTime);
+  Str << "Flat function times:\n";
+  DumpMapType FlatMap;
+  for (TimerIdT i = 0; i < LeafTimes.size(); ++i) {
+    FlatMap.insert(std::make_pair(LeafTimes[i], IDs[i]));
+  }
+  dumpHelper(Str, FlatMap, TotalTime);
+  Str << "Number of timer updates: " << StateChangeCount << "\n";
+}
+
+} // end of namespace Ice
diff --git a/src/IceTimerTree.h b/src/IceTimerTree.h
new file mode 100644
index 0000000..3162587
--- /dev/null
+++ b/src/IceTimerTree.h
@@ -0,0 +1,68 @@
+//===- subzero/src/IceTimerTree.h - Pass timer defs -------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TimerTree class, which allows flat and
+// cumulative execution time collection of call chains.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETIMERTREE_H
+#define SUBZERO_SRC_ICETIMERTREE_H
+
+namespace Ice {
+
+class TimerTreeNode;
+
+// Timer tree index type
+typedef std::vector<TimerTreeNode>::size_type TTindex;
+
+// TimerTreeNode represents an interior or leaf node in the call tree.
+// It contains a list of children, a pointer to its parent, and the
+// timer ID for the node.  It also holds the cumulative time spent at
+// this node and below.  The children are always at a higher index in
+// the TimerTreeNode::Nodes array, and the parent is always at a lower
+// index.
+class TimerTreeNode {
+public:
+  TimerTreeNode() : Parent(0), Interior(0), Time(0) {}
+  std::vector<TTindex> Children; // indexed by TimerIdT
+  TTindex Parent;
+  TimerIdT Interior;
+  double Time;
+};
+
+class TimerStack {
+  TimerStack(const TimerStack &) LLVM_DELETED_FUNCTION;
+  TimerStack &operator=(const TimerStack &) LLVM_DELETED_FUNCTION;
+
+public:
+  TimerStack(const IceString &TopLevelName);
+  static TimerIdT getTimerID(const IceString &Name);
+  void push(TimerIdT ID);
+  void pop(TimerIdT ID);
+  void dump(Ostream &Str);
+
+private:
+  void update();
+  static double timestamp() {
+    // TODO: Implement in terms of std::chrono for C++11.
+    return llvm::TimeRecord::getCurrentTime(false).getWallTime();
+  }
+  const double FirstTimestamp;
+  double LastTimestamp;
+  uint64_t StateChangeCount;
+  static std::vector<IceString> IDs; // indexed by TimerIdT
+  std::vector<TimerTreeNode> Nodes;  // indexed by TTindex
+  std::vector<double> LeafTimes;     // indexed by TimerIdT
+  TTindex StackTop;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETIMERTREE_H
diff --git a/src/IceTranslator.cpp b/src/IceTranslator.cpp
index e0c7e2d..91dcfd4 100644
--- a/src/IceTranslator.cpp
+++ b/src/IceTranslator.cpp
@@ -12,12 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "IceTranslator.h"
-
 #include "IceCfg.h"
 #include "IceClFlags.h"
 #include "IceDefs.h"
 #include "IceTargetLowering.h"
+#include "IceTranslator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -80,24 +79,13 @@
   if (Ctx->getFlags().DisableTranslation) {
     Func->dump();
   } else {
-    Timer TTranslate;
     Func->translate();
-    if (Ctx->getFlags().SubzeroTimingEnabled) {
-      std::cerr << "[Subzero timing] Translate function "
-                << Func->getFunctionName() << ": " << TTranslate.getElapsedSec()
-                << " sec\n";
-    }
     if (Func->hasError()) {
       std::cerr << "ICE translation error: " << Func->getError() << "\n";
       ErrorStatus = true;
     }
 
-    Timer TEmit;
     Func->emit();
-    if (Ctx->getFlags().SubzeroTimingEnabled) {
-      std::cerr << "[Subzero timing] Emit function " << Func->getFunctionName()
-                << ": " << TEmit.getElapsedSec() << " sec\n";
-    }
     Ctx->dumpStats(Func->getFunctionName());
   }
 }
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index 47f6a5e..e1925df 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "PNaClTranslator.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceClFlags.h"
@@ -20,6 +19,7 @@
 #include "IceInst.h"
 #include "IceOperand.h"
 #include "IceTypeConverter.h"
+#include "PNaClTranslator.h"
 #include "llvm/Analysis/NaCl/PNaClABIProps.h"
 #include "llvm/Bitcode/NaCl/NaClBitcodeDecoders.h"
 #include "llvm/Bitcode/NaCl/NaClBitcodeHeader.h"
@@ -852,7 +852,7 @@
     }
   }
 
-  ~FunctionParser() override;
+  ~FunctionParser() override {};
 
   // Set the next constant ID to the given constant C.
   void setNextConstantID(Ice::Constant *C) {
@@ -860,8 +860,6 @@
   }
 
 private:
-  // Timer for reading function bitcode and converting to ICE.
-  Ice::Timer TConvert;
   // The corresponding ICE function defined by the function block.
   Ice::Cfg *Func;
   // The index to the current basic block being built.
@@ -1376,13 +1374,6 @@
   }
 };
 
-FunctionParser::~FunctionParser() {
-  if (getFlags().SubzeroTimingEnabled) {
-    errs() << "[Subzero timing] Convert function " << Func->getFunctionName()
-           << ": " << TConvert.getElapsedSec() << " sec\n";
-  }
-}
-
 void FunctionParser::ReportInvalidBinopOpcode(unsigned Opcode, Ice::Type Ty) {
   std::string Buffer;
   raw_string_ostream StrBuf(Buffer);
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index 515e2fe..7f1fc38 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -44,7 +44,6 @@
         clEnumValN(Ice::IceV_RegOrigins, "orig", "Physical register origins"),
         clEnumValN(Ice::IceV_LinearScan, "regalloc", "Linear scan details"),
         clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
-        clEnumValN(Ice::IceV_Timing, "time", "Pass timing details"),
         clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
         clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
         clEnumValN(Ice::IceV_Most, "most",
@@ -175,6 +174,8 @@
 
   Ice::GlobalContext Ctx(Ls, Os, VMask, TargetArch, OptLevel, TestPrefix,
                          Flags);
+  static Ice::TimerIdT IDszmain = Ice::GlobalContext::getTimerID("szmain");
+  Ice::TimerMarker T(IDszmain, &Ctx);
 
   int ErrorStatus = 0;
   if (BuildOnRead) {
@@ -184,15 +185,11 @@
   } else {
     // Parse the input LLVM IR file into a module.
     SMDiagnostic Err;
-    Ice::Timer T;
+    static Ice::TimerIdT IDparse = Ice::GlobalContext::getTimerID("parse");
+    Ice::TimerMarker T1(IDparse, &Ctx);
     Module *Mod =
         NaClParseIRFile(IRFilename, InputFileFormat, Err, getGlobalContext());
 
-    if (SubzeroTimingEnabled) {
-      std::cerr << "[Subzero timing] IR Parsing: " << T.getElapsedSec()
-                << " sec\n";
-    }
-
     if (!Mod) {
       Err.print(argv[0], errs());
       return 1;
@@ -202,6 +199,8 @@
     Converter.convertToIce();
     ErrorStatus = Converter.getErrorStatus();
   }
+  if (SubzeroTimingEnabled)
+    Ctx.dumpTimers();
   const bool FinalStats = true;
   Ctx.dumpStats("_FINAL_", FinalStats);
   return ErrorStatus;