Subzero: Add a few performance measurement tools.

--timing-funcs - Produces a sorted list of total time spent translating each function.

--timing-focus=<F> - Enables the equivalent of --timing for just the function named <F>.  Use '*' to enable it for every function, i.e. to get a complete timing breakdown across all functions.

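--verbose-focus=<F> - Temporarily turns on --verbose=all while translating the function named <F>; the previous verbose mask is restored afterwards.

Also fixes the ", " separator logic in the preds/succs dumps (IceCfgNode.cpp) and the range dump (IceOperand.cpp), which emitted the separator only before the first element instead of between elements, and reformats several emitter initializers in IceInstX8632.cpp.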

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/620373004
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 1134fdc..cf5a81b 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -25,8 +25,8 @@
 
 Cfg::Cfg(GlobalContext *Ctx)
     : Ctx(Ctx), FunctionName(""), ReturnType(IceType_void),
-      IsInternalLinkage(false), HasError(false), ErrorMessage(""), Entry(NULL),
-      NextInstNumber(1), Live(nullptr),
+      IsInternalLinkage(false), HasError(false), FocusedTiming(false),
+      ErrorMessage(""), Entry(NULL), NextInstNumber(1), Live(nullptr),
       Target(TargetLowering::createLowering(Ctx->getTargetArch(), this)),
       VMetadata(new VariablesMetadata(this)),
       TargetAssembler(
@@ -69,8 +69,15 @@
 void Cfg::translate() {
   if (hasError())
     return;
-  static TimerIdT IDtranslate = GlobalContext::getTimerID("translate");
-  TimerMarker T(IDtranslate, getContext());
+  VerboseMask OldVerboseMask = getContext()->getVerbose();
+  const IceString &TimingFocusOn = getContext()->getFlags().TimingFocusOn;
+  if (TimingFocusOn == "*" || TimingFocusOn == getFunctionName())
+    setFocusedTiming();
+  bool VerboseFocus =
+      (getContext()->getFlags().VerboseFocusOn == getFunctionName());
+  if (VerboseFocus)
+    getContext()->setVerbose(IceV_All);
+  TimerMarker T(TimerStack::TT_translate, this);
 
   dump("Initial CFG");
 
@@ -79,6 +86,10 @@
   getTarget()->translate();
 
   dump("Final output");
+  if (getFocusedTiming())
+    getContext()->dumpTimers();
+  if (VerboseFocus)
+    getContext()->setVerbose(OldVerboseMask);
 }
 
 void Cfg::computePredecessors() {
@@ -87,9 +98,7 @@
 }
 
 void Cfg::renumberInstructions() {
-  static TimerIdT IDrenumberInstructions =
-      GlobalContext::getTimerID("renumberInstructions");
-  TimerMarker T(IDrenumberInstructions, getContext());
+  TimerMarker T(TimerStack::TT_renumberInstructions, this);
   NextInstNumber = 1;
   for (CfgNode *Node : Nodes)
     Node->renumberInstructions();
@@ -97,60 +106,50 @@
 
 // placePhiLoads() must be called before placePhiStores().
 void Cfg::placePhiLoads() {
-  static TimerIdT IDplacePhiLoads = GlobalContext::getTimerID("placePhiLoads");
-  TimerMarker T(IDplacePhiLoads, getContext());
+  TimerMarker T(TimerStack::TT_placePhiLoads, this);
   for (CfgNode *Node : Nodes)
     Node->placePhiLoads();
 }
 
 // placePhiStores() must be called after placePhiLoads().
 void Cfg::placePhiStores() {
-  static TimerIdT IDplacePhiStores =
-      GlobalContext::getTimerID("placePhiStores");
-  TimerMarker T(IDplacePhiStores, getContext());
+  TimerMarker T(TimerStack::TT_placePhiStores, this);
   for (CfgNode *Node : Nodes)
     Node->placePhiStores();
 }
 
 void Cfg::deletePhis() {
-  static TimerIdT IDdeletePhis = GlobalContext::getTimerID("deletePhis");
-  TimerMarker T(IDdeletePhis, getContext());
+  TimerMarker T(TimerStack::TT_deletePhis, this);
   for (CfgNode *Node : Nodes)
     Node->deletePhis();
 }
 
 void Cfg::doArgLowering() {
-  static TimerIdT IDdoArgLowering = GlobalContext::getTimerID("doArgLowering");
-  TimerMarker T(IDdoArgLowering, getContext());
+  TimerMarker T(TimerStack::TT_doArgLowering, this);
   getTarget()->lowerArguments();
 }
 
 void Cfg::doAddressOpt() {
-  static TimerIdT IDdoAddressOpt = GlobalContext::getTimerID("doAddressOpt");
-  TimerMarker T(IDdoAddressOpt, getContext());
+  TimerMarker T(TimerStack::TT_doAddressOpt, this);
   for (CfgNode *Node : Nodes)
     Node->doAddressOpt();
 }
 
 void Cfg::doNopInsertion() {
-  static TimerIdT IDdoNopInsertion =
-      GlobalContext::getTimerID("doNopInsertion");
-  TimerMarker T(IDdoNopInsertion, getContext());
+  TimerMarker T(TimerStack::TT_doNopInsertion, this);
   for (CfgNode *Node : Nodes)
     Node->doNopInsertion();
 }
 
 void Cfg::genCode() {
-  static TimerIdT IDgenCode = GlobalContext::getTimerID("genCode");
-  TimerMarker T(IDgenCode, getContext());
+  TimerMarker T(TimerStack::TT_genCode, this);
   for (CfgNode *Node : Nodes)
     Node->genCode();
 }
 
 // Compute the stack frame layout.
 void Cfg::genFrame() {
-  static TimerIdT IDgenFrame = GlobalContext::getTimerID("genFrame");
-  TimerMarker T(IDgenFrame, getContext());
+  TimerMarker T(TimerStack::TT_genFrame, this);
   getTarget()->addProlog(Entry);
   // TODO: Consider folding epilog generation into the final
   // emission/assembly pass to avoid an extra iteration over the node
@@ -165,17 +164,14 @@
 // completely with a single block.  It is a quick single pass and
 // doesn't need to iterate until convergence.
 void Cfg::livenessLightweight() {
-  static TimerIdT IDlivenessLightweight =
-      GlobalContext::getTimerID("livenessLightweight");
-  TimerMarker T(IDlivenessLightweight, getContext());
+  TimerMarker T(TimerStack::TT_livenessLightweight, this);
   getVMetadata()->init();
   for (CfgNode *Node : Nodes)
     Node->livenessLightweight();
 }
 
 void Cfg::liveness(LivenessMode Mode) {
-  static TimerIdT IDliveness = GlobalContext::getTimerID("liveness");
-  TimerMarker T(IDliveness, getContext());
+  TimerMarker T(TimerStack::TT_liveness, this);
   Live.reset(new Liveness(this, Mode));
   getVMetadata()->init();
   Live->init();
@@ -208,8 +204,7 @@
   // finer breakdown of the cost.
   // Make a final pass over instructions to delete dead instructions
   // and build each Variable's live range.
-  static TimerIdT IDliveRange = GlobalContext::getTimerID("liveRange");
-  TimerMarker T1(IDliveRange, getContext());
+  TimerMarker T1(TimerStack::TT_liveRange, this);
   for (CfgNode *Node : Nodes)
     Node->livenessPostprocess(Mode, getLiveness());
   if (Mode == Liveness_Intervals) {
@@ -255,9 +250,7 @@
 // Traverse every Variable of every Inst and verify that it
 // appears within the Variable's computed live range.
 bool Cfg::validateLiveness() const {
-  static TimerIdT IDvalidateLiveness =
-      GlobalContext::getTimerID("validateLiveness");
-  TimerMarker T(IDvalidateLiveness, getContext());
+  TimerMarker T(TimerStack::TT_validateLiveness, this);
   bool Valid = true;
   Ostream &Str = Ctx->getStrDump();
   for (CfgNode *Node : Nodes) {
@@ -300,8 +293,7 @@
 }
 
 void Cfg::doBranchOpt() {
-  static TimerIdT IDdoBranchOpt = GlobalContext::getTimerID("doBranchOpt");
-  TimerMarker T(IDdoBranchOpt, getContext());
+  TimerMarker T(TimerStack::TT_doBranchOpt, this);
   for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     auto NextNode = I;
     ++NextNode;
@@ -312,8 +304,7 @@
 // ======================== Dump routines ======================== //
 
 void Cfg::emit() {
-  static TimerIdT IDemit = GlobalContext::getTimerID("emit");
-  TimerMarker T(IDemit, getContext());
+  TimerMarker T(TimerStack::TT_emit, this);
   Ostream &Str = Ctx->getStrEmit();
   if (!Ctx->testAndSetHasEmittedFirstMethod()) {
     // Print a helpful command for assembling the output.
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 4766d47..e2aef46 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -96,6 +96,8 @@
     return getContext()->getFlags().UseIntegratedAssembler;
   }
   bool hasComputedFrame() const;
+  bool getFocusedTiming() const { return FocusedTiming; }
+  void setFocusedTiming() { FocusedTiming = true; }
 
   // Passes over the CFG.
   void translate();
@@ -165,6 +167,7 @@
   Type ReturnType;
   bool IsInternalLinkage;
   bool HasError;
+  bool FocusedTiming;
   IceString ErrorMessage;
   CfgNode *Entry; // entry basic block
   NodeList Nodes; // linearized node list; Entry should be first
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 1a386c8..f19552f 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -495,7 +495,7 @@
     Str << "    // preds = ";
     bool First = true;
     for (CfgNode *I : InEdges) {
-      if (First)
+      if (!First)
         Str << ", ";
       First = false;
       Str << "%" << I->getName();
@@ -540,7 +540,7 @@
     Str << "    // succs = ";
     bool First = true;
     for (CfgNode *I : OutEdges) {
-      if (First)
+      if (!First)
         Str << ", ";
       First = false;
       Str << "%" << I->getName();
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index d6c232f..f1dec0c 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -25,8 +25,9 @@
       : DisableInternal(false), SubzeroTimingEnabled(false),
         DisableTranslation(false), DisableGlobals(false),
         FunctionSections(false), UseIntegratedAssembler(false),
-        UseSandboxing(false), DumpStats(false), DefaultGlobalPrefix(""),
-        DefaultFunctionPrefix("") {}
+        UseSandboxing(false), DumpStats(false), TimeEachFunction(false),
+        DefaultGlobalPrefix(""), DefaultFunctionPrefix(""), TimingFocusOn(""),
+        VerboseFocusOn("") {}
   bool DisableInternal;
   bool SubzeroTimingEnabled;
   bool DisableTranslation;
@@ -35,8 +36,11 @@
   bool UseIntegratedAssembler;
   bool UseSandboxing;
   bool DumpStats;
+  bool TimeEachFunction;
   IceString DefaultGlobalPrefix;
   IceString DefaultFunctionPrefix;
+  IceString TimingFocusOn;
+  IceString VerboseFocusOn;
 };
 
 } // end of namespace Ice
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index df64cff..cdb0052 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -59,15 +59,13 @@
 
   // Caller is expected to delete the returned Ice::Cfg object.
   Ice::Cfg *convertFunction(const Function *F) {
-    static Ice::TimerIdT IDllvmConvert =
-        Ice::GlobalContext::getTimerID("llvmConvert");
-    Ice::TimerMarker T(IDllvmConvert, Ctx);
     VarMap.clear();
     NodeMap.clear();
     Func = new Ice::Cfg(Ctx);
     Func->setFunctionName(F->getName());
     Func->setReturnType(convertToIceType(F->getReturnType()));
     Func->setInternal(F->hasInternalLinkage());
+    Ice::TimerMarker T(Ice::TimerStack::TT_llvmConvert, Func);
 
     // The initial definition/use of each arg is the entry node.
     for (auto ArgI = F->arg_begin(), ArgE = F->arg_end(); ArgI != ArgE;
@@ -617,8 +615,7 @@
 namespace Ice {
 
 void Converter::convertToIce() {
-  static TimerIdT IDconvertToIce = GlobalContext::getTimerID("convertToIce");
-  TimerMarker T(IDconvertToIce, Ctx);
+  TimerMarker T(TimerStack::TT_convertToIce, Ctx);
   nameUnnamedGlobalAddresses(Mod);
   if (!Ctx->getFlags().DisableGlobals)
     convertGlobals(Mod);
@@ -626,13 +623,21 @@
 }
 
 void Converter::convertFunctions() {
+  TimerStackIdT StackID = GlobalContext::TSK_Funcs;
   for (const Function &I : *Mod) {
     if (I.empty())
       continue;
+    TimerIdT TimerID = 0;
+    if (Ctx->getFlags().TimeEachFunction) {
+      TimerID = Ctx->getTimerID(StackID, I.getName());
+      Ctx->pushTimer(TimerID, StackID);
+    }
     LLVM2ICEConverter FunctionConverter(Ctx, Mod->getContext());
 
     Cfg *Fcn = FunctionConverter.convertFunction(&I);
     translateFcn(Fcn);
+    if (Ctx->getFlags().TimeEachFunction)
+      Ctx->popTimer(TimerID, StackID);
   }
 
   emitConstants();
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 98bd8af..29ed5d8 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -69,6 +69,7 @@
 // numbers are used for representing Variable live ranges.
 typedef int32_t InstNumberT;
 
+typedef uint32_t TimerStackIdT;
 typedef uint32_t TimerIdT;
 
 enum LivenessMode {
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 7ea7e5d..80728b0 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -119,7 +119,11 @@
     : StrDump(OsDump), StrEmit(OsEmit), VMask(Mask),
       ConstPool(new ConstantPool()), Arch(Arch), Opt(Opt),
       TestPrefix(TestPrefix), Flags(Flags), HasEmittedFirstMethod(false),
-      RNG(""), Timers(new TimerStack("main")) {}
+      RNG("") {
+  // Pre-register built-in stack names.
+  newTimerStackID("Total across all functions");
+  newTimerStackID("Per-function summary");
+}
 
 // Scan a string for S[0-9A-Z]*_ patterns and replace them with
 // S<num>_ where <num> is the next base-36 value.  If a type name
@@ -381,13 +385,27 @@
   llvm_unreachable("Unknown type");
 }
 
-TimerIdT GlobalContext::getTimerID(const IceString &Name) {
-  return TimerStack::getTimerID(Name);
+TimerIdT GlobalContext::getTimerID(TimerStackIdT StackID,
+                                   const IceString &Name) {
+  assert(StackID < Timers.size());
+  return Timers[StackID].getTimerID(Name);
 }
 
-void GlobalContext::pushTimer(TimerIdT ID) { Timers->push(ID); }
+TimerStackIdT GlobalContext::newTimerStackID(const IceString &Name) {
+  TimerStackIdT NewID = Timers.size();
+  Timers.push_back(TimerStack(Name));
+  return NewID;
+}
 
-void GlobalContext::popTimer(TimerIdT ID) { Timers->pop(ID); }
+void GlobalContext::pushTimer(TimerIdT ID, TimerStackIdT StackID) {
+  assert(StackID < Timers.size());
+  Timers[StackID].push(ID);
+}
+
+void GlobalContext::popTimer(TimerIdT ID, TimerStackIdT StackID) {
+  assert(StackID < Timers.size());
+  Timers[StackID].pop(ID);
+}
 
 void GlobalContext::dumpStats(const IceString &Name, bool Final) {
   if (Flags.DumpStats) {
@@ -400,6 +418,16 @@
   }
 }
 
-void GlobalContext::dumpTimers() { Timers->dump(getStrDump()); }
+void GlobalContext::dumpTimers(TimerStackIdT StackID, bool DumpCumulative) {
+  assert(Timers.size() > StackID);
+  Timers[StackID].dump(getStrDump(), DumpCumulative);
+}
+
+TimerMarker::TimerMarker(TimerIdT ID, const Cfg *Func)
+    : ID(ID), Ctx(Func->getContext()),
+      Active(Func->getFocusedTiming() || Ctx->getFlags().SubzeroTimingEnabled) {
+  if (Active)
+    Ctx->pushTimer(ID);
+}
 
 } // end of namespace Ice
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index 0ffff1c..9968abb 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -23,6 +23,7 @@
 #include "IceDefs.h"
 #include "IceIntrinsics.h"
 #include "IceRNG.h"
+#include "IceTimerTree.h"
 #include "IceTypes.h"
 
 namespace Ice {
@@ -71,6 +72,7 @@
   // Returns true if any of the specified options in the verbose mask
   // are set.  If the argument is omitted, it checks if any verbose
   // options at all are set.
+  VerboseMask getVerbose() const { return VMask; }
   bool isVerbose(VerboseMask Mask = IceV_All) const { return VMask & Mask; }
   void setVerbose(VerboseMask Mask) { VMask = Mask; }
   void addVerbose(VerboseMask Mask) { VMask |= Mask; }
@@ -151,10 +153,19 @@
     StatsCumulative.updateFills();
   }
 
-  static TimerIdT getTimerID(const IceString &Name);
-  void pushTimer(TimerIdT ID);
-  void popTimer(TimerIdT ID);
-  void dumpTimers();
+  // These are predefined TimerStackIdT values.
+  enum TimerStackKind {
+    TSK_Default = 0,
+    TSK_Funcs,
+    TSK_Num
+  };
+
+  TimerIdT getTimerID(TimerStackIdT StackID, const IceString &Name);
+  TimerStackIdT newTimerStackID(const IceString &Name);
+  void pushTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
+  void popTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
+  void dumpTimers(TimerStackIdT StackID = TSK_Default,
+                  bool DumpCumulative = true);
 
 private:
   Ostream *StrDump; // Stream for dumping / diagnostics
@@ -172,7 +183,7 @@
   RandomNumberGenerator RNG;
   CodeStats StatsFunction;
   CodeStats StatsCumulative;
-  std::unique_ptr<class TimerStack> Timers;
+  std::vector<TimerStack> Timers;
   GlobalContext(const GlobalContext &) = delete;
   GlobalContext &operator=(const GlobalContext &) = delete;
 
@@ -194,6 +205,8 @@
     if (Active)
       Ctx->pushTimer(ID);
   }
+  TimerMarker(TimerIdT ID, const Cfg *Func);
+
   ~TimerMarker() {
     if (Active)
       Ctx->popTimer(ID);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 18b4b44..7150fa9 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -782,7 +782,8 @@
 // Unary XMM ops
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Sqrtss::Emitter = {
-    &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss};
+  &x86::AssemblerX86::sqrtss, &x86::AssemblerX86::sqrtss
+};
 
 // Binary GPR ops
 template <>
@@ -824,58 +825,76 @@
 // Binary XMM ops
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Addss::Emitter = {
-    &x86::AssemblerX86::addss, &x86::AssemblerX86::addss};
+  &x86::AssemblerX86::addss, &x86::AssemblerX86::addss
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Addps::Emitter = {
-    &x86::AssemblerX86::addps, &x86::AssemblerX86::addps};
+  &x86::AssemblerX86::addps, &x86::AssemblerX86::addps
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Divss::Emitter = {
-    &x86::AssemblerX86::divss, &x86::AssemblerX86::divss};
+  &x86::AssemblerX86::divss, &x86::AssemblerX86::divss
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Divps::Emitter = {
-    &x86::AssemblerX86::divps, &x86::AssemblerX86::divps};
+  &x86::AssemblerX86::divps, &x86::AssemblerX86::divps
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Mulss::Emitter = {
-    &x86::AssemblerX86::mulss, &x86::AssemblerX86::mulss};
+  &x86::AssemblerX86::mulss, &x86::AssemblerX86::mulss
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Mulps::Emitter = {
-    &x86::AssemblerX86::mulps, &x86::AssemblerX86::mulps};
+  &x86::AssemblerX86::mulps, &x86::AssemblerX86::mulps
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Padd::Emitter = {
-    &x86::AssemblerX86::padd, &x86::AssemblerX86::padd};
+  &x86::AssemblerX86::padd, &x86::AssemblerX86::padd
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pand::Emitter = {
-    &x86::AssemblerX86::pand, &x86::AssemblerX86::pand};
+  &x86::AssemblerX86::pand, &x86::AssemblerX86::pand
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pandn::Emitter = {
-    &x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn};
+  &x86::AssemblerX86::pandn, &x86::AssemblerX86::pandn
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pcmpeq::Emitter = {
-    &x86::AssemblerX86::pcmpeq, &x86::AssemblerX86::pcmpeq};
+  &x86::AssemblerX86::pcmpeq, &x86::AssemblerX86::pcmpeq
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pcmpgt::Emitter = {
-    &x86::AssemblerX86::pcmpgt, &x86::AssemblerX86::pcmpgt};
+  &x86::AssemblerX86::pcmpgt, &x86::AssemblerX86::pcmpgt
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pmull::Emitter = {
-    &x86::AssemblerX86::pmull, &x86::AssemblerX86::pmull};
+  &x86::AssemblerX86::pmull, &x86::AssemblerX86::pmull
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pmuludq::Emitter = {
-    &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq};
+  &x86::AssemblerX86::pmuludq, &x86::AssemblerX86::pmuludq
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Por::Emitter = {
-    &x86::AssemblerX86::por, &x86::AssemblerX86::por};
+  &x86::AssemblerX86::por, &x86::AssemblerX86::por
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Psub::Emitter = {
-    &x86::AssemblerX86::psub, &x86::AssemblerX86::psub};
+  &x86::AssemblerX86::psub, &x86::AssemblerX86::psub
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Pxor::Emitter = {
-    &x86::AssemblerX86::pxor, &x86::AssemblerX86::pxor};
+  &x86::AssemblerX86::pxor, &x86::AssemblerX86::pxor
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Subss::Emitter = {
-    &x86::AssemblerX86::subss, &x86::AssemblerX86::subss};
+  &x86::AssemblerX86::subss, &x86::AssemblerX86::subss
+};
 template <>
 const x86::AssemblerX86::XmmEmitterRegOp InstX8632Subps::Emitter = {
-    &x86::AssemblerX86::subps, &x86::AssemblerX86::subps};
+  &x86::AssemblerX86::subps, &x86::AssemblerX86::subps
+};
 
 // Binary XMM Shift ops
 template <>
@@ -1427,10 +1446,11 @@
   const Operand *Src1 = getSrc(1);
   Type Ty = Src0->getType();
   static const x86::AssemblerX86::GPREmitterRegOp RegEmitter = {
-      &x86::AssemblerX86::cmp, &x86::AssemblerX86::cmp,
-      &x86::AssemblerX86::cmp};
+    &x86::AssemblerX86::cmp, &x86::AssemblerX86::cmp, &x86::AssemblerX86::cmp
+  };
   static const x86::AssemblerX86::GPREmitterAddrOp AddrEmitter = {
-      &x86::AssemblerX86::cmp, &x86::AssemblerX86::cmp};
+    &x86::AssemblerX86::cmp, &x86::AssemblerX86::cmp
+  };
   if (const Variable *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
     if (SrcVar0->hasReg()) {
       emitIASRegOpTyGPR(Func, Ty, SrcVar0, Src1, RegEmitter);
@@ -1471,7 +1491,8 @@
   const Variable *Src0 = llvm::cast<Variable>(getSrc(0));
   Type Ty = Src0->getType();
   const static x86::AssemblerX86::XmmEmitterRegOp Emitter = {
-      &x86::AssemblerX86::ucomiss, &x86::AssemblerX86::ucomiss};
+    &x86::AssemblerX86::ucomiss, &x86::AssemblerX86::ucomiss
+  };
   emitIASRegOpTyXMM(Func, Ty, Src0, getSrc(1), Emitter);
 }
 
@@ -1517,9 +1538,11 @@
   Type Ty = Src0->getType();
   // The Reg/Addr form of test is not encodeable.
   static const x86::AssemblerX86::GPREmitterRegOp RegEmitter = {
-      &x86::AssemblerX86::test, NULL, &x86::AssemblerX86::test};
+    &x86::AssemblerX86::test, NULL, &x86::AssemblerX86::test
+  };
   static const x86::AssemblerX86::GPREmitterAddrOp AddrEmitter = {
-      &x86::AssemblerX86::test, &x86::AssemblerX86::test};
+    &x86::AssemblerX86::test, &x86::AssemblerX86::test
+  };
   if (const Variable *SrcVar0 = llvm::dyn_cast<Variable>(Src0)) {
     if (SrcVar0->hasReg()) {
       emitIASRegOpTyGPR(Func, Ty, SrcVar0, Src1, RegEmitter);
@@ -1770,8 +1793,9 @@
   const Variable *Dest = getDest();
   const Operand *Src = getSrc(0);
   const static x86::AssemblerX86::XmmEmitterMovOps Emitter = {
-      &x86::AssemblerX86::movups, &x86::AssemblerX86::movups,
-      &x86::AssemblerX86::movups};
+    &x86::AssemblerX86::movups, &x86::AssemblerX86::movups,
+    &x86::AssemblerX86::movups
+  };
   emitIASMovlikeXMM(Func, Dest, Src, Emitter);
 }
 
@@ -1794,8 +1818,8 @@
   const Variable *Dest = getDest();
   const Operand *Src = getSrc(0);
   const static x86::AssemblerX86::XmmEmitterMovOps Emitter = {
-      &x86::AssemblerX86::movq, &x86::AssemblerX86::movq,
-      &x86::AssemblerX86::movq};
+    &x86::AssemblerX86::movq, &x86::AssemblerX86::movq, &x86::AssemblerX86::movq
+  };
   emitIASMovlikeXMM(Func, Dest, Src, Emitter);
 }
 
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index c366dd9..c631e80 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -278,8 +278,7 @@
 }
 
 void VariablesMetadata::init() {
-  static TimerIdT IDvmetadata = GlobalContext::getTimerID("vmetadata");
-  TimerMarker T(IDvmetadata, Func->getContext());
+  TimerMarker T(TimerStack::TT_vmetadata, Func);
   Metadata.clear();
   Metadata.resize(Func->getNumVariables());
 
@@ -438,7 +437,7 @@
   Str << "(weight=" << Weight << ") ";
   bool First = true;
   for (const RangeElementType &I : Range) {
-    if (First)
+    if (!First)
       Str << ", ";
     First = false;
     Str << "[" << I.first << ":" << I.second << ")";
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 69353a1..2d00db0 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -64,8 +64,7 @@
 // preparation.  Results are assigned to Variable::RegNum for each
 // Variable.
 void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
-  static TimerIdT IDscan = GlobalContext::getTimerID("linearScan");
-  TimerMarker T(IDscan, Func->getContext());
+  TimerMarker T(TimerStack::TT_linearScan, Func);
   assert(RegMaskFull.any()); // Sanity check
   Unhandled.clear();
   UnhandledPrecolored.clear();
@@ -86,9 +85,7 @@
   // storing Func->getVariables().
   const VarList &Vars = Func->getVariables();
   {
-    static TimerIdT IDinitUnhandled =
-        GlobalContext::getTimerID("initUnhandled");
-    TimerMarker T(IDinitUnhandled, Func->getContext());
+    TimerMarker T(TimerStack::TT_initUnhandled, Func);
     for (Variable *Var : Vars) {
       // Explicitly don't consider zero-weight variables, which are
       // meant to be spill slots.
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 6dc46b1..bcc6290 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -229,8 +229,7 @@
 // registers could potentially be parameterized if we want to restrict
 // registers e.g. for performance testing.
 void TargetLowering::regAlloc() {
-  static TimerIdT IDregAlloc = GlobalContext::getTimerID("regAlloc");
-  TimerMarker T(IDregAlloc, Ctx);
+  TimerMarker T(TimerStack::TT_regAlloc, Func);
   LinearScan LinearScan(Func);
   RegSetMask RegInclude = RegSet_None;
   RegSetMask RegExclude = RegSet_None;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 3217141..9cac11d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -313,9 +313,7 @@
 }
 
 void TargetX8632::translateO2() {
-  GlobalContext *Context = Func->getContext();
-  static TimerIdT IDO2 = GlobalContext::getTimerID("O2");
-  TimerMarker T(IDO2, Context);
+  TimerMarker T(TimerStack::TT_O2, Func);
 
   // Lower Phi instructions.
   Func->placePhiLoads();
@@ -400,9 +398,7 @@
 }
 
 void TargetX8632::translateOm1() {
-  GlobalContext *Context = Func->getContext();
-  static TimerIdT IDOm1 = GlobalContext::getTimerID("Om1");
-  TimerMarker T(IDOm1, Context);
+  TimerMarker T(TimerStack::TT_Om1, Func);
   Func->placePhiLoads();
   if (Func->hasError())
     return;
@@ -4305,8 +4301,7 @@
 void TargetX8632::postLower() {
   if (Ctx->getOptLevel() != Opt_m1)
     return;
-  static TimerIdT IDpostLower = GlobalContext::getTimerID("postLower");
-  TimerMarker T(IDpostLower, Ctx);
+  TimerMarker T(TimerStack::TT_postLower, Func);
   // TODO: Avoid recomputing WhiteList every instruction.
   RegSetMask RegInclude = RegSet_All;
   RegSetMask RegExclude = RegSet_StackPointer;
diff --git a/src/IceTimerTree.cpp b/src/IceTimerTree.cpp
index 847941f..0cd73dc 100644
--- a/src/IceTimerTree.cpp
+++ b/src/IceTimerTree.cpp
@@ -19,26 +19,28 @@
 
 namespace Ice {
 
-std::vector<IceString> TimerStack::IDs;
-
-TimerStack::TimerStack(const IceString &TopLevelName)
-    : FirstTimestamp(timestamp()), LastTimestamp(FirstTimestamp),
+TimerStack::TimerStack(const IceString &Name)
+    : Name(Name), FirstTimestamp(timestamp()), LastTimestamp(FirstTimestamp),
       StateChangeCount(0), StackTop(0) {
   Nodes.resize(1); // Reserve Nodes[0] for the root node.
-  push(getTimerID(TopLevelName));
+  IDs.resize(TT__num);
+#define STR(s) #s
+#define X(tag)                                                                 \
+  IDs[TT_##tag] = STR(tag);                                                    \
+  IDsIndex[STR(tag)] = TT_##tag;
+  TIMERTREE_TABLE;
+#undef X
+#undef STR
 }
 
 // Returns the unique timer ID for the given Name, creating a new ID
-// if needed.  For performance reasons, it's best to make only one
-// call per Name and cache the result, e.g. via a static initializer.
+// if needed.
 TimerIdT TimerStack::getTimerID(const IceString &Name) {
-  TimerIdT Size = IDs.size();
-  for (TimerIdT i = 0; i < Size; ++i) {
-    if (IDs[i] == Name)
-      return i;
+  if (IDsIndex.find(Name) == IDsIndex.end()) {
+    IDsIndex[Name] = IDs.size();
+    IDs.push_back(Name);
   }
-  IDs.push_back(Name);
-  return Size;
+  return IDsIndex[Name];
 }
 
 // Pushes a new marker onto the timer stack.
@@ -112,27 +114,29 @@
 
 } // end of anonymous namespace
 
-void TimerStack::dump(Ostream &Str) {
+void TimerStack::dump(Ostream &Str, bool DumpCumulative) {
   update();
   double TotalTime = LastTimestamp - FirstTimestamp;
   assert(TotalTime);
-  Str << "Cumulative function times:\n";
-  DumpMapType CumulativeMap;
-  for (TTindex i = 1; i < Nodes.size(); ++i) {
-    TTindex Prefix = i;
-    IceString Suffix = "";
-    while (Prefix) {
-      if (Suffix.empty())
-        Suffix = IDs[Nodes[Prefix].Interior];
-      else
-        Suffix = IDs[Nodes[Prefix].Interior] + "." + Suffix;
-      assert(Nodes[Prefix].Parent < Prefix);
-      Prefix = Nodes[Prefix].Parent;
+  if (DumpCumulative) {
+    Str << Name << " - Cumulative times:\n";
+    DumpMapType CumulativeMap;
+    for (TTindex i = 1; i < Nodes.size(); ++i) {
+      TTindex Prefix = i;
+      IceString Suffix = "";
+      while (Prefix) {
+        if (Suffix.empty())
+          Suffix = IDs[Nodes[Prefix].Interior];
+        else
+          Suffix = IDs[Nodes[Prefix].Interior] + "." + Suffix;
+        assert(Nodes[Prefix].Parent < Prefix);
+        Prefix = Nodes[Prefix].Parent;
+      }
+      CumulativeMap.insert(std::make_pair(Nodes[i].Time, Suffix));
     }
-    CumulativeMap.insert(std::make_pair(Nodes[i].Time, Suffix));
+    dumpHelper(Str, CumulativeMap, TotalTime);
   }
-  dumpHelper(Str, CumulativeMap, TotalTime);
-  Str << "Flat function times:\n";
+  Str << Name << " - Flat times:\n";
   DumpMapType FlatMap;
   for (TimerIdT i = 0; i < LeafTimes.size(); ++i) {
     FlatMap.insert(std::make_pair(LeafTimes[i], IDs[i]));
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
new file mode 100644
index 0000000..5319b1d
--- /dev/null
+++ b/src/IceTimerTree.def
@@ -0,0 +1,49 @@
+//===- subzero/src/IceTimerTree.def - X-macros for timing -------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file lists predefined timing tags.  New tags can be added to
+// avoid a runtime string lookup.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETIMERTREE_DEF
+
+#define TIMERTREE_TABLE    \
+  /* enum value */         \
+  X(O2)                    \
+  X(Om1)                   \
+  X(convertToIce)          \
+  X(deletePhis)            \
+  X(doAddressOpt)          \
+  X(doArgLowering)         \
+  X(doBranchOpt)           \
+  X(doNopInsertion)        \
+  X(emit)                  \
+  X(genCode)               \
+  X(genFrame)              \
+  X(initUnhandled)         \
+  X(linearScan)            \
+  X(liveRange)             \
+  X(liveness)              \
+  X(livenessLightweight)   \
+  X(llvmConvert)           \
+  X(parse)                 \
+  X(placePhiLoads)         \
+  X(placePhiStores)        \
+  X(postLower)             \
+  X(regAlloc)              \
+  X(renumberInstructions)  \
+  X(szmain)                \
+  X(translate)             \
+  X(validateLiveness)      \
+  X(vmetadata)
+// Users #define X(tag) before expanding TIMERTREE_TABLE, then #undef X.
+
+#define SUBZERO_SRC_ICETIMERTREE_DEF
+#endif // SUBZERO_SRC_ICETIMERTREE_DEF
diff --git a/src/IceTimerTree.h b/src/IceTimerTree.h
index 029b7f4..289ea2d 100644
--- a/src/IceTimerTree.h
+++ b/src/IceTimerTree.h
@@ -15,6 +15,8 @@
 #ifndef SUBZERO_SRC_ICETIMERTREE_H
 #define SUBZERO_SRC_ICETIMERTREE_H
 
+#include "IceTimerTree.def"
+
 namespace Ice {
 
 class TimerTreeNode;
@@ -38,23 +40,32 @@
 };
 
 class TimerStack {
-  TimerStack(const TimerStack &) = delete;
+  // Copy ctor is not deleted: GlobalContext stores TimerStacks in a std::vector.
   TimerStack &operator=(const TimerStack &) = delete;
 
 public:
-  TimerStack(const IceString &TopLevelName);
-  static TimerIdT getTimerID(const IceString &Name);
+  enum TimerTag {
+#define X(tag) TT_##tag,
+    TIMERTREE_TABLE
+#undef X
+        TT__num
+  };
+  TimerStack(const IceString &Name);
+  TimerIdT getTimerID(const IceString &Name);
   void push(TimerIdT ID);
   void pop(TimerIdT ID);
-  void dump(Ostream &Str);
+  void dump(Ostream &Str, bool DumpCumulative);
 
 private:
   void update();
   static double timestamp();
+  const IceString Name;
   const double FirstTimestamp;
   double LastTimestamp;
   uint64_t StateChangeCount;
-  static std::vector<IceString> IDs; // indexed by TimerIdT
+  // IDsIndex maps a symbolic timer name to its integer ID.
+  std::map<IceString, TimerIdT> IDsIndex;
+  std::vector<IceString> IDs;        // indexed by TimerIdT
   std::vector<TimerTreeNode> Nodes;  // indexed by TTindex
   std::vector<double> LeafTimes;     // indexed by TimerIdT
   TTindex StackTop;
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index af78e2f..f05835a 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -843,6 +843,11 @@
         NextLocalInstIndex(Context->getNumGlobalValueIDs()),
         InstIsTerminating(false) {
     Func->setFunctionName(LLVMFunc->getName());
+    if (getFlags().TimeEachFunction)
+      getTranslator().getContext()->pushTimer(
+          getTranslator().getContext()->getTimerID(
+              Ice::GlobalContext::TSK_Funcs, Func->getFunctionName()),
+          Ice::GlobalContext::TSK_Funcs);
     Func->setReturnType(Context->convertToIceType(LLVMFunc->getReturnType()));
     Func->setInternal(LLVMFunc->hasInternalLinkage());
     CurrentNode = InstallNextBasicBlock();
@@ -1404,6 +1409,11 @@
   // for such parsing errors.
   if (Context->getNumErrors() == 0)
     getTranslator().translateFcn(Func);
+  if (getFlags().TimeEachFunction)
+    getTranslator().getContext()->popTimer(
+        getTranslator().getContext()->getTimerID(Ice::GlobalContext::TSK_Funcs,
+                                                 Func->getFunctionName()),
+        Ice::GlobalContext::TSK_Funcs);
 }
 
 void FunctionParser::ReportInvalidBinaryOp(Ice::InstArithmetic::OpKind Op,
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index f95c02e..f392b7e 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -95,8 +95,22 @@
     "timing", cl::desc("Enable breakdown timing of Subzero translation"));
 
 static cl::opt<bool>
-    DisableGlobals("disable-globals",
-                   cl::desc("Disable global initializer translation"));
+TimeEachFunction("timing-funcs",
+                 cl::desc("Print total translation time for each function"));
+
+static cl::opt<std::string> TimingFocusOn(
+    "timing-focus",
+    cl::desc("Break down timing for a specific function (use '*' for all)"),
+    cl::init(""));
+
+static cl::opt<std::string> VerboseFocusOn(
+    "verbose-focus",
+    cl::desc("Temporarily enable full verbosity for a specific function"),
+    cl::init(""));
+
+static cl::opt<bool>
+DisableGlobals("disable-globals",
+               cl::desc("Disable global initializer translation"));
 
 // This is currently unused, and is a placeholder for lit tests.
 static cl::opt<bool>
@@ -169,13 +183,15 @@
   Flags.UseIntegratedAssembler = UseIntegratedAssembler;
   Flags.UseSandboxing = UseSandboxing;
   Flags.DumpStats = DumpStats;
+  Flags.TimeEachFunction = TimeEachFunction;
   Flags.DefaultGlobalPrefix = DefaultGlobalPrefix;
   Flags.DefaultFunctionPrefix = DefaultFunctionPrefix;
+  Flags.TimingFocusOn = TimingFocusOn;
+  Flags.VerboseFocusOn = VerboseFocusOn;
 
   Ice::GlobalContext Ctx(Ls, Os, VMask, TargetArch, OptLevel, TestPrefix,
                          Flags);
-  static Ice::TimerIdT IDszmain = Ice::GlobalContext::getTimerID("szmain");
-  Ice::TimerMarker T(IDszmain, &Ctx);
+  Ice::TimerMarker T(Ice::TimerStack::TT_szmain, &Ctx);
 
   int ErrorStatus = 0;
   if (BuildOnRead) {
@@ -185,8 +201,7 @@
   } else {
     // Parse the input LLVM IR file into a module.
     SMDiagnostic Err;
-    static Ice::TimerIdT IDparse = Ice::GlobalContext::getTimerID("parse");
-    Ice::TimerMarker T1(IDparse, &Ctx);
+    Ice::TimerMarker T1(Ice::TimerStack::TT_parse, &Ctx);
     Module *Mod =
         NaClParseIRFile(IRFilename, InputFileFormat, Err, getGlobalContext());
 
@@ -199,6 +214,10 @@
     Converter.convertToIce();
     ErrorStatus = Converter.getErrorStatus();
   }
+  if (TimeEachFunction) {
+    const bool DumpCumulative = false;
+    Ctx.dumpTimers(Ice::GlobalContext::TSK_Funcs, DumpCumulative);
+  }
   if (SubzeroTimingEnabled)
     Ctx.dumpTimers();
   const bool FinalStats = true;