Subzero: Fix timers for multithreaded translation.

Now that multithreaded parsing and translation are in place, timer operations have to be made thread-local.  After the non-main threads end, their thread-local timer data needs to be merged into the global timer data, which resides in the GlobalContext object.  The merge is a bit tricky because the internal timer stack structure is built up dynamically as items are pushed and popped, so two threads may have radically different timing data:

1. The parser thread's profile is completely different from that of a translator thread.

2. For -timing-funcs, two translator threads hold data for entirely different sets of functions.

A bit more tweaking will need to be done to make the timing output fully usable in a multithreaded run.  With multiple threads running concurrently, the reported times may add up to >100%.  Also, time spent blocked is "unfairly" attributed to the caller of the blocking operation; we should either count user time instead of wall-clock time, or add a special timer marker for blocking lock operations.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/878383004
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index ac321e6..04f08a2 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -83,6 +83,35 @@
     uint32_t Fills;
   };
 
+  // TimerList is a vector of TimerStack objects, with extra methods
+  // to initialize and merge these vectors.
+  class TimerList : public std::vector<TimerStack> {
+  public:
+    // initInto() initializes a target list of timers based on the
+    // current list.  In particular, it creates the same number of
+    // timers, in the same order, with the same names, but initially
+    // empty of timing data.
+    void initInto(TimerList &Dest) const {
+      if (!ALLOW_DUMP)
+        return;
+      Dest.clear();
+      for (const TimerStack &Stack : *this) {
+        Dest.push_back(TimerStack(Stack.getName()));
+      }
+    }
+    void mergeFrom(TimerList &Src) {
+      if (!ALLOW_DUMP)
+        return;
+      assert(size() == Src.size());
+      size_type i = 0;
+      for (TimerStack &Stack : *this) {
+        assert(Stack.getName() == Src[i].getName());
+        Stack.mergeFrom(Src[i]);
+        ++i;
+      }
+    }
+  };
+
   // ThreadContext contains thread-local data.  This data can be
   // combined/reduced as needed after all threads complete.
   class ThreadContext {
@@ -92,7 +121,7 @@
   public:
     ThreadContext() {}
     CodeStats StatsFunction;
-    std::vector<TimerStack> Timers;
+    TimerList Timers;
   };
 
 public:
@@ -211,14 +240,20 @@
   // These are predefined TimerStackIdT values.
   enum TimerStackKind { TSK_Default = 0, TSK_Funcs, TSK_Num };
 
+  // newTimerStackID() creates a new TimerStack in the global space.
+  // It does not affect any TimerStack objects in TLS.
   TimerStackIdT newTimerStackID(const IceString &Name);
-  TimerIdT getTimerID(TimerStackIdT StackID, const IceString &Name);
-  void pushTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
-  void popTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
-  void resetTimer(TimerStackIdT StackID);
-  void setTimerName(TimerStackIdT StackID, const IceString &NewName);
+  // dumpTimers() dumps the global timer data.  As such, one probably
+  // wants to call mergeTimerStacks() as a prerequisite.
   void dumpTimers(TimerStackIdT StackID = TSK_Default,
                   bool DumpCumulative = true);
+  // The following methods affect only the calling thread's TLS timer
+  // data.
+  TimerIdT getTimerID(TimerStackIdT StackID, const IceString &Name);
+  void pushTimer(TimerIdT ID, TimerStackIdT StackID);
+  void popTimer(TimerIdT ID, TimerStackIdT StackID);
+  void resetTimer(TimerStackIdT StackID);
+  void setTimerName(TimerStackIdT StackID, const IceString &NewName);
 
   // Adds a newly parsed and constructed function to the Cfg work
   // queue.  Notifies any idle workers that a new function is
@@ -235,8 +270,10 @@
 
   void startWorkerThreads() {
     size_t NumWorkers = getFlags().NumTranslationThreads;
+    auto Timers = getTimers();
     for (size_t i = 0; i < NumWorkers; ++i) {
       ThreadContext *WorkerTLS = new ThreadContext();
+      Timers->initInto(WorkerTLS->Timers);
       AllThreadContexts.push_back(WorkerTLS);
       TranslationThreads.push_back(std::thread(
           &GlobalContext::translateFunctionsWrapper, this, WorkerTLS));
@@ -254,6 +291,11 @@
     }
     TranslationThreads.clear();
     // TODO(stichnot): join the emitter thread.
+    if (ALLOW_DUMP) {
+      auto Timers = getTimers();
+      for (ThreadContext *TLS : AllThreadContexts)
+        Timers->mergeFrom(TLS->Timers);
+    }
   }
 
   // Translation thread startup routine.
@@ -301,7 +343,7 @@
   ICE_CACHELINE_BOUNDARY;
   // Managed by getTimers()
   GlobalLockType TimerLock;
-  std::vector<TimerStack> Timers;
+  TimerList Timers;
 
   ICE_CACHELINE_BOUNDARY;
   // StrLock is a global lock on the dump and emit output streams.
@@ -331,8 +373,8 @@
   LockedPtr<CodeStats> getStatsCumulative() {
     return LockedPtr<CodeStats>(&StatsCumulative, &StatsLock);
   }
-  LockedPtr<std::vector<TimerStack>> getTimers() {
-    return LockedPtr<std::vector<TimerStack>>(&Timers, &TimerLock);
+  LockedPtr<TimerList> getTimers() {
+    return LockedPtr<TimerList>(&Timers, &TimerLock);
   }
 
   std::vector<ThreadContext *> AllThreadContexts;
@@ -357,24 +399,31 @@
   TimerMarker &operator=(const TimerMarker &) = delete;
 
 public:
-  TimerMarker(TimerIdT ID, GlobalContext *Ctx)
-      : ID(ID), Ctx(Ctx), Active(false) {
-    if (ALLOW_DUMP) {
-      Active = Ctx->getFlags().SubzeroTimingEnabled;
-      if (Active)
-        Ctx->pushTimer(ID);
-    }
+  TimerMarker(TimerIdT ID, GlobalContext *Ctx,
+              TimerStackIdT StackID = GlobalContext::TSK_Default)
+      : ID(ID), Ctx(Ctx), StackID(StackID), Active(false) {
+    if (ALLOW_DUMP)
+      push();
   }
-  TimerMarker(TimerIdT ID, const Cfg *Func);
+  TimerMarker(TimerIdT ID, const Cfg *Func,
+              TimerStackIdT StackID = GlobalContext::TSK_Default)
+      : ID(ID), Ctx(nullptr), StackID(StackID), Active(false) {
+    // Ctx gets set at the beginning of pushCfg().
+    if (ALLOW_DUMP)
+      pushCfg(Func);
+  }
 
   ~TimerMarker() {
     if (ALLOW_DUMP && Active)
-      Ctx->popTimer(ID);
+      Ctx->popTimer(ID, StackID);
   }
 
 private:
-  TimerIdT ID;
-  GlobalContext *const Ctx;
+  void push();
+  void pushCfg(const Cfg *Func);
+  const TimerIdT ID;
+  GlobalContext *Ctx;
+  const TimerStackIdT StackID;
   bool Active;
 };