Subzero: Add locking to prepare for multithreaded translation.

This just gets the locking in place.  Actual multithreading will be added later.

Mutexes are added for accessing the GlobalContext allocator, the constant pool, the stats data, and the profiling timers.  These are managed via the LockedPtr<> helper.  Finer-grained locks on the constant pool may be added later, i.e. a separate lock for each data type.

A vector of pointers to TLS objects is added to GlobalContext.  Each new thread will get its own TLS object, whose address is added to the vector.  (After threads complete, things like stats can be combined by iterating over the vector.)

The dump/emit streams are guarded by a separate lock, to avoid fine-grain interleaving of output by multiple threads.  E.g., lock the streams, emit an entire function, and unlock the streams.  This works for dumping too, though dump output for different passes on the same function may be interleaved with that of another thread.  There is an OstreamLocker helper class to keep this simple.

CodeStats is made an inner class of GlobalContext (this was missed on a previous CL).

BUG= none
R=jfb@chromium.org, jvoung@chromium.org, kschimpf@google.com

Review URL: https://codereview.chromium.org/848193003
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index cf2478f..ab9c814 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -16,6 +16,7 @@
 #define SUBZERO_SRC_ICEGLOBALCONTEXT_H
 
 #include <memory>
+#include <mutex>
 
 #include "IceDefs.h"
 #include "IceClFlags.h"
@@ -27,40 +28,74 @@
 namespace Ice {
 
 class ClFlags;
+class ConstantPool;
 class FuncSigType;
 
-// This class collects rudimentary statistics during translation.
-class CodeStats {
-  CodeStats(const CodeStats &) = delete;
-  CodeStats &operator=(const CodeStats &) = default;
+typedef std::mutex GlobalLockType;
+
+// LockedPtr provides locked access to an object for as long as it lives.
+template <typename T> class LockedPtr {
+  LockedPtr() = delete;
+  LockedPtr(const LockedPtr &) = delete;
+  LockedPtr &operator=(const LockedPtr &) = delete;
 
 public:
-  CodeStats()
-      : InstructionsEmitted(0), RegistersSaved(0), FrameBytes(0), Spills(0),
-        Fills(0) {}
-  void reset() { *this = CodeStats(); }
-  void updateEmitted(uint32_t InstCount) { InstructionsEmitted += InstCount; }
-  void updateRegistersSaved(uint32_t Num) { RegistersSaved += Num; }
-  void updateFrameBytes(uint32_t Bytes) { FrameBytes += Bytes; }
-  void updateSpills() { ++Spills; }
-  void updateFills() { ++Fills; }
-  void dump(const IceString &Name, Ostream &Str);
+  LockedPtr(T *Value, GlobalLockType *Lock) : Value(Value), Lock(Lock) {
+    Lock->lock(); // Held until the owning LockedPtr is destroyed.
+  }
+  LockedPtr(LockedPtr &&Other) : Value(Other.Value), Lock(Other.Lock) {
+    Other.Value = nullptr;
+    Other.Lock = nullptr; // Ownership of the held lock moves to *this.
+  }
+  ~LockedPtr() { if (Lock) Lock->unlock(); } // Null when moved from.
+  T *operator->() const { return Value; }
 
 private:
-  uint32_t InstructionsEmitted;
-  uint32_t RegistersSaved;
-  uint32_t FrameBytes;
-  uint32_t Spills;
-  uint32_t Fills;
+  T *Value; // Guarded object; null after being moved from.
+  GlobalLockType *Lock; // Mutex held by this object; null after a move.
 };
 
-// TODO: Accesses to all non-const fields of GlobalContext need to
-// be synchronized, especially the constant pool, the allocator, and
-// the output streams.
 class GlobalContext {
   GlobalContext(const GlobalContext &) = delete;
   GlobalContext &operator=(const GlobalContext &) = delete;
 
+  // CodeStats collects rudimentary statistics during translation.
+  class CodeStats {
+    CodeStats(const CodeStats &) = delete;
+    CodeStats &operator=(const CodeStats &) = default; // Used by reset().
+
+  public:
+    CodeStats()
+        : InstructionsEmitted(0), RegistersSaved(0), FrameBytes(0), Spills(0),
+          Fills(0) {}
+    void reset() { *this = CodeStats(); } // Zeroes every counter.
+    void updateEmitted(uint32_t InstCount) { InstructionsEmitted += InstCount; }
+    void updateRegistersSaved(uint32_t Num) { RegistersSaved += Num; }
+    void updateFrameBytes(uint32_t Bytes) { FrameBytes += Bytes; }
+    void updateSpills() { ++Spills; }
+    void updateFills() { ++Fills; }
+    void dump(const IceString &Name, Ostream &Str);
+
+  private:
+    uint32_t InstructionsEmitted; // Running sum from updateEmitted().
+    uint32_t RegistersSaved;      // Running sum from updateRegistersSaved().
+    uint32_t FrameBytes;          // Running sum from updateFrameBytes().
+    uint32_t Spills;              // Number of updateSpills() calls.
+    uint32_t Fills;               // Number of updateFills() calls.
+  };
+
+  // ThreadContext contains thread-local data.  This data can be
+  // combined/reduced as needed after all threads complete.
+  class ThreadContext {
+    ThreadContext(const ThreadContext &) = delete;
+    ThreadContext &operator=(const ThreadContext &) = delete;
+
+  public:
+    ThreadContext() {}
+    CodeStats StatsFunction; // Cleared by GlobalContext::resetStats().
+    std::vector<TimerStack> Timers; // Presumably per-thread timer stacks.
+  };
+
 public:
   GlobalContext(Ostream *OsDump, Ostream *OsEmit, ELFStreamer *ELFStreamer,
                 VerboseMask Mask, TargetArch Arch, OptLevel Opt,
@@ -76,6 +111,19 @@
   void addVerbose(VerboseMask Mask) { VMask |= Mask; }
   void subVerbose(VerboseMask Mask) { VMask &= ~Mask; }
 
+  // The dump and emit streams need to be used by only one thread at a
+  // time.  This is done by exclusively reserving the streams via
+  // lockStr() and unlockStr().  The OstreamLocker class can be used
+  // to conveniently manage this.
+  //
+  // The model is that a thread grabs the stream lock, then does an
+  // arbitrary amount of work during which far-away callees may grab
+  // the stream and do something with it, and finally the thread
+  // releases the stream lock.  This allows large chunks of output to
+  // be dumped or emitted without risking interleaving from multiple
+  // threads.
+  void lockStr() { StrLock.lock(); }
+  void unlockStr() { StrLock.unlock(); }
   Ostream &getStrDump() { return *StrDump; }
   Ostream &getStrEmit() { return *StrEmit; }
 
@@ -109,7 +157,7 @@
   Constant *getConstantZero(Type Ty);
   // getConstantPool() returns a copy of the constant pool for
   // constants of a given type.
-  ConstantList getConstantPool(Type Ty) const;
+  ConstantList getConstantPool(Type Ty);
   // Returns a new function declaration, allocated in an internal
   // memory pool.  Ownership of the function is maintained by this
   // class instance.
@@ -129,7 +177,7 @@
   }
 
   // Allocate data of type T using the global allocator.
-  template <typename T> T *allocate() { return Allocator.Allocate<T>(); }
+  template <typename T> T *allocate() { return getAllocator()->Allocate<T>(); }
 
   const Intrinsics &getIntrinsicsInfo() const { return IntrinsicsInfo; }
 
@@ -142,38 +190,38 @@
   // Reset stats at the beginning of a function.
   void resetStats() {
     if (ALLOW_DUMP)
-      StatsFunction.reset();
+      TLS->StatsFunction.reset();
   }
   void dumpStats(const IceString &Name, bool Final = false);
   void statsUpdateEmitted(uint32_t InstCount) {
-    if (!ALLOW_DUMP)
+    if (!ALLOW_DUMP || !getFlags().DumpStats)
       return;
-    StatsFunction.updateEmitted(InstCount);
-    StatsCumulative.updateEmitted(InstCount);
+    TLS->StatsFunction.updateEmitted(InstCount);
+    getStatsCumulative()->updateEmitted(InstCount);
   }
   void statsUpdateRegistersSaved(uint32_t Num) {
-    if (!ALLOW_DUMP)
+    if (!ALLOW_DUMP || !getFlags().DumpStats)
       return;
-    StatsFunction.updateRegistersSaved(Num);
-    StatsCumulative.updateRegistersSaved(Num);
+    TLS->StatsFunction.updateRegistersSaved(Num);
+    getStatsCumulative()->updateRegistersSaved(Num);
   }
   void statsUpdateFrameBytes(uint32_t Bytes) {
-    if (!ALLOW_DUMP)
+    if (!ALLOW_DUMP || !getFlags().DumpStats)
       return;
-    StatsFunction.updateFrameBytes(Bytes);
-    StatsCumulative.updateFrameBytes(Bytes);
+    TLS->StatsFunction.updateFrameBytes(Bytes);
+    getStatsCumulative()->updateFrameBytes(Bytes);
   }
   void statsUpdateSpills() {
-    if (!ALLOW_DUMP)
+    if (!ALLOW_DUMP || !getFlags().DumpStats)
       return;
-    StatsFunction.updateSpills();
-    StatsCumulative.updateSpills();
+    TLS->StatsFunction.updateSpills();
+    getStatsCumulative()->updateSpills();
   }
   void statsUpdateFills() {
-    if (!ALLOW_DUMP)
+    if (!ALLOW_DUMP || !getFlags().DumpStats)
       return;
-    StatsFunction.updateFills();
-    StatsCumulative.updateFills();
+    TLS->StatsFunction.updateFills();
+    getStatsCumulative()->updateFills();
   }
 
   // These are predefined TimerStackIdT values.
@@ -183,8 +231,8 @@
     TSK_Num
   };
 
-  TimerIdT getTimerID(TimerStackIdT StackID, const IceString &Name);
   TimerStackIdT newTimerStackID(const IceString &Name);
+  TimerIdT getTimerID(TimerStackIdT StackID, const IceString &Name);
   void pushTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
   void popTimer(TimerIdT ID, TimerStackIdT StackID = TSK_Default);
   void resetTimer(TimerStackIdT StackID);
@@ -193,12 +241,24 @@
                   bool DumpCumulative = true);
 
 private:
+  // Try to make sure the mutexes are allocated on separate cache
+  // lines, assuming the maximum cache line size is 64.
+  const static size_t MaxCacheLineSize = 64;
+  alignas(MaxCacheLineSize) GlobalLockType AllocLock;
+  alignas(MaxCacheLineSize) GlobalLockType ConstPoolLock;
+  alignas(MaxCacheLineSize) GlobalLockType StatsLock;
+  alignas(MaxCacheLineSize) GlobalLockType TimerLock;
+
+  // StrLock is a global lock on the dump and emit output streams.
+  typedef std::mutex StrLockType;
+  StrLockType StrLock;
+
   Ostream *StrDump; // Stream for dumping / diagnostics
   Ostream *StrEmit; // Stream for code emission
 
   ArenaAllocator<> Allocator;
   VerboseMask VMask;
-  std::unique_ptr<class ConstantPool> ConstPool;
+  std::unique_ptr<ConstantPool> ConstPool;
   Intrinsics IntrinsicsInfo;
   const TargetArch Arch;
   const OptLevel Opt;
@@ -206,11 +266,28 @@
   const ClFlags &Flags;
   RandomNumberGenerator RNG;
   std::unique_ptr<ELFObjectWriter> ObjectWriter;
-  CodeStats StatsFunction;
   CodeStats StatsCumulative;
   std::vector<TimerStack> Timers;
   std::vector<GlobalDeclaration *> GlobalDeclarations;
 
+  LockedPtr<ArenaAllocator<>> getAllocator() {
+    return LockedPtr<ArenaAllocator<>>(&Allocator, &AllocLock);
+  }
+  LockedPtr<ConstantPool> getConstPool() {
+    return LockedPtr<ConstantPool>(ConstPool.get(), &ConstPoolLock);
+  }
+  LockedPtr<CodeStats> getStatsCumulative() {
+    return LockedPtr<CodeStats>(&StatsCumulative, &StatsLock);
+  }
+  LockedPtr<std::vector<TimerStack>> getTimers() {
+    return LockedPtr<std::vector<TimerStack>>(&Timers, &TimerLock);
+  }
+
+  std::vector<ThreadContext *> AllThreadContexts;
+  // Each thread has its own TLS pointer which is also held in
+  // AllThreadContexts.
+  thread_local static ThreadContext *TLS;
+
   // Private helpers for mangleName()
   typedef llvm::SmallVector<char, 32> ManglerVector;
   void incrementSubstitutions(ManglerVector &OldName) const;
@@ -245,6 +322,22 @@
   bool Active;
 };
 
+// Scoped holder of the stream lock: the constructor calls Ctx->lockStr()
+// and the destructor calls Ctx->unlockStr().
+class OstreamLocker {
+private:
+  OstreamLocker() = delete;
+  OstreamLocker(const OstreamLocker &) = delete;
+  OstreamLocker &operator=(const OstreamLocker &) = delete;
+
+public:
+  explicit OstreamLocker(GlobalContext *Ctx) : Ctx(Ctx) { Ctx->lockStr(); }
+  ~OstreamLocker() { Ctx->unlockStr(); } // Releases the stream lock.
+
+private:
+  GlobalContext *const Ctx; // Context whose stream lock this object holds.
+};
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEGLOBALCONTEXT_H