Subzero: Basic Block Profiler.

BUG= None
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1147023007.
diff --git a/Makefile.standalone b/Makefile.standalone
index 6df7c40..82eff15 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -276,7 +276,7 @@
 $(OBJDIR)/unittest: $(OBJDIR)
 	@mkdir -p $@
 
-RT_SRC := runtime/szrt.c runtime/szrt_ll.ll
+RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c
 RT_OBJ := build/runtime/szrt_native_x8632.o build/runtime/szrt_sb_x8632.o
 
 runtime: $(RT_OBJ)
@@ -285,7 +285,7 @@
 # even in a parallel build.
 .INTERMEDIATE: runtime.is.built
 $(RT_OBJ): runtime.is.built
-runtime.is.built: $(RT_SRC)
+runtime.is.built: $(RT_SRC) pydir/build-runtime.py
 	@echo ================ Building Subzero runtime ================
 	./pydir/build-runtime.py -v --pnacl-root $(PNACL_TOOLCHAIN_ROOT)
 
diff --git a/pydir/build-runtime.py b/pydir/build-runtime.py
index 81c8ee2..4d0bb41 100755
--- a/pydir/build-runtime.py
+++ b/pydir/build-runtime.py
@@ -23,11 +23,18 @@
               '-o', obj
           ] + extra_args, echo=verbose)
     shellcmd(['objcopy',
-              '--localize-symbol=nacl_tp_tdb_offset',
-              '--localize-symbol=nacl_tp_tls_offset',
+              '--strip-symbol=nacl_tp_tdb_offset',
+              '--strip-symbol=nacl_tp_tls_offset',
               obj
         ], echo=verbose)
 
+def PartialLink(obj_files, extra_args, lib, verbose):
+    """Partially links a set of obj files into a final obj library."""
+    shellcmd(['ld',
+              '-o', lib,
+              '-r',
+        ] + extra_args + obj_files, echo=verbose)
+
 def main():
     """Build the Subzero runtime support library for all architectures.
     """
@@ -72,12 +79,30 @@
             ], echo=args.verbose)
         ll_files = ['{dir}/szrt.ll'.format(dir=tempdir),
                     '{srcdir}/szrt_ll.ll'.format(srcdir=srcdir)]
-        # Translate tempdir/szrt.ll and srcdir/szrt_ll.ll to szrt_native_x8632.o
+
+        # Translate tempdir/szrt.ll and srcdir/szrt_ll.ll to
+        # tempdir/szrt_native_x8632.tmp.o.
         Translate(ll_files,
                   ['-mtriple=i686', '-mcpu=pentium4m'],
-                  '{rtdir}/szrt_native_x8632.o'.format(rtdir=rtdir),
+                  '{dir}/szrt_native_x8632.tmp.o'.format(dir=tempdir),
                   args.verbose)
-        # Translate tempdir/szrt.ll and srcdir/szrt_ll.ll to szrt_sb_x8632.o
+        # Compile srcdir/szrt_profiler.c to tempdir/szrt_profiler_native_x8632.o
+        shellcmd(['clang',
+                  '-O2',
+                  '-target=i686',
+                  '-c',
+                  '{srcdir}/szrt_profiler.c'.format(srcdir=srcdir),
+                  '-o', '{dir}/szrt_profiler_native_x8632.o'.format(dir=tempdir)
+            ], echo=args.verbose)
+        # Partially link both objects into the full rtdir/szrt_native_x8632.o.
+        PartialLink(['{dir}/szrt_native_x8632.tmp.o'.format(dir=tempdir),
+                     '{dir}/szrt_profiler_native_x8632.o'.format(dir=tempdir)
+            ], ['-m elf_i386'],
+            '{rtdir}/szrt_native_x8632.o'.format(rtdir=rtdir), args.verbose)
+
+        # Translate tempdir/szrt.ll and srcdir/szrt_ll.ll to szrt_sb_x8632.o
+        # The sandboxed library does not get the profiler helper function as the
+        # binaries are linked with -nostdlib.
         Translate(ll_files,
                   ['-mtriple=i686-nacl', '-mcpu=pentium4m'],
                   '{rtdir}/szrt_sb_x8632.o'.format(rtdir=rtdir),
diff --git a/pydir/szbuild.py b/pydir/szbuild.py
index c962d4c..1e19cf3 100755
--- a/pydir/szbuild.py
+++ b/pydir/szbuild.py
@@ -84,7 +84,10 @@
                            choices=['obj', 'asm', 'iasm'],
                            help='Output file type.  Default %(default)s.')
     argparser.add_argument('--sandbox', dest='sandbox', action='store_true',
-                           help='Enabled sandboxing in the translator')
+                           help='Enable sandboxing in the translator')
+    argparser.add_argument('--enable-block-profile',
+                           dest='enable_block_profile', action='store_true',
+                           help='Enable basic block profiling.')
     argparser.add_argument('--verbose', '-v', dest='verbose',
                            action='store_true',
                            help='Display some extra debugging output')
@@ -217,6 +220,8 @@
                    '-ffunction-sections',
                    '-fdata-sections'] if hybrid else []) +
                  (['-sandbox'] if args.sandbox else []) +
+                 (['-enable-block-profile'] if
+                      args.enable_block_profile and not args.sandbox else []) +
                  args.sz_args +
                  [pexe],
                  echo=args.verbose)
diff --git a/runtime/szrt_profiler.c b/runtime/szrt_profiler.c
new file mode 100644
index 0000000..e31692e
--- /dev/null
+++ b/runtime/szrt_profiler.c
@@ -0,0 +1,59 @@
+#include <stdint.h>
+#include <stdio.h>
+
+struct BlockProfileInfo {
+  uint64_t Counter; /* Execution count; bumped by the instrumented code. */
+  const char *const BlockName; /* NUL-terminated name of the basic block. */
+} __attribute__((aligned(8)));
+
+extern const struct BlockProfileInfo *__Sz_block_profile_info;
+
+static const char SubzeroLogo[] =
+    "\n"
+    "\n"
+    "__________________________________________________________________________"
+    "____________________________\n"
+    " _____/\\\\\\\\\\\\\\\\\\\\\\__________________/"
+    "\\\\\\_______________________________________________________________\n"
+    "  "
+    "___/\\\\\\/////////\\\\\\_______________\\/"
+    "\\\\\\_______________________________________________________________\n"
+    "   "
+    "__\\//\\\\\\______\\///________________\\/"
+    "\\\\\\_______________________________________________________________\n"
+    "    "
+    "___\\////\\\\\\__________/\\\\\\____/\\\\\\_\\/\\\\\\_________/"
+    "\\\\\\\\\\\\\\\\\\\\\\_____/\\\\\\\\\\\\\\\\___/\\\\/\\\\\\\\\\\\\\____/"
+    "\\\\\\\\\\____\n"
+    "     "
+    "______\\////\\\\\\______\\/\\\\\\___\\/\\\\\\_\\/\\\\\\\\\\\\\\\\\\__\\///"
+    "////\\\\\\/____/\\\\\\/////\\\\\\_\\/\\\\\\/////\\\\\\_/\\\\\\///"
+    "\\\\\\__\n"
+    "      "
+    "_________\\////\\\\\\___\\/\\\\\\___\\/\\\\\\_\\/\\\\\\////\\\\\\______/"
+    "\\\\\\/_____/\\\\\\\\\\\\\\\\\\\\\\__\\/\\\\\\__\\///__/\\\\\\__\\//"
+    "\\\\\\_\n"
+    "       "
+    "__/\\\\\\______\\//\\\\\\__\\/\\\\\\___\\/\\\\\\_\\/\\\\\\__\\/\\\\\\____/"
+    "\\\\\\/______\\//\\\\///////___\\/\\\\\\_______\\//\\\\\\__/\\\\\\__\n"
+    "        "
+    "_\\///\\\\\\\\\\\\\\\\\\\\\\/___\\//\\\\\\\\\\\\\\\\\\__\\/"
+    "\\\\\\\\\\\\\\\\\\___/\\\\\\\\\\\\\\\\\\\\\\__\\//\\\\\\\\\\\\\\\\\\\\_\\/"
+    "\\\\\\________\\///\\\\\\\\\\/___\n"
+    "         "
+    "___\\///////////______\\/////////___\\/////////___\\///////////____\\/////"
+    "/////__\\///___________\\/////_____\n"
+    "          "
+    "__________________________________________________________________________"
+    "____________________________\n"
+    "\n"
+    "\n";
+
+/* Prints the logo, then one "<count>\t<name>" line per profile array entry. */
+void __Sz_profile_summary(void) {
+  printf("%s", SubzeroLogo);
+  const struct BlockProfileInfo **p = &__Sz_block_profile_info;
+  for (; *p != NULL; ++p) /* Counter is uint64_t, so print it unsigned. */
+    printf("%llu\t%s\n", (unsigned long long)(*p)->Counter, (*p)->BlockName);
+  fflush(stdout);
+}
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 50aa5ee..5c2f9ad 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -18,6 +18,7 @@
 #include "IceClFlags.h"
 #include "IceDefs.h"
 #include "IceELFObjectWriter.h"
+#include "IceGlobalInits.h"
 #include "IceInst.h"
 #include "IceLiveness.h"
 #include "IceOperand.h"
@@ -75,6 +76,69 @@
 // is used for dumping the stack frame location of Variables.
 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }
 
+namespace {
+constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
+constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
+
+// Creates the constant global that stores NodeAsmName as a C string.
+VariableDeclaration *nodeNameDeclaration(const IceString &NodeAsmName) {
+  auto *NameVar = VariableDeclaration::create();
+  NameVar->setName(BlockNameGlobalPrefix + NodeAsmName);
+  NameVar->setIsConstant(true);
+  NameVar->addInitializer(new VariableDeclaration::DataInitializer(
+      NodeAsmName.data(), NodeAsmName.size() + 1)); // +1 copies the NUL too.
+  NameVar->setAlignment(typeWidthInBytes(IceType_i64)); // Wasteful on 32-bit.
+  return NameVar;
+}
+
+// Builds a block's record: zeroed 64-bit counter, then a reloc to its name.
+VariableDeclaration *
+blockProfilingInfoDeclaration(const IceString &NodeAsmName,
+                              VariableDeclaration *NodeNameDeclaration) {
+  constexpr RelocOffsetT NameOffset = 0;
+  const SizeT CounterSize = typeWidthInBytes(IceType_i64);
+  auto *Info = VariableDeclaration::create();
+  Info->setName(BlockStatsGlobalPrefix + NodeAsmName);
+  Info->addInitializer(new VariableDeclaration::ZeroInitializer(CounterSize));
+  Info->addInitializer(new VariableDeclaration::RelocInitializer(
+      NodeNameDeclaration, NameOffset));
+  Info->setAlignment(CounterSize);
+  return Info;
+}
+
+} // end of anonymous namespace
+
+void Cfg::profileBlocks() {
+  if (!GlobalInits) // Lazily create the list that receives the new globals.
+    GlobalInits.reset(new VariableDeclarationList());
+  for (CfgNode *Node : Nodes) {
+    const IceString AsmName = Node->getAsmName();
+    VariableDeclaration *Name = nodeNameDeclaration(AsmName);
+    VariableDeclaration *Info = blockProfilingInfoDeclaration(AsmName, Name);
+    GlobalInits->push_back(Name);
+    GlobalInits->push_back(Info);
+    Node->profileExecutionCount(Info);
+  }
+}
+
+bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
+  return Var.getName().rfind(BlockStatsGlobalPrefix, 0) == 0; // Prefix test.
+}
+
+void Cfg::addCallToProfileSummary() {
+  // __Sz_profile_summary is defined in runtime/szrt_profiler.c. It takes no
+  // arguments and returns nothing, so the call has neither args nor a dest.
+  constexpr SizeT NoArgs = 0;
+  constexpr Variable *NoDest = nullptr;
+  constexpr bool IsTailCall = false;
+  Constant *Callee = Ctx->getConstantExternSym("__Sz_profile_summary");
+  auto *SummaryCall =
+      InstCall::create(this, NoArgs, NoDest, Callee, IsTailCall);
+  // The call becomes the first instruction of the entry block. translate()
+  // only requests it for functions whose name matches "exit".
+  getEntryNode()->getInsts().push_front(SummaryCall);
+}
+
 void Cfg::translate() {
   if (hasError())
     return;
@@ -99,6 +163,16 @@
 
   dump("Initial CFG");
 
+  if (getContext()->getFlags().getEnableBlockProfile()) {
+    profileBlocks();
+    // TODO(jpp): this is fragile, at best. Figure out a better way of detecting
+    // exit functions.
+    if (GlobalContext::matchSymbolName(getFunctionName(), "exit")) {
+      addCallToProfileSummary();
+    }
+    dump("Profiled CFG");
+  }
+
   // The set of translation passes and their order are determined by
   // the target.
   getTarget()->translate();
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 8f74d07..056812b 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -128,10 +128,17 @@
     return static_cast<T *>(TargetAssembler.get());
   }
   Assembler *releaseAssembler() { return TargetAssembler.release(); }
+  std::unique_ptr<VariableDeclarationList> getGlobalInits() {
+    return std::move(GlobalInits); // Ownership moves to the caller.
+  }
   bool hasComputedFrame() const;
   bool getFocusedTiming() const { return FocusedTiming; }
   void setFocusedTiming() { FocusedTiming = true; }
 
+  // Returns true if Var is a global variable that is used by the profiling
+  // code.
+  static bool isProfileGlobal(const VariableDeclaration &Var);
+
   // Passes over the CFG.
   void translate();
   // After the CFG is fully constructed, iterate over the nodes and
@@ -188,6 +195,15 @@
 private:
   Cfg(GlobalContext *Ctx, uint32_t SequenceNumber);
 
+  // Adds a call to the ProfileSummary runtime function as the first instruction
+  // in this CFG's entry block.
+  void addCallToProfileSummary();
+
+  // Iterates over the basic blocks in this CFG, adding profiling code to each
+  // one of them. The globals that the profiling code needs are appended to
+  // GlobalInits.
+  void profileBlocks();
+
   GlobalContext *Ctx;
   uint32_t SequenceNumber; // output order for emission
   VerboseMask VMask;
@@ -209,6 +225,8 @@
   std::unique_ptr<TargetLowering> Target;
   std::unique_ptr<VariablesMetadata> VMetadata;
   std::unique_ptr<Assembler> TargetAssembler;
+  // Globals required by this CFG. Mostly used for the profiler's globals.
+  std::unique_ptr<VariableDeclarationList> GlobalInits;
 
   // CurrentNode is maintained during dumping/emitting just for
   // validating Variable::DefNode.  Normally, a traversal over
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 87eee0f..68578b9 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -15,6 +15,7 @@
 #include "assembler.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
+#include "IceGlobalInits.h"
 #include "IceInst.h"
 #include "IceLiveness.h"
 #include "IceOperand.h"
@@ -1243,4 +1244,31 @@
   }
 }
 
+void CfgNode::profileExecutionCount(VariableDeclaration *Var) {
+  constexpr char RMW_I64[] = "llvm.nacl.atomic.rmw.i64";
+  // Prepends to this node an atomic 64-bit add of 1 to the global named by Var.
+  GlobalContext *Context = Func->getContext();
+  // The intrinsic must be known to the context; look up its descriptor.
+  bool BadIntrinsic = false;
+  const Intrinsics::FullIntrinsicInfo *Info =
+      Context->getIntrinsicsInfo().find(RMW_I64, BadIntrinsic);
+  assert(!BadIntrinsic);
+  assert(Info != nullptr);
+  // Call operands: RMW operation, counter symbol, increment, memory order.
+  Operand *RMWI64Name = Context->getConstantExternSym(RMW_I64);
+  Constant *Counter = Context->getConstantExternSym(Var->getName());
+  Constant *AtomicRMWOp = Context->getConstantInt32(Intrinsics::AtomicAdd);
+  Constant *One = Context->getConstantInt64(1);
+  Constant *OrderAcquireRelease =
+      Context->getConstantInt32(Intrinsics::MemoryOrderAcquireRelease);
+  // NOTE(review): create() is passed 5 while 4 args are added -- confirm.
+  InstIntrinsicCall *Inst = InstIntrinsicCall::create(
+      Func, 5, Func->makeVariable(IceType_i64), RMWI64Name, Info->Info);
+  Inst->addArg(AtomicRMWOp);
+  Inst->addArg(Counter);
+  Inst->addArg(One);
+  Inst->addArg(OrderAcquireRelease);
+  Insts.push_front(Inst);
+}
+
 } // end of namespace Ice
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index e4fe2f9..473c47e 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -91,6 +91,8 @@
   void emitIAS(Cfg *Func) const;
   void dump(Cfg *Func) const;
 
+  void profileExecutionCount(VariableDeclaration *Var);
+
 private:
   CfgNode(Cfg *Func, SizeT LabelIndex);
   Cfg *const Func;
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 203b54e..069c3e3 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -65,6 +65,12 @@
     DumpStats("szstats",
               cl::desc("Print statistics after translating each function"));
 
+cl::opt<bool> EnableBlockProfile(
+    "enable-block-profile",
+    cl::desc("If true, instrument basic blocks, and output profiling "
+             "information to stdout at the end of program execution."),
+    cl::init(false));
+
 cl::opt<bool>
     FunctionSections("ffunction-sections",
                      cl::desc("Emit functions into separate sections"));
@@ -261,6 +267,7 @@
   OutFlags.DisableIRGeneration = false;
   OutFlags.DisableTranslation = false;
   OutFlags.DumpStats = false;
+  OutFlags.EnableBlockProfile = false;
   OutFlags.FunctionSections = false;
   OutFlags.GenerateUnitTestMessages = false;
   OutFlags.PhiEdgeSplit = false;
@@ -311,6 +318,7 @@
   OutFlags.setDisableIRGeneration(::DisableIRGeneration);
   OutFlags.setDisableTranslation(::DisableTranslation);
   OutFlags.setDumpStats(::DumpStats);
+  OutFlags.setEnableBlockProfile(::EnableBlockProfile);
   OutFlags.setFunctionSections(::FunctionSections);
   OutFlags.setNumTranslationThreads(::NumThreads);
   OutFlags.setOptLevel(::OLevel);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 517c77f..7df6973 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -65,6 +65,9 @@
   bool getDumpStats() const { return ALLOW_DUMP && DumpStats; }
   void setDumpStats(bool NewValue) { DumpStats = NewValue; }
 
+  bool getEnableBlockProfile() const { return EnableBlockProfile; }
+  void setEnableBlockProfile(bool NewValue) { EnableBlockProfile = NewValue; }
+
   bool getFunctionSections() const { return FunctionSections; }
   void setFunctionSections(bool NewValue) { FunctionSections = NewValue; }
 
@@ -182,6 +185,7 @@
   bool DisableIRGeneration;
   bool DisableTranslation;
   bool DumpStats;
+  bool EnableBlockProfile;
   bool FunctionSections;
   bool GenerateUnitTestMessages;
   bool PhiEdgeSplit;
diff --git a/src/IceELFObjectWriter.cpp b/src/IceELFObjectWriter.cpp
index aab663c..9761dde 100644
--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp
@@ -383,9 +383,8 @@
       for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
         switch (Init->getKind()) {
         case VariableDeclaration::Initializer::DataInitializerKind: {
-          const auto Data =
-              llvm::cast<VariableDeclaration::DataInitializer>(Init)
-                  ->getContents();
+          const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
+                                Init)->getContents();
           Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
           break;
         }
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 8de57f3..3b54c03 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/Timer.h"
 
 #include "IceCfg.h"
+#include "IceCfgNode.h"
 #include "IceClFlags.h"
 #include "IceDefs.h"
 #include "IceELFObjectWriter.h"
@@ -277,6 +278,7 @@
       Cfg::setCurrentCfg(nullptr);
       continue; // Func goes out of scope and gets deleted
     }
+
     Func->translate();
     EmitterWorkItem *Item = nullptr;
     if (Func->hasError()) {
@@ -285,6 +287,7 @@
       getStrError() << "ICE translation error: " << Func->getFunctionName()
                     << ": " << Func->getError() << "\n";
       Item = new EmitterWorkItem(Func->getSequenceNumber());
+      Item->setGlobalInits(Func->getGlobalInits());
     } else {
       Func->getAssembler<>()->setInternal(Func->getInternal());
       switch (getFlags().getOutFileType()) {
@@ -299,11 +302,15 @@
         // Copy relevant fields into Asm before Func is deleted.
         Asm->setFunctionName(Func->getFunctionName());
         Item = new EmitterWorkItem(Func->getSequenceNumber(), Asm);
+        Item->setGlobalInits(Func->getGlobalInits());
       } break;
       case FT_Asm:
         // The Cfg has not been emitted yet, so stats are not ready
         // to be dumped.
+        std::unique_ptr<VariableDeclarationList> GlobalInits =
+            Func->getGlobalInits();
         Item = new EmitterWorkItem(Func->getSequenceNumber(), Func.release());
+        Item->setGlobalInits(std::move(GlobalInits));
         break;
       }
     }
@@ -316,6 +323,43 @@
 
 namespace {
 
+// Adds an array of pointers to all the profiler-generated globals. The
+// __Sz_profile_summary function iterates over this array for printing the
+// profiling counters.
+VariableDeclaration *blockProfileInfo(const VariableDeclarationList &Globals) {
+  constexpr RelocOffsetT CounterOffset = 0;
+  const SizeT SentinelSize = typeWidthInBytes(IceType_i64);
+
+  // The symbol name is shared with runtime/szrt_profiler.c; keep both in sync.
+  auto *InfoArray = VariableDeclaration::create();
+  InfoArray->setAlignment(typeWidthInBytes(IceType_i64));
+  InfoArray->setIsConstant(true);
+  InfoArray->setName("__Sz_block_profile_info");
+  InfoArray->setLinkage(llvm::GlobalValue::ExternalLinkage);
+
+  // One pointer (relocation) per profiler-generated block-info global.
+  for (const VariableDeclaration *Global : Globals) {
+    if (!Cfg::isProfileGlobal(*Global))
+      continue;
+    InfoArray->addInitializer(
+        new VariableDeclaration::RelocInitializer(Global, CounterOffset));
+  }
+
+  // Terminate the array with a 64-bit zero entry; on 32-bit architectures
+  // this wastes 4 bytes.
+  InfoArray->addInitializer(
+      new VariableDeclaration::ZeroInitializer(SentinelSize));
+  return InfoArray;
+}
+
+void addBlockProfileInfoArrayToGlobals(VariableDeclarationList *Globals) {
+  // Purposefully create the Var temp to prevent bugs in case the compiler
+  // reorders instructions in a way that Globals is extended before the call
+  // to blockProfileInfo.
+  VariableDeclaration *Var = blockProfileInfo(*Globals);
+  Globals->push_back(Var);
+}
+
 void lowerGlobals(GlobalContext *Ctx,
                   std::unique_ptr<VariableDeclarationList> VariableDeclarations,
                   TargetDataLowering *DataLowering) {
@@ -331,6 +375,13 @@
   }
   if (Ctx->getFlags().getDisableTranslation())
     return;
+
+  // There should be no need to emit the block_profile_info array if profiling
+  // is disabled. In practice, given that szrt_profiler.o will always be
+  // embedded in the application, we need to add it. In a non-profiled build
+  // this array will only contain the nullptr terminator.
+  addBlockProfileInfoArrayToGlobals(VariableDeclarations.get());
+
   DataLowering->lowerGlobals(std::move(VariableDeclarations));
 }
 
@@ -340,6 +391,13 @@
     Pending.resize(Index + 1);
 }
 
+void addAllIfNotNull(std::unique_ptr<VariableDeclarationList> src,
+                     VariableDeclarationList *dst) {
+  if (src == nullptr) // No list was attached; nothing to append.
+    return;
+  dst->insert(dst->end(), src->begin(), src->end());
+}
+
 } // end of anonymous namespace
 
 void GlobalContext::emitItems() {
@@ -350,6 +408,8 @@
   // the work queue, and if it's not the item we're waiting for, we
   // insert it into Pending and repeat.  The work item is deleted
   // after it is processed.
+  std::unique_ptr<VariableDeclarationList> GlobalInits(
+      new VariableDeclarationList());
   std::vector<EmitterWorkItem *> Pending;
   uint32_t DesiredSequenceNumber = getFirstSequenceNumber();
   while (true) {
@@ -359,7 +419,7 @@
     if (RawItem == nullptr)
       RawItem = emitQueueBlockingPop();
     if (RawItem == nullptr)
-      return;
+      break;
     uint32_t ItemSeq = RawItem->getSequenceNumber();
     if (Threaded && ItemSeq != DesiredSequenceNumber) {
       resizePending(Pending, ItemSeq);
@@ -373,10 +433,10 @@
     case EmitterWorkItem::WI_Nop:
       break;
     case EmitterWorkItem::WI_GlobalInits: {
-      lowerGlobals(this, Item->getGlobalInits(),
-                   TargetDataLowering::createLowering(this).get());
+      addAllIfNotNull(Item->getGlobalInits(), GlobalInits.get());
     } break;
     case EmitterWorkItem::WI_Asm: {
+      addAllIfNotNull(Item->getGlobalInits(), GlobalInits.get());
       std::unique_ptr<Assembler> Asm = Item->getAsm();
       Asm->alignFunction();
       IceString MangledName = mangleName(Asm->getFunctionName());
@@ -398,6 +458,9 @@
     case EmitterWorkItem::WI_Cfg: {
       if (!ALLOW_DUMP)
         llvm::report_fatal_error("WI_Cfg work item created inappropriately");
+
+      addAllIfNotNull(Item->getGlobalInits(), GlobalInits.get());
+
       assert(getFlags().getOutFileType() == FT_Asm);
       std::unique_ptr<Cfg> Func = Item->getCfg();
       // Unfortunately, we have to temporarily install the Cfg in TLS
@@ -410,6 +473,9 @@
     } break;
     }
   }
+
+  lowerGlobals(this, std::move(GlobalInits),
+               TargetDataLowering::createLowering(this).get());
 }
 
 // Scan a string for S[0-9A-Z]*_ patterns and replace them with
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index b8b633a..90bb0e3 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -3119,11 +3119,10 @@
       Func->setError("Unexpected memory ordering for AtomicRMW");
       return;
     }
-    lowerAtomicRMW(
-        Instr->getDest(),
-        static_cast<uint32_t>(
-            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
-        Instr->getArg(1), Instr->getArg(2));
+    lowerAtomicRMW(Instr->getDest(),
+                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
+                                             Instr->getArg(0))->getValue()),
+                   Instr->getArg(1), Instr->getArg(2));
     return;
   case Intrinsics::AtomicStore: {
     if (!Intrinsics::isMemoryOrderValid(
diff --git a/src/IceThreading.cpp b/src/IceThreading.cpp
index 5576abb..5cdaaa8 100644
--- a/src/IceThreading.cpp
+++ b/src/IceThreading.cpp
@@ -30,8 +30,15 @@
     : Sequence(Seq), Kind(WI_Cfg), GlobalInits(nullptr), Function(nullptr),
       RawFunc(F) {}
 
+void EmitterWorkItem::setGlobalInits(
+    std::unique_ptr<VariableDeclarationList> GloblInits) {
+  assert(getKind() == WI_Asm || getKind() == WI_Cfg); // Kind precondition.
+  GlobalInits = std::move(GloblInits); // Takes ownership of the list.
+}
+
 std::unique_ptr<VariableDeclarationList> EmitterWorkItem::getGlobalInits() {
-  assert(getKind() == WI_GlobalInits);
+  assert(getKind() == WI_GlobalInits || getKind() == WI_Asm ||
+         getKind() == WI_Cfg);
   return std::move(GlobalInits);
 }
 
diff --git a/src/IceThreading.h b/src/IceThreading.h
index 9ae3b67..35e1bfb 100644
--- a/src/IceThreading.h
+++ b/src/IceThreading.h
@@ -190,6 +190,7 @@
   EmitterWorkItem(uint32_t Seq, Cfg *F);
   uint32_t getSequenceNumber() const { return Sequence; }
   ItemKind getKind() const { return Kind; }
+  void setGlobalInits(std::unique_ptr<VariableDeclarationList> GloblInits);
   std::unique_ptr<VariableDeclarationList> getGlobalInits();
   std::unique_ptr<Assembler> getAsm();
   std::unique_ptr<Cfg> getCfg();