Improve use of CfgLocalAllocator and introduce containers that use it.

This doesn't make a big difference but does reduce the proportion of time spent
in malloc and free.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1349833005 .
diff --git a/Makefile.standalone b/Makefile.standalone
index 9469f81..12baf72 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -121,6 +121,7 @@
 endif
 
 ifdef MSAN
+  # TODO(ascull): this has an as-yet-undiagnosed uninitialized memory access
   OBJDIR := $(OBJDIR)+MSan
   CXX_EXTRA += -fsanitize=memory
   LD_EXTRA += -fsanitize=memory
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 4c703cf..02d4503 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -314,13 +314,13 @@
 void Cfg::reorderNodes() {
   // TODO(ascull): it would be nice if the switch tests were always followed by
   // the default case to allow for fall through.
-  using PlacedList = std::list<CfgNode *>;
+  using PlacedList = CfgList<CfgNode *>;
   PlacedList Placed;      // Nodes with relative placement locked down
   PlacedList Unreachable; // Unreachable nodes
   PlacedList::iterator NoPlace = Placed.end();
   // Keep track of where each node has been tentatively placed so that we can
   // manage insertions into the middle.
-  std::vector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
+  CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
   for (CfgNode *Node : Nodes) {
     // The "do ... while(0);" construct is to factor out the --PlaceIndex and
     // assert() statements before moving to the next node.
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 4147dd9..62a8914 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -273,7 +273,7 @@
   std::unique_ptr<Assembler> TargetAssembler;
   /// Globals required by this CFG. Mostly used for the profiler's globals.
   std::unique_ptr<VariableDeclarationList> GlobalInits;
-  std::vector<InstJumpTable *> JumpTables;
+  CfgVector<InstJumpTable *> JumpTables;
 
   /// CurrentNode is maintained during dumping/emitting just for validating
   /// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 31a6e8a..dbbfc18 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -827,7 +827,7 @@
 // Helper functions for emit().
 
 void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node,
-                       bool IsLiveIn, std::vector<SizeT> &LiveRegCount) {
+                       bool IsLiveIn, CfgVector<SizeT> &LiveRegCount) {
   if (!BuildDefs::dump())
     return;
   Liveness *Liveness = Func->getLiveness();
@@ -840,7 +840,7 @@
     Str << "\t\t\t\t# LiveOut=";
   }
   if (!Live->empty()) {
-    std::vector<Variable *> LiveRegs;
+    CfgVector<Variable *> LiveRegs;
     for (SizeT i = 0; i < Live->size(); ++i) {
       if ((*Live)[i]) {
         Variable *Var = Liveness->getVariable(i, Node);
@@ -869,7 +869,7 @@
 }
 
 void emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
-                         std::vector<SizeT> &LiveRegCount) {
+                         CfgVector<SizeT> &LiveRegCount) {
   if (!BuildDefs::dump())
     return;
   bool First = true;
@@ -935,7 +935,7 @@
   // each register is assigned to. Normally that would be only 0 or 1, but the
   // register allocator's AllowOverlap inference allows it to be greater than 1
   // for short periods.
-  std::vector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
+  CfgVector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
   if (DecorateAsm) {
     constexpr bool IsLiveIn = true;
     emitRegisterUsage(Str, Func, this, IsLiveIn, LiveRegCount);
diff --git a/src/IceDefs.h b/src/IceDefs.h
index a38da03..5f96d7e 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -145,10 +145,14 @@
 using PhiList = InstList;
 using AssignList = InstList;
 
+// Standard library containers with CfgLocalAllocator.
+template <typename T> using CfgVector = std::vector<T, CfgLocalAllocator<T>>;
+template <typename T> using CfgList = std::list<T, CfgLocalAllocator<T>>;
+
 // Containers that are arena-allocated from the Cfg's allocator.
-using OperandList = std::vector<Operand *, CfgLocalAllocator<Operand *>>;
-using VarList = std::vector<Variable *, CfgLocalAllocator<Variable *>>;
-using NodeList = std::vector<CfgNode *, CfgLocalAllocator<CfgNode *>>;
+using OperandList = CfgVector<Operand *>;
+using VarList = CfgVector<Variable *>;
+using NodeList = CfgVector<CfgNode *>;
 
 // Containers that use the default (global) allocator.
 using ConstantList = std::vector<Constant *>;
@@ -168,8 +172,7 @@
 /// value, giving the instruction number that begins or ends a variable's live
 /// range.
 using LiveBeginEndMapEntry = std::pair<SizeT, InstNumberT>;
-using LiveBeginEndMap =
-    std::vector<LiveBeginEndMapEntry, CfgLocalAllocator<LiveBeginEndMapEntry>>;
+using LiveBeginEndMap = CfgVector<LiveBeginEndMapEntry>;
 using LivenessBV = llvm::BitVector;
 
 using TimerStackIdT = uint32_t;
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 7b7183d..c7497da 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -722,10 +722,8 @@
   llvm::DeleteContainerPointers(AllThreadContexts);
   LockedPtr<DestructorArray> Dtors = getDestructors();
   // Destructors are invoked in the opposite object construction order.
-  for (auto DtorIter = Dtors->crbegin(); DtorIter != Dtors->crend();
-       ++DtorIter) {
-    (*DtorIter)();
-  }
+  for (const auto &Dtor : reverse_range(*Dtors))
+    Dtor();
 }
 
 // TODO(stichnot): Consider adding thread-local caches of constant pool entries
diff --git a/src/IceInst.h b/src/IceInst.h
index a727683..fcd7269 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -161,10 +161,7 @@
   void dumpDest(const Cfg *Func) const;
   virtual bool isRedundantAssign() const { return false; }
 
-  // TODO(jpp): Insts should not have non-trivial destructors, but they
-  // currently do. This dtor is marked final as a multi-step refactor that
-  // will eventually fix this problem.
-  virtual ~Inst() = default;
+  ~Inst() = default;
 
 protected:
   Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest);
diff --git a/src/IceLiveness.h b/src/IceLiveness.h
index bd739d3..535ed02 100644
--- a/src/IceLiveness.h
+++ b/src/IceLiveness.h
@@ -47,7 +47,7 @@
     // LiveToVarMap maps a liveness bitvector index to a Variable. This is
     // generally just for printing/dumping. The index should be less than
     // NumLocals + Liveness::NumGlobals.
-    std::vector<Variable *> LiveToVarMap;
+    CfgVector<Variable *> LiveToVarMap;
     // LiveIn and LiveOut track the in- and out-liveness of the global
     // variables. The size of each vector is LivenessNode::NumGlobals.
     LivenessBV LiveIn, LiveOut;
@@ -107,13 +107,13 @@
   LivenessMode Mode;
   SizeT NumGlobals = 0;
   /// Size of Nodes is Cfg::Nodes.size().
-  std::vector<LivenessNode> Nodes;
+  CfgVector<LivenessNode> Nodes;
   /// VarToLiveMap maps a Variable's Variable::Number to its live index within
   /// its basic block.
-  std::vector<SizeT> VarToLiveMap;
+  CfgVector<SizeT> VarToLiveMap;
   /// LiveToVarMap is analogous to LivenessNode::LiveToVarMap, but for non-local
   /// variables.
-  std::vector<Variable *> LiveToVarMap;
+  CfgVector<Variable *> LiveToVarMap;
   /// RangeMask[Variable::Number] indicates whether we want to track that
   /// Variable's live range.
   llvm::BitVector RangeMask;
diff --git a/src/IceLoopAnalyzer.h b/src/IceLoopAnalyzer.h
index 19d38d1..8932e08 100644
--- a/src/IceLoopAnalyzer.h
+++ b/src/IceLoopAnalyzer.h
@@ -88,9 +88,8 @@
     bool Deleted = false;
   };
 
-  using LoopNodeList = std::vector<LoopNode, CfgLocalAllocator<LoopNode>>;
-  using LoopNodePtrList =
-      std::vector<LoopNode *, CfgLocalAllocator<LoopNode *>>;
+  using LoopNodeList = CfgVector<LoopNode>;
+  using LoopNodePtrList = CfgVector<LoopNode *>;
 
   /// Process the node as part of Tarjan's algorithm and return either a
   /// node to recurse into or nullptr when the node has been fully processed.
diff --git a/src/IceOperand.h b/src/IceOperand.h
index b4e06be..67b1d23 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -85,12 +85,13 @@
   }
   /// @}
 
+  ~Operand() = default;
+
 protected:
   Operand(OperandKind Kind, Type Ty) : Ty(Ty), Kind(Kind) {
     // It is undefined behavior to have a larger value in the enum
     assert(Kind <= kTarget_Max);
   }
-  virtual ~Operand() = default;
 
   const Type Ty;
   const OperandKind Kind;
@@ -354,7 +355,7 @@
   LiveRange() = default;
   /// Special constructor for building a kill set. The advantage is that we can
   /// reserve the right amount of space in advance.
-  explicit LiveRange(const std::vector<InstNumberT> &Kills) {
+  explicit LiveRange(const CfgVector<InstNumberT> &Kills) {
     Range.reserve(Kills.size());
     for (InstNumberT I : Kills)
       addSegment(I, I);
@@ -388,8 +389,7 @@
 private:
   using RangeElementType = std::pair<InstNumberT, InstNumberT>;
   /// RangeType is arena-allocated from the Cfg's allocator.
-  using RangeType =
-      std::vector<RangeElementType, CfgLocalAllocator<RangeElementType>>;
+  using RangeType = CfgVector<RangeElementType>;
   RangeType Range;
   /// TrimmedBegin is an optimization for the overlaps() computation. Since the
   /// linear-scan algorithm always calls it as overlaps(Cur) and Cur advances
@@ -556,7 +556,7 @@
   VMK_SingleDefs, /// Track uses+defs, but only record single def
   VMK_All         /// Track uses+defs, including full def list
 };
-using InstDefList = std::vector<const Inst *, CfgLocalAllocator<const Inst *>>;
+using InstDefList = CfgVector<const Inst *>;
 
 /// VariableTracking tracks the metadata for a single variable.  It is
 /// only meant to be used internally by VariablesMetadata.
@@ -652,7 +652,7 @@
 private:
   const Cfg *Func;
   MetadataKind Kind;
-  std::vector<VariableTracking> Metadata;
+  CfgVector<VariableTracking> Metadata;
   const static InstDefList NoDefinitions;
 };
 
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 304ac37..eac255a 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -166,8 +166,8 @@
 
   // Iterate across all instructions and record the begin and end of the live
   // range for each variable that is pre-colored or infinite weight.
-  std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
-  std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
+  CfgVector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
+  CfgVector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
   for (CfgNode *Node : Func->getNodes()) {
     for (Inst &Inst : Node->getInsts()) {
       if (Inst.isDeleted())
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index ec37aa0..7742ad7 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -39,8 +39,8 @@
   static constexpr size_t REGS_SIZE = 32;
 
 private:
-  using OrderedRanges = std::vector<Variable *>;
-  using UnorderedRanges = std::vector<Variable *>;
+  using OrderedRanges = CfgVector<Variable *>;
+  using UnorderedRanges = CfgVector<Variable *>;
 
   class IterationState {
     IterationState(const IterationState &) = delete;
@@ -103,7 +103,7 @@
   /// faster processing.
   OrderedRanges UnhandledPrecolored;
   UnorderedRanges Active, Inactive, Handled;
-  std::vector<InstNumberT> Kills;
+  CfgVector<InstNumberT> Kills;
   RegAllocKind Kind = RAK_Unknown;
   /// RegUses[I] is the number of live ranges (variables) that register I is
   /// currently assigned to. It can be greater than 1 as a result of
diff --git a/src/IceSwitchLowering.h b/src/IceSwitchLowering.h
index df3bef3..8ac1842 100644
--- a/src/IceSwitchLowering.h
+++ b/src/IceSwitchLowering.h
@@ -20,8 +20,7 @@
 
 class CaseCluster;
 
-using CaseClusterArray =
-    std::vector<CaseCluster, CfgLocalAllocator<CaseCluster>>;
+using CaseClusterArray = CfgVector<CaseCluster>;
 
 /// A cluster of cases can be tested by a common method during switch lowering.
 class CaseCluster {
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 47f6ae1..5d8ad3f 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -116,7 +116,6 @@
   // the document "OS X ABI Function Call Guide" by Apple.
   NeedsStackAlignment = true;
 
-  using OperandList = std::vector<Operand *>;
   OperandList XmmArgs;
   OperandList StackArgs, StackArgLocations;
   uint32_t ParameterAreaSizeBytes = 0;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index a63f470..d92ba88 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -491,8 +491,8 @@
 
 template <class Machine> void TargetX86Base<Machine>::findRMW() {
   Func->dump("Before RMW");
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->lockStr();
   for (CfgNode *Node : Func->getNodes()) {
     // Walk through the instructions, considering each sequence of 3
     // instructions, and look for the particular RMW pattern. Note that this
@@ -510,75 +510,76 @@
       assert(!I1->isDeleted());
       assert(!I2->isDeleted());
       assert(!I3->isDeleted());
-      if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) {
-        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) {
-          if (auto *Store = llvm::dyn_cast<InstStore>(I3)) {
-            // Look for:
-            //   a = Load addr
-            //   b = <op> a, other
-            //   Store b, addr
-            // Change to:
-            //   a = Load addr
-            //   b = <op> a, other
-            //   x = FakeDef
-            //   RMW <op>, addr, other, x
-            //   b = Store b, addr, x
-            // Note that inferTwoAddress() makes sure setDestNonKillable() gets
-            // called on the updated Store instruction, to avoid liveness
-            // problems later.
-            //
-            // With this transformation, the Store instruction acquires a Dest
-            // variable and is now subject to dead code elimination if there
-            // are no more uses of "b".  Variable "x" is a beacon for
-            // determining whether the Store instruction gets dead-code
-            // eliminated.  If the Store instruction is eliminated, then it
-            // must be the case that the RMW instruction ends x's live range,
-            // and therefore the RMW instruction will be retained and later
-            // lowered.  On the other hand, if the RMW instruction does not end
-            // x's live range, then the Store instruction must still be
-            // present, and therefore the RMW instruction is ignored during
-            // lowering because it is redundant with the Store instruction.
-            //
-            // Note that if "a" has further uses, the RMW transformation may
-            // still trigger, resulting in two loads and one store, which is
-            // worse than the original one load and one store.  However, this
-            // is probably rare, and caching probably keeps it just as fast.
-            if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(),
-                                                  Store->getAddr()))
-              continue;
-            Operand *ArithSrcFromLoad = Arith->getSrc(0);
-            Operand *ArithSrcOther = Arith->getSrc(1);
-            if (ArithSrcFromLoad != Load->getDest()) {
-              if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
-                continue;
-              std::swap(ArithSrcFromLoad, ArithSrcOther);
-            }
-            if (Arith->getDest() != Store->getData())
-              continue;
-            if (!canRMW(Arith))
-              continue;
-            if (Func->isVerbose(IceV_RMW)) {
-              Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
-              Load->dump(Func);
-              Str << "\n  ";
-              Arith->dump(Func);
-              Str << "\n  ";
-              Store->dump(Func);
-              Str << "\n";
-            }
-            Variable *Beacon = Func->makeVariable(IceType_i32);
-            Beacon->setMustNotHaveReg();
-            Store->setRmwBeacon(Beacon);
-            InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
-            Node->getInsts().insert(I3, BeaconDef);
-            auto *RMW = Traits::Insts::FakeRMW::create(
-                Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
-            Node->getInsts().insert(I3, RMW);
-          }
-        }
+      auto *Load = llvm::dyn_cast<InstLoad>(I1);
+      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
+      auto *Store = llvm::dyn_cast<InstStore>(I3);
+      if (!Load || !Arith || !Store)
+        continue;
+      // Look for:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   Store b, addr
+      // Change to:
+      //   a = Load addr
+      //   b = <op> a, other
+      //   x = FakeDef
+      //   RMW <op>, addr, other, x
+      //   b = Store b, addr, x
+      // Note that inferTwoAddress() makes sure setDestNonKillable() gets
+      // called on the updated Store instruction, to avoid liveness problems
+      // later.
+      //
+      // With this transformation, the Store instruction acquires a Dest
+      // variable and is now subject to dead code elimination if there are no
+      // more uses of "b".  Variable "x" is a beacon for determining whether
+      // the Store instruction gets dead-code eliminated.  If the Store
+      // instruction is eliminated, then it must be the case that the RMW
+      // instruction ends x's live range, and therefore the RMW instruction
+      // will be retained and later lowered.  On the other hand, if the RMW
+      // instruction does not end x's live range, then the Store instruction
+      // must still be present, and therefore the RMW instruction is ignored
+      // during lowering because it is redundant with the Store instruction.
+      //
+      // Note that if "a" has further uses, the RMW transformation may still
+      // trigger, resulting in two loads and one store, which is worse than the
+      // original one load and one store.  However, this is probably rare, and
+      // caching probably keeps it just as fast.
+      if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(),
+                                            Store->getAddr()))
+        continue;
+      Operand *ArithSrcFromLoad = Arith->getSrc(0);
+      Operand *ArithSrcOther = Arith->getSrc(1);
+      if (ArithSrcFromLoad != Load->getDest()) {
+        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+          continue;
+        std::swap(ArithSrcFromLoad, ArithSrcOther);
       }
+      if (Arith->getDest() != Store->getData())
+        continue;
+      if (!canRMW(Arith))
+        continue;
+      if (Func->isVerbose(IceV_RMW)) {
+        Ostream &Str = Func->getContext()->getStrDump();
+        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
+        Load->dump(Func);
+        Str << "\n  ";
+        Arith->dump(Func);
+        Str << "\n  ";
+        Store->dump(Func);
+        Str << "\n";
+      }
+      Variable *Beacon = Func->makeVariable(IceType_i32);
+      Beacon->setMustNotHaveReg();
+      Store->setRmwBeacon(Beacon);
+      InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
+      Node->getInsts().insert(I3, BeaconDef);
+      auto *RMW = Traits::Insts::FakeRMW::create(
+          Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
+      Node->getInsts().insert(I3, RMW);
     }
   }
+  if (Func->isVerbose(IceV_RMW))
+    Func->getContext()->unlockStr();
 }
 
 // Converts a ConstantInteger32 operand into its constant value, or