Improve use of CfgLocalAllocator and introduce containers that use it. This doesn't make a big difference but does reduce the proportion of time spent in malloc and free. BUG= R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1349833005 .
diff --git a/Makefile.standalone b/Makefile.standalone index 9469f81..12baf72 100644 --- a/Makefile.standalone +++ b/Makefile.standalone
@@ -121,6 +121,7 @@ endif ifdef MSAN + # TODO(ascull): this has an as yet undiagnosed uninitialized memory access OBJDIR := $(OBJDIR)+MSan CXX_EXTRA += -fsanitize=memory LD_EXTRA += -fsanitize=memory
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp index 4c703cf..02d4503 100644 --- a/src/IceCfg.cpp +++ b/src/IceCfg.cpp
@@ -314,13 +314,13 @@ void Cfg::reorderNodes() { // TODO(ascull): it would be nice if the switch tests were always followed by // the default case to allow for fall through. - using PlacedList = std::list<CfgNode *>; + using PlacedList = CfgList<CfgNode *>; PlacedList Placed; // Nodes with relative placement locked down PlacedList Unreachable; // Unreachable nodes PlacedList::iterator NoPlace = Placed.end(); // Keep track of where each node has been tentatively placed so that we can // manage insertions into the middle. - std::vector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace); + CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace); for (CfgNode *Node : Nodes) { // The "do ... while(0);" construct is to factor out the --PlaceIndex and // assert() statements before moving to the next node.
diff --git a/src/IceCfg.h b/src/IceCfg.h index 4147dd9..62a8914 100644 --- a/src/IceCfg.h +++ b/src/IceCfg.h
@@ -273,7 +273,7 @@ std::unique_ptr<Assembler> TargetAssembler; /// Globals required by this CFG. Mostly used for the profiler's globals. std::unique_ptr<VariableDeclarationList> GlobalInits; - std::vector<InstJumpTable *> JumpTables; + CfgVector<InstJumpTable *> JumpTables; /// CurrentNode is maintained during dumping/emitting just for validating /// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp index 31a6e8a..dbbfc18 100644 --- a/src/IceCfgNode.cpp +++ b/src/IceCfgNode.cpp
@@ -827,7 +827,7 @@ // Helper functions for emit(). void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node, - bool IsLiveIn, std::vector<SizeT> &LiveRegCount) { + bool IsLiveIn, CfgVector<SizeT> &LiveRegCount) { if (!BuildDefs::dump()) return; Liveness *Liveness = Func->getLiveness(); @@ -840,7 +840,7 @@ Str << "\t\t\t\t# LiveOut="; } if (!Live->empty()) { - std::vector<Variable *> LiveRegs; + CfgVector<Variable *> LiveRegs; for (SizeT i = 0; i < Live->size(); ++i) { if ((*Live)[i]) { Variable *Var = Liveness->getVariable(i, Node); @@ -869,7 +869,7 @@ } void emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr, - std::vector<SizeT> &LiveRegCount) { + CfgVector<SizeT> &LiveRegCount) { if (!BuildDefs::dump()) return; bool First = true; @@ -935,7 +935,7 @@ // each register is assigned to. Normally that would be only 0 or 1, but the // register allocator's AllowOverlap inference allows it to be greater than 1 // for short periods. - std::vector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters()); + CfgVector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters()); if (DecorateAsm) { constexpr bool IsLiveIn = true; emitRegisterUsage(Str, Func, this, IsLiveIn, LiveRegCount);
diff --git a/src/IceDefs.h b/src/IceDefs.h index a38da03..5f96d7e 100644 --- a/src/IceDefs.h +++ b/src/IceDefs.h
@@ -145,10 +145,14 @@ using PhiList = InstList; using AssignList = InstList; +// Standard library containers with CfgLocalAllocator. +template <typename T> using CfgVector = std::vector<T, CfgLocalAllocator<T>>; +template <typename T> using CfgList = std::list<T, CfgLocalAllocator<T>>; + // Containers that are arena-allocated from the Cfg's allocator. -using OperandList = std::vector<Operand *, CfgLocalAllocator<Operand *>>; -using VarList = std::vector<Variable *, CfgLocalAllocator<Variable *>>; -using NodeList = std::vector<CfgNode *, CfgLocalAllocator<CfgNode *>>; +using OperandList = CfgVector<Operand *>; +using VarList = CfgVector<Variable *>; +using NodeList = CfgVector<CfgNode *>; // Contains that use the default (global) allocator. using ConstantList = std::vector<Constant *>; @@ -168,8 +172,7 @@ /// value, giving the instruction number that begins or ends a variable's live /// range. using LiveBeginEndMapEntry = std::pair<SizeT, InstNumberT>; -using LiveBeginEndMap = - std::vector<LiveBeginEndMapEntry, CfgLocalAllocator<LiveBeginEndMapEntry>>; +using LiveBeginEndMap = CfgVector<LiveBeginEndMapEntry>; using LivenessBV = llvm::BitVector; using TimerStackIdT = uint32_t;
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp index 7b7183d..c7497da 100644 --- a/src/IceGlobalContext.cpp +++ b/src/IceGlobalContext.cpp
@@ -722,10 +722,8 @@ llvm::DeleteContainerPointers(AllThreadContexts); LockedPtr<DestructorArray> Dtors = getDestructors(); // Destructors are invoked in the opposite object construction order. - for (auto DtorIter = Dtors->crbegin(); DtorIter != Dtors->crend(); - ++DtorIter) { - (*DtorIter)(); - } + for (const auto &Dtor : reverse_range(*Dtors)) + Dtor(); } // TODO(stichnot): Consider adding thread-local caches of constant pool entries
diff --git a/src/IceInst.h b/src/IceInst.h index a727683..fcd7269 100644 --- a/src/IceInst.h +++ b/src/IceInst.h
@@ -161,10 +161,7 @@ void dumpDest(const Cfg *Func) const; virtual bool isRedundantAssign() const { return false; } - // TODO(jpp): Insts should not have non-trivial destructors, but they - // currently do. This dtor is marked final as a multi-step refactor that - // will eventually fix this problem. - virtual ~Inst() = default; + ~Inst() = default; protected: Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest);
diff --git a/src/IceLiveness.h b/src/IceLiveness.h index bd739d3..535ed02 100644 --- a/src/IceLiveness.h +++ b/src/IceLiveness.h
@@ -47,7 +47,7 @@ // LiveToVarMap maps a liveness bitvector index to a Variable. This is // generally just for printing/dumping. The index should be less than // NumLocals + Liveness::NumGlobals. - std::vector<Variable *> LiveToVarMap; + CfgVector<Variable *> LiveToVarMap; // LiveIn and LiveOut track the in- and out-liveness of the global // variables. The size of each vector is LivenessNode::NumGlobals. LivenessBV LiveIn, LiveOut; @@ -107,13 +107,13 @@ LivenessMode Mode; SizeT NumGlobals = 0; /// Size of Nodes is Cfg::Nodes.size(). - std::vector<LivenessNode> Nodes; + CfgVector<LivenessNode> Nodes; /// VarToLiveMap maps a Variable's Variable::Number to its live index within /// its basic block. - std::vector<SizeT> VarToLiveMap; + CfgVector<SizeT> VarToLiveMap; /// LiveToVarMap is analogous to LivenessNode::LiveToVarMap, but for non-local /// variables. - std::vector<Variable *> LiveToVarMap; + CfgVector<Variable *> LiveToVarMap; /// RangeMask[Variable::Number] indicates whether we want to track that /// Variable's live range. llvm::BitVector RangeMask;
diff --git a/src/IceLoopAnalyzer.h b/src/IceLoopAnalyzer.h index 19d38d1..8932e08 100644 --- a/src/IceLoopAnalyzer.h +++ b/src/IceLoopAnalyzer.h
@@ -88,9 +88,8 @@ bool Deleted = false; }; - using LoopNodeList = std::vector<LoopNode, CfgLocalAllocator<LoopNode>>; - using LoopNodePtrList = - std::vector<LoopNode *, CfgLocalAllocator<LoopNode *>>; + using LoopNodeList = CfgVector<LoopNode>; + using LoopNodePtrList = CfgVector<LoopNode *>; /// Process the node as part as part of Tarjan's algorithm and return either a /// node to recurse into or nullptr when the node has been fully processed.
diff --git a/src/IceOperand.h b/src/IceOperand.h index b4e06be..67b1d23 100644 --- a/src/IceOperand.h +++ b/src/IceOperand.h
@@ -85,12 +85,13 @@ } /// @} + ~Operand() = default; + protected: Operand(OperandKind Kind, Type Ty) : Ty(Ty), Kind(Kind) { // It is undefined behavior to have a larger value in the enum assert(Kind <= kTarget_Max); } - virtual ~Operand() = default; const Type Ty; const OperandKind Kind; @@ -354,7 +355,7 @@ LiveRange() = default; /// Special constructor for building a kill set. The advantage is that we can /// reserve the right amount of space in advance. - explicit LiveRange(const std::vector<InstNumberT> &Kills) { + explicit LiveRange(const CfgVector<InstNumberT> &Kills) { Range.reserve(Kills.size()); for (InstNumberT I : Kills) addSegment(I, I); @@ -388,8 +389,7 @@ private: using RangeElementType = std::pair<InstNumberT, InstNumberT>; /// RangeType is arena-allocated from the Cfg's allocator. - using RangeType = - std::vector<RangeElementType, CfgLocalAllocator<RangeElementType>>; + using RangeType = CfgVector<RangeElementType>; RangeType Range; /// TrimmedBegin is an optimization for the overlaps() computation. Since the /// linear-scan algorithm always calls it as overlaps(Cur) and Cur advances @@ -556,7 +556,7 @@ VMK_SingleDefs, /// Track uses+defs, but only record single def VMK_All /// Track uses+defs, including full def list }; -using InstDefList = std::vector<const Inst *, CfgLocalAllocator<const Inst *>>; +using InstDefList = CfgVector<const Inst *>; /// VariableTracking tracks the metadata for a single variable. It is /// only meant to be used internally by VariablesMetadata. @@ -652,7 +652,7 @@ private: const Cfg *Func; MetadataKind Kind; - std::vector<VariableTracking> Metadata; + CfgVector<VariableTracking> Metadata; const static InstDefList NoDefinitions; };
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp index 304ac37..eac255a 100644 --- a/src/IceRegAlloc.cpp +++ b/src/IceRegAlloc.cpp
@@ -166,8 +166,8 @@ // Iterate across all instructions and record the begin and end of the live // range for each variable that is pre-colored or infinite weight. - std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel); - std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel); + CfgVector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel); + CfgVector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel); for (CfgNode *Node : Func->getNodes()) { for (Inst &Inst : Node->getInsts()) { if (Inst.isDeleted())
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h index ec37aa0..7742ad7 100644 --- a/src/IceRegAlloc.h +++ b/src/IceRegAlloc.h
@@ -39,8 +39,8 @@ static constexpr size_t REGS_SIZE = 32; private: - using OrderedRanges = std::vector<Variable *>; - using UnorderedRanges = std::vector<Variable *>; + using OrderedRanges = CfgVector<Variable *>; + using UnorderedRanges = CfgVector<Variable *>; class IterationState { IterationState(const IterationState &) = delete; @@ -103,7 +103,7 @@ /// faster processing. OrderedRanges UnhandledPrecolored; UnorderedRanges Active, Inactive, Handled; - std::vector<InstNumberT> Kills; + CfgVector<InstNumberT> Kills; RegAllocKind Kind = RAK_Unknown; /// RegUses[I] is the number of live ranges (variables) that register I is /// currently assigned to. It can be greater than 1 as a result of
diff --git a/src/IceSwitchLowering.h b/src/IceSwitchLowering.h index df3bef3..8ac1842 100644 --- a/src/IceSwitchLowering.h +++ b/src/IceSwitchLowering.h
@@ -20,8 +20,7 @@ class CaseCluster; -using CaseClusterArray = - std::vector<CaseCluster, CfgLocalAllocator<CaseCluster>>; +using CaseClusterArray = CfgVector<CaseCluster>; /// A cluster of cases can be tested by a common method during switch lowering. class CaseCluster {
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp index 47f6ae1..5d8ad3f 100644 --- a/src/IceTargetLoweringX8632.cpp +++ b/src/IceTargetLoweringX8632.cpp
@@ -116,7 +116,6 @@ // the document "OS X ABI Function Call Guide" by Apple. NeedsStackAlignment = true; - using OperandList = std::vector<Operand *>; OperandList XmmArgs; OperandList StackArgs, StackArgLocations; uint32_t ParameterAreaSizeBytes = 0;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h index a63f470..d92ba88 100644 --- a/src/IceTargetLoweringX86BaseImpl.h +++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -491,8 +491,8 @@ template <class Machine> void TargetX86Base<Machine>::findRMW() { Func->dump("Before RMW"); - OstreamLocker L(Func->getContext()); - Ostream &Str = Func->getContext()->getStrDump(); + if (Func->isVerbose(IceV_RMW)) + Func->getContext()->lockStr(); for (CfgNode *Node : Func->getNodes()) { // Walk through the instructions, considering each sequence of 3 // instructions, and look for the particular RMW pattern. Note that this @@ -510,75 +510,76 @@ assert(!I1->isDeleted()); assert(!I2->isDeleted()); assert(!I3->isDeleted()); - if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) { - if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) { - if (auto *Store = llvm::dyn_cast<InstStore>(I3)) { - // Look for: - // a = Load addr - // b = <op> a, other - // Store b, addr - // Change to: - // a = Load addr - // b = <op> a, other - // x = FakeDef - // RMW <op>, addr, other, x - // b = Store b, addr, x - // Note that inferTwoAddress() makes sure setDestNonKillable() gets - // called on the updated Store instruction, to avoid liveness - // problems later. - // - // With this transformation, the Store instruction acquires a Dest - // variable and is now subject to dead code elimination if there - // are no more uses of "b". Variable "x" is a beacon for - // determining whether the Store instruction gets dead-code - // eliminated. If the Store instruction is eliminated, then it - // must be the case that the RMW instruction ends x's live range, - // and therefore the RMW instruction will be retained and later - // lowered. On the other hand, if the RMW instruction does not end - // x's live range, then the Store instruction must still be - // present, and therefore the RMW instruction is ignored during - // lowering because it is redundant with the Store instruction. - // - // Note that if "a" has further uses, the RMW transformation may - // still trigger, resulting in two loads and one store, which is - // worse than the original one load and one store. However, this - // is probably rare, and caching probably keeps it just as fast. - if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), - Store->getAddr())) - continue; - Operand *ArithSrcFromLoad = Arith->getSrc(0); - Operand *ArithSrcOther = Arith->getSrc(1); - if (ArithSrcFromLoad != Load->getDest()) { - if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) - continue; - std::swap(ArithSrcFromLoad, ArithSrcOther); - } - if (Arith->getDest() != Store->getData()) - continue; - if (!canRMW(Arith)) - continue; - if (Func->isVerbose(IceV_RMW)) { - Str << "Found RMW in " << Func->getFunctionName() << ":\n "; - Load->dump(Func); - Str << "\n "; - Arith->dump(Func); - Str << "\n "; - Store->dump(Func); - Str << "\n"; - } - Variable *Beacon = Func->makeVariable(IceType_i32); - Beacon->setMustNotHaveReg(); - Store->setRmwBeacon(Beacon); - InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon); - Node->getInsts().insert(I3, BeaconDef); - auto *RMW = Traits::Insts::FakeRMW::create( - Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp()); - Node->getInsts().insert(I3, RMW); - } - } + auto *Load = llvm::dyn_cast<InstLoad>(I1); + auto *Arith = llvm::dyn_cast<InstArithmetic>(I2); + auto *Store = llvm::dyn_cast<InstStore>(I3); + if (!Load || !Arith || !Store) + continue; + // Look for: + // a = Load addr + // b = <op> a, other + // Store b, addr + // Change to: + // a = Load addr + // b = <op> a, other + // x = FakeDef + // RMW <op>, addr, other, x + // b = Store b, addr, x + // Note that inferTwoAddress() makes sure setDestNonKillable() gets + // called on the updated Store instruction, to avoid liveness problems + // later. + // + // With this transformation, the Store instruction acquires a Dest + // variable and is now subject to dead code elimination if there are no + // more uses of "b". Variable "x" is a beacon for determining whether + // the Store instruction gets dead-code eliminated. If the Store + // instruction is eliminated, then it must be the case that the RMW + // instruction ends x's live range, and therefore the RMW instruction + // will be retained and later lowered. On the other hand, if the RMW + // instruction does not end x's live range, then the Store instruction + // must still be present, and therefore the RMW instruction is ignored + // during lowering because it is redundant with the Store instruction. + // + // Note that if "a" has further uses, the RMW transformation may still + // trigger, resulting in two loads and one store, which is worse than the + // original one load and one store. However, this is probably rare, and + // caching probably keeps it just as fast. + if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), + Store->getAddr())) + continue; + Operand *ArithSrcFromLoad = Arith->getSrc(0); + Operand *ArithSrcOther = Arith->getSrc(1); + if (ArithSrcFromLoad != Load->getDest()) { + if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) + continue; + std::swap(ArithSrcFromLoad, ArithSrcOther); } + if (Arith->getDest() != Store->getData()) + continue; + if (!canRMW(Arith)) + continue; + if (Func->isVerbose(IceV_RMW)) { + Ostream &Str = Func->getContext()->getStrDump(); + Str << "Found RMW in " << Func->getFunctionName() << ":\n "; + Load->dump(Func); + Str << "\n "; + Arith->dump(Func); + Str << "\n "; + Store->dump(Func); + Str << "\n"; + } + Variable *Beacon = Func->makeVariable(IceType_i32); + Beacon->setMustNotHaveReg(); + Store->setRmwBeacon(Beacon); + InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon); + Node->getInsts().insert(I3, BeaconDef); + auto *RMW = Traits::Insts::FakeRMW::create( + Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp()); + Node->getInsts().insert(I3, RMW); } } + if (Func->isVerbose(IceV_RMW)) + Func->getContext()->unlockStr(); } // Converts a ConstantInteger32 operand into its constant value, or