Improve use of CfgLocalAllocator and introduce containers that use it.
This doesn't make a big difference overall, but it does reduce the
proportion of time spent in malloc and free.
BUG=
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1349833005 .
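
For context, the new aliases bind the standard containers to the Cfg-local
arena allocator, so a pass's scratch containers stop hitting the global heap.
A minimal usage sketch (somePass below is hypothetical, not part of this
change):

    // From IceDefs.h in this change:
    template <typename T> using CfgVector = std::vector<T, CfgLocalAllocator<T>>;
    template <typename T> using CfgList = std::list<T, CfgLocalAllocator<T>>;

    // Hypothetical pass body showing the intended use:
    void somePass(Cfg *Func) {
      CfgVector<CfgNode *> Worklist; // backed by the Cfg's arena
      Worklist.reserve(Func->getNodes().size());
      for (CfgNode *Node : Func->getNodes())
        Worklist.push_back(Node);
      // No per-element frees: the storage is reclaimed when the Cfg's arena
      // is destroyed.
    }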
diff --git a/Makefile.standalone b/Makefile.standalone
index 9469f81..12baf72 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -121,6 +121,7 @@
endif
ifdef MSAN
+ # TODO(ascull): this has an as-yet-undiagnosed uninitialized memory access
OBJDIR := $(OBJDIR)+MSan
CXX_EXTRA += -fsanitize=memory
LD_EXTRA += -fsanitize=memory
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 4c703cf..02d4503 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -314,13 +314,13 @@
void Cfg::reorderNodes() {
// TODO(ascull): it would be nice if the switch tests were always followed by
// the default case to allow for fall-through.
- using PlacedList = std::list<CfgNode *>;
+ using PlacedList = CfgList<CfgNode *>;
PlacedList Placed; // Nodes with relative placement locked down
PlacedList Unreachable; // Unreachable nodes
PlacedList::iterator NoPlace = Placed.end();
// Keep track of where each node has been tentatively placed so that we can
// manage insertions into the middle.
- std::vector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
+ CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
for (CfgNode *Node : Nodes) {
// The "do ... while(0);" construct is to factor out the --PlaceIndex and
// assert() statements before moving to the next node.
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 4147dd9..62a8914 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -273,7 +273,7 @@
std::unique_ptr<Assembler> TargetAssembler;
/// Globals required by this CFG. Mostly used for the profiler's globals.
std::unique_ptr<VariableDeclarationList> GlobalInits;
- std::vector<InstJumpTable *> JumpTables;
+ CfgVector<InstJumpTable *> JumpTables;
/// CurrentNode is maintained during dumping/emitting just for validating
/// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 31a6e8a..dbbfc18 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -827,7 +827,7 @@
// Helper functions for emit().
void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node,
- bool IsLiveIn, std::vector<SizeT> &LiveRegCount) {
+ bool IsLiveIn, CfgVector<SizeT> &LiveRegCount) {
if (!BuildDefs::dump())
return;
Liveness *Liveness = Func->getLiveness();
@@ -840,7 +840,7 @@
Str << "\t\t\t\t# LiveOut=";
}
if (!Live->empty()) {
- std::vector<Variable *> LiveRegs;
+ CfgVector<Variable *> LiveRegs;
for (SizeT i = 0; i < Live->size(); ++i) {
if ((*Live)[i]) {
Variable *Var = Liveness->getVariable(i, Node);
@@ -869,7 +869,7 @@
}
void emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
- std::vector<SizeT> &LiveRegCount) {
+ CfgVector<SizeT> &LiveRegCount) {
if (!BuildDefs::dump())
return;
bool First = true;
@@ -935,7 +935,7 @@
// each register is assigned to. Normally that would be only 0 or 1, but the
// register allocator's AllowOverlap inference allows it to be greater than 1
// for short periods.
- std::vector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
+ CfgVector<SizeT> LiveRegCount(Func->getTarget()->getNumRegisters());
if (DecorateAsm) {
constexpr bool IsLiveIn = true;
emitRegisterUsage(Str, Func, this, IsLiveIn, LiveRegCount);
diff --git a/src/IceDefs.h b/src/IceDefs.h
index a38da03..5f96d7e 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -145,10 +145,14 @@
using PhiList = InstList;
using AssignList = InstList;
+// Standard library containers with CfgLocalAllocator.
+template <typename T> using CfgVector = std::vector<T, CfgLocalAllocator<T>>;
+template <typename T> using CfgList = std::list<T, CfgLocalAllocator<T>>;
+
// Containers that are arena-allocated from the Cfg's allocator.
-using OperandList = std::vector<Operand *, CfgLocalAllocator<Operand *>>;
-using VarList = std::vector<Variable *, CfgLocalAllocator<Variable *>>;
-using NodeList = std::vector<CfgNode *, CfgLocalAllocator<CfgNode *>>;
+using OperandList = CfgVector<Operand *>;
+using VarList = CfgVector<Variable *>;
+using NodeList = CfgVector<CfgNode *>;
// Containers that use the default (global) allocator.
using ConstantList = std::vector<Constant *>;
@@ -168,8 +172,7 @@
/// value, giving the instruction number that begins or ends a variable's live
/// range.
using LiveBeginEndMapEntry = std::pair<SizeT, InstNumberT>;
-using LiveBeginEndMap =
- std::vector<LiveBeginEndMapEntry, CfgLocalAllocator<LiveBeginEndMapEntry>>;
+using LiveBeginEndMap = CfgVector<LiveBeginEndMapEntry>;
using LivenessBV = llvm::BitVector;
using TimerStackIdT = uint32_t;
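
CfgLocalAllocator itself is unchanged by this diff. For readers unfamiliar
with it, the following sketch shows the general shape of an STL-compatible
arena allocator; it is an illustration only, and currentArena() is a
placeholder for however the real allocator reaches the active Cfg's arena:

    // Sketch only; see IceDefs.h for the real CfgLocalAllocator.
    template <typename T> struct ArenaAllocatorSketch {
      using value_type = T;
      ArenaAllocatorSketch() = default;
      template <typename U>
      ArenaAllocatorSketch(const ArenaAllocatorSketch<U> &) {}
      T *allocate(std::size_t N) {
        // Bump-pointer allocation out of the arena (placeholder call).
        return static_cast<T *>(
            currentArena()->Allocate(N * sizeof(T), alignof(T)));
      }
      void deallocate(T *, std::size_t) {
        // Deliberately a no-op: arena memory is released all at once when
        // the Cfg is destroyed, which is what takes free() out of the
        // profile.
      }
    };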
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 7b7183d..c7497da 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -722,10 +722,8 @@
llvm::DeleteContainerPointers(AllThreadContexts);
LockedPtr<DestructorArray> Dtors = getDestructors();
// Destructors are invoked in the opposite object construction order.
- for (auto DtorIter = Dtors->crbegin(); DtorIter != Dtors->crend();
- ++DtorIter) {
- (*DtorIter)();
- }
+ for (const auto &Dtor : reverse_range(*Dtors))
+ Dtor();
}
// TODO(stichnot): Consider adding thread-local caches of constant pool entries
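
The reverse_range adapter used above is assumed to be defined elsewhere in
the tree; its general shape is just a wrapper over rbegin()/rend(), roughly
(sketch, not the tree's exact definition):

    template <typename T> struct ReverseWrapper { T &Container; };
    template <typename T>
    auto begin(ReverseWrapper<T> W) -> decltype(W.Container.rbegin()) {
      return W.Container.rbegin();
    }
    template <typename T>
    auto end(ReverseWrapper<T> W) -> decltype(W.Container.rend()) {
      return W.Container.rend();
    }
    template <typename T> ReverseWrapper<T> reverse_range(T &Container) {
      return {Container};
    }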
diff --git a/src/IceInst.h b/src/IceInst.h
index a727683..fcd7269 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -161,10 +161,7 @@
void dumpDest(const Cfg *Func) const;
virtual bool isRedundantAssign() const { return false; }
- // TODO(jpp): Insts should not have non-trivial destructors, but they
- // currently do. This dtor is marked final as a multi-step refactor that
- // will eventually fix this problem.
- virtual ~Inst() = default;
+ ~Inst() = default;
protected:
Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest);
diff --git a/src/IceLiveness.h b/src/IceLiveness.h
index bd739d3..535ed02 100644
--- a/src/IceLiveness.h
+++ b/src/IceLiveness.h
@@ -47,7 +47,7 @@
// LiveToVarMap maps a liveness bitvector index to a Variable. This is
// generally just for printing/dumping. The index should be less than
// NumLocals + Liveness::NumGlobals.
- std::vector<Variable *> LiveToVarMap;
+ CfgVector<Variable *> LiveToVarMap;
// LiveIn and LiveOut track the in- and out-liveness of the global
// variables. The size of each vector is LivenessNode::NumGlobals.
LivenessBV LiveIn, LiveOut;
@@ -107,13 +107,13 @@
LivenessMode Mode;
SizeT NumGlobals = 0;
/// Size of Nodes is Cfg::Nodes.size().
- std::vector<LivenessNode> Nodes;
+ CfgVector<LivenessNode> Nodes;
/// VarToLiveMap maps a Variable's Variable::Number to its live index within
/// its basic block.
- std::vector<SizeT> VarToLiveMap;
+ CfgVector<SizeT> VarToLiveMap;
/// LiveToVarMap is analogous to LivenessNode::LiveToVarMap, but for non-local
/// variables.
- std::vector<Variable *> LiveToVarMap;
+ CfgVector<Variable *> LiveToVarMap;
/// RangeMask[Variable::Number] indicates whether we want to track that
/// Variable's live range.
llvm::BitVector RangeMask;
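
The two maps above are intended to be mutually inverse over the liveness
index space. A sketch of that invariant as a check (the helper is
hypothetical, and getIndex() is assumed to be the usual accessor for
Variable::Number):

    bool mappingConsistent(const CfgVector<SizeT> &VarToLiveMap,
                           const CfgVector<Variable *> &LiveToVarMap) {
      for (SizeT LiveIndex = 0; LiveIndex < LiveToVarMap.size(); ++LiveIndex) {
        // Each live index should map back to the variable that owns it.
        Variable *Var = LiveToVarMap[LiveIndex];
        if (Var && VarToLiveMap[Var->getIndex()] != LiveIndex)
          return false;
      }
      return true;
    }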
diff --git a/src/IceLoopAnalyzer.h b/src/IceLoopAnalyzer.h
index 19d38d1..8932e08 100644
--- a/src/IceLoopAnalyzer.h
+++ b/src/IceLoopAnalyzer.h
@@ -88,9 +88,8 @@
bool Deleted = false;
};
- using LoopNodeList = std::vector<LoopNode, CfgLocalAllocator<LoopNode>>;
- using LoopNodePtrList =
- std::vector<LoopNode *, CfgLocalAllocator<LoopNode *>>;
+ using LoopNodeList = CfgVector<LoopNode>;
+ using LoopNodePtrList = CfgVector<LoopNode *>;
/// Process the node as part of Tarjan's algorithm and return either a
/// node to recurse into or nullptr when the node has been fully processed.
diff --git a/src/IceOperand.h b/src/IceOperand.h
index b4e06be..67b1d23 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -85,12 +85,13 @@
}
/// @}
+ ~Operand() = default;
+
protected:
Operand(OperandKind Kind, Type Ty) : Ty(Ty), Kind(Kind) {
// It is undefined behavior for Kind to have a value larger than kTarget_Max.
assert(Kind <= kTarget_Max);
}
- virtual ~Operand() = default;
const Type Ty;
const OperandKind Kind;
@@ -354,7 +355,7 @@
LiveRange() = default;
/// Special constructor for building a kill set. The advantage is that we can
/// reserve the right amount of space in advance.
- explicit LiveRange(const std::vector<InstNumberT> &Kills) {
+ explicit LiveRange(const CfgVector<InstNumberT> &Kills) {
Range.reserve(Kills.size());
for (InstNumberT I : Kills)
addSegment(I, I);
@@ -388,8 +389,7 @@
private:
using RangeElementType = std::pair<InstNumberT, InstNumberT>;
/// RangeType is arena-allocated from the Cfg's allocator.
- using RangeType =
- std::vector<RangeElementType, CfgLocalAllocator<RangeElementType>>;
+ using RangeType = CfgVector<RangeElementType>;
RangeType Range;
/// TrimmedBegin is an optimization for the overlaps() computation. Since the
/// linear-scan algorithm always calls it as overlaps(Cur) and Cur advances
@@ -556,7 +556,7 @@
VMK_SingleDefs, /// Track uses+defs, but only record single def
VMK_All /// Track uses+defs, including full def list
};
-using InstDefList = std::vector<const Inst *, CfgLocalAllocator<const Inst *>>;
+using InstDefList = CfgVector<const Inst *>;
/// VariableTracking tracks the metadata for a single variable. It is
/// only meant to be used internally by VariablesMetadata.
@@ -652,7 +652,7 @@
private:
const Cfg *Func;
MetadataKind Kind;
- std::vector<VariableTracking> Metadata;
+ CfgVector<VariableTracking> Metadata;
const static InstDefList NoDefinitions;
};
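
One note on the reserve() in the LiveRange constructor above: it matters
more than usual under an arena allocator, because if deallocate() is a no-op
(as is typical for arenas) every vector regrowth strands the old buffer until
the Cfg dies. A small usage sketch (instruction numbers are illustrative):

    CfgVector<InstNumberT> Kills;
    Kills.push_back(10); // instruction numbers that end the live range
    Kills.push_back(25);
    // Reserves Kills.size() segments up front, then adds one degenerate
    // [I, I] segment per kill point.
    LiveRange KillRange(Kills);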
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 304ac37..eac255a 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -166,8 +166,8 @@
// Iterate across all instructions and record the begin and end of the live
// range for each variable that is pre-colored or has infinite weight.
- std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
- std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
+ CfgVector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
+ CfgVector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
for (CfgNode *Node : Func->getNodes()) {
for (Inst &Inst : Node->getInsts()) {
if (Inst.isDeleted())
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index ec37aa0..7742ad7 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -39,8 +39,8 @@
static constexpr size_t REGS_SIZE = 32;
private:
- using OrderedRanges = std::vector<Variable *>;
- using UnorderedRanges = std::vector<Variable *>;
+ using OrderedRanges = CfgVector<Variable *>;
+ using UnorderedRanges = CfgVector<Variable *>;
class IterationState {
IterationState(const IterationState &) = delete;
@@ -103,7 +103,7 @@
/// faster processing.
OrderedRanges UnhandledPrecolored;
UnorderedRanges Active, Inactive, Handled;
- std::vector<InstNumberT> Kills;
+ CfgVector<InstNumberT> Kills;
RegAllocKind Kind = RAK_Unknown;
/// RegUses[I] is the number of live ranges (variables) that register I is
/// currently assigned to. It can be greater than 1 as a result of
diff --git a/src/IceSwitchLowering.h b/src/IceSwitchLowering.h
index df3bef3..8ac1842 100644
--- a/src/IceSwitchLowering.h
+++ b/src/IceSwitchLowering.h
@@ -20,8 +20,7 @@
class CaseCluster;
-using CaseClusterArray =
- std::vector<CaseCluster, CfgLocalAllocator<CaseCluster>>;
+using CaseClusterArray = CfgVector<CaseCluster>;
/// A cluster of cases can be tested by a common method during switch lowering.
class CaseCluster {
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 47f6ae1..5d8ad3f 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -116,7 +116,6 @@
// the document "OS X ABI Function Call Guide" by Apple.
NeedsStackAlignment = true;
- using OperandList = std::vector<Operand *>;
OperandList XmmArgs;
OperandList StackArgs, StackArgLocations;
uint32_t ParameterAreaSizeBytes = 0;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index a63f470..d92ba88 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -491,8 +491,8 @@
template <class Machine> void TargetX86Base<Machine>::findRMW() {
Func->dump("Before RMW");
- OstreamLocker L(Func->getContext());
- Ostream &Str = Func->getContext()->getStrDump();
+ if (Func->isVerbose(IceV_RMW))
+ Func->getContext()->lockStr();
for (CfgNode *Node : Func->getNodes()) {
// Walk through the instructions, considering each sequence of 3
// instructions, and look for the particular RMW pattern. Note that this
@@ -510,75 +510,76 @@
assert(!I1->isDeleted());
assert(!I2->isDeleted());
assert(!I3->isDeleted());
- if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) {
- if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) {
- if (auto *Store = llvm::dyn_cast<InstStore>(I3)) {
- // Look for:
- // a = Load addr
- // b = <op> a, other
- // Store b, addr
- // Change to:
- // a = Load addr
- // b = <op> a, other
- // x = FakeDef
- // RMW <op>, addr, other, x
- // b = Store b, addr, x
- // Note that inferTwoAddress() makes sure setDestNonKillable() gets
- // called on the updated Store instruction, to avoid liveness
- // problems later.
- //
- // With this transformation, the Store instruction acquires a Dest
- // variable and is now subject to dead code elimination if there
- // are no more uses of "b". Variable "x" is a beacon for
- // determining whether the Store instruction gets dead-code
- // eliminated. If the Store instruction is eliminated, then it
- // must be the case that the RMW instruction ends x's live range,
- // and therefore the RMW instruction will be retained and later
- // lowered. On the other hand, if the RMW instruction does not end
- // x's live range, then the Store instruction must still be
- // present, and therefore the RMW instruction is ignored during
- // lowering because it is redundant with the Store instruction.
- //
- // Note that if "a" has further uses, the RMW transformation may
- // still trigger, resulting in two loads and one store, which is
- // worse than the original one load and one store. However, this
- // is probably rare, and caching probably keeps it just as fast.
- if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(),
- Store->getAddr()))
- continue;
- Operand *ArithSrcFromLoad = Arith->getSrc(0);
- Operand *ArithSrcOther = Arith->getSrc(1);
- if (ArithSrcFromLoad != Load->getDest()) {
- if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
- continue;
- std::swap(ArithSrcFromLoad, ArithSrcOther);
- }
- if (Arith->getDest() != Store->getData())
- continue;
- if (!canRMW(Arith))
- continue;
- if (Func->isVerbose(IceV_RMW)) {
- Str << "Found RMW in " << Func->getFunctionName() << ":\n ";
- Load->dump(Func);
- Str << "\n ";
- Arith->dump(Func);
- Str << "\n ";
- Store->dump(Func);
- Str << "\n";
- }
- Variable *Beacon = Func->makeVariable(IceType_i32);
- Beacon->setMustNotHaveReg();
- Store->setRmwBeacon(Beacon);
- InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
- Node->getInsts().insert(I3, BeaconDef);
- auto *RMW = Traits::Insts::FakeRMW::create(
- Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
- Node->getInsts().insert(I3, RMW);
- }
- }
+ auto *Load = llvm::dyn_cast<InstLoad>(I1);
+ auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
+ auto *Store = llvm::dyn_cast<InstStore>(I3);
+ if (!Load || !Arith || !Store)
+ continue;
+ // Look for:
+ // a = Load addr
+ // b = <op> a, other
+ // Store b, addr
+ // Change to:
+ // a = Load addr
+ // b = <op> a, other
+ // x = FakeDef
+ // RMW <op>, addr, other, x
+ // b = Store b, addr, x
+ // Note that inferTwoAddress() makes sure setDestNonKillable() gets
+ // called on the updated Store instruction, to avoid liveness problems
+ // later.
+ //
+ // With this transformation, the Store instruction acquires a Dest
+ // variable and is now subject to dead code elimination if there are no
+ // more uses of "b". Variable "x" is a beacon for determining whether
+ // the Store instruction gets dead-code eliminated. If the Store
+ // instruction is eliminated, then it must be the case that the RMW
+ // instruction ends x's live range, and therefore the RMW instruction
+ // will be retained and later lowered. On the other hand, if the RMW
+ // instruction does not end x's live range, then the Store instruction
+ // must still be present, and therefore the RMW instruction is ignored
+ // during lowering because it is redundant with the Store instruction.
+ //
+ // Note that if "a" has further uses, the RMW transformation may still
+ // trigger, resulting in two loads and one store, which is worse than the
+ // original one load and one store. However, this is probably rare, and
+ // caching probably keeps it just as fast.
+ if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(),
+ Store->getAddr()))
+ continue;
+ Operand *ArithSrcFromLoad = Arith->getSrc(0);
+ Operand *ArithSrcOther = Arith->getSrc(1);
+ if (ArithSrcFromLoad != Load->getDest()) {
+ if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+ continue;
+ std::swap(ArithSrcFromLoad, ArithSrcOther);
}
+ if (Arith->getDest() != Store->getData())
+ continue;
+ if (!canRMW(Arith))
+ continue;
+ if (Func->isVerbose(IceV_RMW)) {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Str << "Found RMW in " << Func->getFunctionName() << ":\n ";
+ Load->dump(Func);
+ Str << "\n ";
+ Arith->dump(Func);
+ Str << "\n ";
+ Store->dump(Func);
+ Str << "\n";
+ }
+ Variable *Beacon = Func->makeVariable(IceType_i32);
+ Beacon->setMustNotHaveReg();
+ Store->setRmwBeacon(Beacon);
+ InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
+ Node->getInsts().insert(I3, BeaconDef);
+ auto *RMW = Traits::Insts::FakeRMW::create(
+ Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
+ Node->getInsts().insert(I3, RMW);
}
}
+ if (Func->isVerbose(IceV_RMW))
+ Func->getContext()->unlockStr();
}
// Converts a ConstantInteger32 operand into its constant value, or
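
At the machine level, the pattern findRMW() recognizes is the classic
read-modify-write folding; in C terms (illustrative only, not Subzero's IR
or exact output):

    // The recognized sequence:
    //   a = Load addr        ->  int a = *addr;
    //   b = <op> a, other    ->  int b = a + other;
    //   Store b, addr        ->  *addr = b;
    // If the Store is later dead-code eliminated, the surviving RMW
    // instruction can lower to a single memory-operand form, e.g. on x86:
    //   add DWORD PTR [addr], other
    void rmwShape(int *addr, int other) { *addr += other; }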