Subzero: Rewrite the pass timing infrastructure.
This makes it much more useful for individual analysis and long-term translation performance tracking.
1. Collect and report aggregated across the entire translation, instead of function-by-function. If you really care about a single function, just extract it and translate it separately for analysis.
2. Remove "-verbose time" and just use -timing.
3. Collects two kinds of timings: cumulative and flat. Cumulative measures the total time, even if a callee also times itself. Flat only measures the currently active timer at the top of the stack. The flat times should add up to 100%, but cumulative will usually add up to much more than 100%.
BUG= none
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/610813002
diff --git a/Makefile.standalone b/Makefile.standalone
index a913497..175dd12 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -80,9 +80,10 @@
IceMemoryRegion.cpp \
IceOperand.cpp \
IceRegAlloc.cpp \
- IceRNG.cpp \
+ IceRNG.cpp \
IceTargetLowering.cpp \
IceTargetLoweringX8632.cpp \
+ IceTimerTree.cpp \
IceTranslator.cpp \
IceTypeConverter.cpp \
IceTypes.cpp \
@@ -113,7 +114,7 @@
$(OBJDIR):
@mkdir -p $@
-check-lit: llvm2ice
+check-lit: llvm2ice make_symlink
LLVM_BIN_PATH=$(LLVM_BIN_PATH) \
$(LLVM_SRC_PATH)/utils/lit/lit.py -sv tests_lit
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 8f51f43..d2c83ab 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -69,14 +69,14 @@
void Cfg::translate() {
if (hasError())
return;
+ static TimerIdT IDtranslate = GlobalContext::getTimerID("translate");
+ TimerMarker T(IDtranslate, getContext());
dump("Initial CFG");
- Timer T_translate;
// The set of translation passes and their order are determined by
// the target.
getTarget()->translate();
- T_translate.printElapsedUs(getContext(), "translate()");
dump("Final output");
}
@@ -88,6 +88,9 @@
}
void Cfg::renumberInstructions() {
+ static TimerIdT IDrenumberInstructions =
+ GlobalContext::getTimerID("renumberInstructions");
+ TimerMarker T(IDrenumberInstructions, getContext());
NextInstNumber = 1;
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->renumberInstructions();
@@ -96,6 +99,8 @@
// placePhiLoads() must be called before placePhiStores().
void Cfg::placePhiLoads() {
+ static TimerIdT IDplacePhiLoads = GlobalContext::getTimerID("placePhiLoads");
+ TimerMarker T(IDplacePhiLoads, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->placePhiLoads();
}
@@ -103,34 +108,48 @@
// placePhiStores() must be called after placePhiLoads().
void Cfg::placePhiStores() {
+ static TimerIdT IDplacePhiStores =
+ GlobalContext::getTimerID("placePhiStores");
+ TimerMarker T(IDplacePhiStores, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->placePhiStores();
}
}
void Cfg::deletePhis() {
+ static TimerIdT IDdeletePhis = GlobalContext::getTimerID("deletePhis");
+ TimerMarker T(IDdeletePhis, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->deletePhis();
}
}
void Cfg::doArgLowering() {
+ static TimerIdT IDdoArgLowering = GlobalContext::getTimerID("doArgLowering");
+ TimerMarker T(IDdoArgLowering, getContext());
getTarget()->lowerArguments();
}
void Cfg::doAddressOpt() {
+ static TimerIdT IDdoAddressOpt = GlobalContext::getTimerID("doAddressOpt");
+ TimerMarker T(IDdoAddressOpt, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->doAddressOpt();
}
}
void Cfg::doNopInsertion() {
+ static TimerIdT IDdoNopInsertion =
+ GlobalContext::getTimerID("doNopInsertion");
+ TimerMarker T(IDdoNopInsertion, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->doNopInsertion();
}
}
void Cfg::genCode() {
+ static TimerIdT IDgenCode = GlobalContext::getTimerID("genCode");
+ TimerMarker T(IDgenCode, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->genCode();
}
@@ -138,6 +157,8 @@
// Compute the stack frame layout.
void Cfg::genFrame() {
+ static TimerIdT IDgenFrame = GlobalContext::getTimerID("genFrame");
+ TimerMarker T(IDgenFrame, getContext());
getTarget()->addProlog(Entry);
// TODO: Consider folding epilog generation into the final
// emission/assembly pass to avoid an extra iteration over the node
@@ -154,6 +175,9 @@
// completely with a single block. It is a quick single pass and
// doesn't need to iterate until convergence.
void Cfg::livenessLightweight() {
+ static TimerIdT IDlivenessLightweight =
+ GlobalContext::getTimerID("livenessLightweight");
+ TimerMarker T(IDlivenessLightweight, getContext());
getVMetadata()->init();
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->livenessLightweight();
@@ -161,6 +185,8 @@
}
void Cfg::liveness(LivenessMode Mode) {
+ static TimerIdT IDliveness = GlobalContext::getTimerID("liveness");
+ TimerMarker T(IDliveness, getContext());
Live.reset(new Liveness(this, Mode));
getVMetadata()->init();
Live->init();
@@ -199,9 +225,10 @@
// Collect timing for just the portion that constructs the live
// range intervals based on the end-of-live-range computation, for a
// finer breakdown of the cost.
- Timer T_liveRange;
// Make a final pass over instructions to delete dead instructions
// and build each Variable's live range.
+ static TimerIdT IDliveRange = GlobalContext::getTimerID("liveRange");
+ TimerMarker T1(IDliveRange, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
(*I)->livenessPostprocess(Mode, getLiveness());
}
@@ -241,7 +268,6 @@
if (Var->getWeight().isInf())
Var->setLiveRangeInfiniteWeight();
}
- T_liveRange.printElapsedUs(getContext(), "live range construction");
dump();
}
}
@@ -249,6 +275,9 @@
// Traverse every Variable of every Inst and verify that it
// appears within the Variable's computed live range.
bool Cfg::validateLiveness() const {
+ static TimerIdT IDvalidateLiveness =
+ GlobalContext::getTimerID("validateLiveness");
+ TimerMarker T(IDvalidateLiveness, getContext());
bool Valid = true;
Ostream &Str = Ctx->getStrDump();
for (NodeList::const_iterator I1 = Nodes.begin(), E1 = Nodes.end(); I1 != E1;
@@ -296,18 +325,21 @@
}
void Cfg::doBranchOpt() {
+ static TimerIdT IDdoBranchOpt = GlobalContext::getTimerID("doBranchOpt");
+ TimerMarker T(IDdoBranchOpt, getContext());
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
NodeList::iterator NextNode = I;
++NextNode;
- (*I)->doBranchOpt(*NextNode);
+ (*I)->doBranchOpt(NextNode == E ? NULL : *NextNode);
}
}
// ======================== Dump routines ======================== //
void Cfg::emit() {
+ static TimerIdT IDemit = GlobalContext::getTimerID("emit");
+ TimerMarker T(IDemit, getContext());
Ostream &Str = Ctx->getStrEmit();
- Timer T_emit;
if (!Ctx->testAndSetHasEmittedFirstMethod()) {
// Print a helpful command for assembling the output.
// TODO: have the Target emit the header
@@ -339,7 +371,6 @@
(*I)->emit(this);
}
Str << "\n";
- T_emit.printElapsedUs(Ctx, "emit()");
}
// Dumps the IR with an optional introductory message.
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index 4888e8f..9910f06 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -11,11 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "IceConverter.h"
-
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
+#include "IceConverter.h"
#include "IceDefs.h"
#include "IceGlobalContext.h"
#include "IceInst.h"
@@ -60,6 +59,9 @@
// Caller is expected to delete the returned Ice::Cfg object.
Ice::Cfg *convertFunction(const Function *F) {
+ static Ice::TimerIdT IDllvmConvert =
+ Ice::GlobalContext::getTimerID("llvmConvert");
+ Ice::TimerMarker T(IDllvmConvert, Ctx);
VarMap.clear();
NodeMap.clear();
Func = new Ice::Cfg(Ctx);
@@ -621,6 +623,8 @@
namespace Ice {
void Converter::convertToIce() {
+ static TimerIdT IDconvertToIce = GlobalContext::getTimerID("convertToIce");
+ TimerMarker T(IDconvertToIce, Ctx);
nameUnnamedGlobalAddresses(Mod);
if (!Ctx->getFlags().DisableGlobals)
convertGlobals(Mod);
@@ -635,11 +639,6 @@
Timer TConvert;
Cfg *Fcn = FunctionConverter.convertFunction(I);
- if (Ctx->getFlags().SubzeroTimingEnabled) {
- std::cerr << "[Subzero timing] Convert function "
- << Fcn->getFunctionName() << ": " << TConvert.getElapsedSec()
- << " sec\n";
- }
translateFcn(Fcn);
}
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 4aa99e2..68f30c7 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -78,6 +78,8 @@
// numbers are used for representing Variable live ranges.
typedef int32_t InstNumberT;
+typedef uint32_t TimerIdT;
+
enum LivenessMode {
// Basic version of live-range-end calculation. Marks the last uses
// of variables based on dataflow analysis. Records the set of
@@ -103,34 +105,14 @@
IceV_RegOrigins = 1 << 7,
IceV_LinearScan = 1 << 8,
IceV_Frame = 1 << 9,
- IceV_Timing = 1 << 10,
- IceV_AddrOpt = 1 << 11,
+ IceV_AddrOpt = 1 << 10,
IceV_All = ~IceV_None,
- IceV_Most = IceV_All & ~(IceV_Timing | IceV_LinearScan)
+ IceV_Most = IceV_All & ~IceV_LinearScan
};
typedef uint32_t VerboseMask;
typedef llvm::raw_ostream Ostream;
-// TODO: Implement in terms of std::chrono after switching to C++11.
-class Timer {
-public:
- Timer() : Start(llvm::TimeRecord::getCurrentTime(false)) {}
- uint64_t getElapsedNs() const { return getElapsedSec() * 1000 * 1000 * 1000; }
- uint64_t getElapsedUs() const { return getElapsedSec() * 1000 * 1000; }
- uint64_t getElapsedMs() const { return getElapsedSec() * 1000; }
- double getElapsedSec() const {
- llvm::TimeRecord End = llvm::TimeRecord::getCurrentTime(false);
- return End.getWallTime() - Start.getWallTime();
- }
- void printElapsedUs(GlobalContext *Ctx, const IceString &Tag) const;
-
-private:
- const llvm::TimeRecord Start;
- Timer(const Timer &) LLVM_DELETED_FUNCTION;
- Timer &operator=(const Timer &) LLVM_DELETED_FUNCTION;
-};
-
} // end of namespace Ice
#endif // SUBZERO_SRC_ICEDEFS_H
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 319e18a..4cca6c9 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -1,4 +1,4 @@
-//===- subzero/src/IceGlobalContext.cpp - Global context defs ---*- C++ -*-===//
+//===- subzero/src/IceGlobalContext.cpp - Global context defs -------------===//
//
// The Subzero Code Generator
//
@@ -22,6 +22,7 @@
#include "IceGlobalContext.h"
#include "IceOperand.h"
#include "IceTargetLowering.h"
+#include "IceTimerTree.h"
namespace Ice {
@@ -122,7 +123,7 @@
: StrDump(OsDump), StrEmit(OsEmit), VMask(Mask),
ConstPool(new ConstantPool()), Arch(Arch), Opt(Opt),
TestPrefix(TestPrefix), Flags(Flags), HasEmittedFirstMethod(false),
- RNG("") {}
+ RNG(""), Timers(new TimerStack("main")) {}
// Scan a string for S[0-9A-Z]*_ patterns and replace them with
// S<num>_ where <num> is the next base-36 value. If a type name
@@ -384,6 +385,14 @@
llvm_unreachable("Unknown type");
}
+TimerIdT GlobalContext::getTimerID(const IceString &Name) {
+ return TimerStack::getTimerID(Name);
+}
+
+void GlobalContext::pushTimer(TimerIdT ID) { Timers->push(ID); }
+
+void GlobalContext::popTimer(TimerIdT ID) { Timers->pop(ID); }
+
void GlobalContext::dumpStats(const IceString &Name, bool Final) {
if (Flags.DumpStats) {
if (Final) {
@@ -395,12 +404,6 @@
}
}
-void Timer::printElapsedUs(GlobalContext *Ctx, const IceString &Tag) const {
- if (Ctx->isVerbose(IceV_Timing)) {
- // Prefixing with '#' allows timing strings to be included
- // without error in textual assembly output.
- Ctx->getStrDump() << "# " << getElapsedUs() << " usec " << Tag << "\n";
- }
-}
+void GlobalContext::dumpTimers() { Timers->dump(getStrDump()); }
} // end of namespace Ice
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index f997b65..3c53773 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -69,12 +69,8 @@
// Returns true if any of the specified options in the verbose mask
// are set. If the argument is omitted, it checks if any verbose
- // options at all are set. IceV_Timing is treated specially, so
- // that running with just IceV_Timing verbosity doesn't trigger an
- // avalanche of extra output.
- bool isVerbose(VerboseMask Mask = (IceV_All & ~IceV_Timing)) const {
- return VMask & Mask;
- }
+ // options at all are set.
+ bool isVerbose(VerboseMask Mask = IceV_All) const { return VMask & Mask; }
void setVerbose(VerboseMask Mask) { VMask = Mask; }
void addVerbose(VerboseMask Mask) { VMask |= Mask; }
void subVerbose(VerboseMask Mask) { VMask &= ~Mask; }
@@ -154,6 +150,11 @@
StatsCumulative.updateFills();
}
+ static TimerIdT getTimerID(const IceString &Name);
+ void pushTimer(TimerIdT ID);
+ void popTimer(TimerIdT ID);
+ void dumpTimers();
+
private:
Ostream *StrDump; // Stream for dumping / diagnostics
Ostream *StrEmit; // Stream for code emission
@@ -170,6 +171,7 @@
RandomNumberGenerator RNG;
CodeStats StatsFunction;
CodeStats StatsCumulative;
+ llvm::OwningPtr<class TimerStack> Timers;
GlobalContext(const GlobalContext &) LLVM_DELETED_FUNCTION;
GlobalContext &operator=(const GlobalContext &) LLVM_DELETED_FUNCTION;
@@ -178,6 +180,30 @@
void incrementSubstitutions(ManglerVector &OldName) const;
};
+// Helper class to push and pop a timer marker. The constructor
+// pushes a marker, and the destructor pops it. This is for
+// convenient timing of regions of code.
+class TimerMarker {
+ TimerMarker(const TimerMarker &) LLVM_DELETED_FUNCTION;
+ TimerMarker &operator=(const TimerMarker &) LLVM_DELETED_FUNCTION;
+
+public:
+ TimerMarker(TimerIdT ID, GlobalContext *Ctx)
+ : ID(ID), Ctx(Ctx), Active(Ctx->getFlags().SubzeroTimingEnabled) {
+ if (Active)
+ Ctx->pushTimer(ID);
+ }
+ ~TimerMarker() {
+ if (Active)
+ Ctx->popTimer(ID);
+ }
+
+private:
+ TimerIdT ID;
+ GlobalContext *const Ctx;
+ const bool Active;
+};
+
} // end of namespace Ice
#endif // SUBZERO_SRC_ICEGLOBALCONTEXT_H
diff --git a/src/IceInst.h b/src/IceInst.h
index 5c50d2f..3727e6e 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -509,9 +509,9 @@
class InstLoad : public InstHighLevel {
public:
static InstLoad *create(Cfg *Func, Variable *Dest, Operand *SourceAddr,
- uint32_t align = 1) {
+ uint32_t Align = 1) {
// TODO(kschimpf) Stop ignoring alignment specification.
- (void)align;
+ (void)Align;
return new (Func->allocateInst<InstLoad>())
InstLoad(Func, Dest, SourceAddr);
}
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index d1854f4..3614725 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -122,9 +122,26 @@
}
bool LiveRange::overlaps(InstNumberT OtherBegin) const {
+ bool Result = false;
+ for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+ ++I) {
+ if (OtherBegin < I->first) {
+ Result = false;
+ break;
+ }
+ if (OtherBegin < I->second) {
+ Result = true;
+ break;
+ }
+ }
+#if 0
+ // An equivalent but less inefficient implementation:
LiveRange Temp;
Temp.addSegment(OtherBegin, OtherBegin + 1);
- return overlaps(Temp);
+ bool Validation = overlaps(Temp);
+ assert(Result == Validation);
+#endif
+ return Result;
}
// Returns true if the live range contains the given instruction
@@ -259,6 +276,8 @@
}
void VariablesMetadata::init() {
+ static TimerIdT IDvmetadata = GlobalContext::getTimerID("vmetadata");
+ TimerMarker T(IDvmetadata, Func->getContext());
Metadata.clear();
Metadata.resize(Func->getNumVariables());
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 36ada12..e072085 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -64,12 +64,15 @@
// preparation. Results are assigned to Variable::RegNum for each
// Variable.
void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
+ static TimerIdT IDscan = GlobalContext::getTimerID("linearScan");
+ TimerMarker T(IDscan, Func->getContext());
assert(RegMaskFull.any()); // Sanity check
Unhandled.clear();
Handled.clear();
Inactive.clear();
Active.clear();
Ostream &Str = Func->getContext()->getStrDump();
+ bool Verbose = Func->getContext()->isVerbose(IceV_LinearScan);
Func->resetCurrentNode();
VariablesMetadata *VMetadata = Func->getVMetadata();
@@ -81,26 +84,32 @@
// a result, it may be useful to design a better data structure for
// storing Func->getVariables().
const VarList &Vars = Func->getVariables();
- for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E; ++I) {
- Variable *Var = *I;
- // Explicitly don't consider zero-weight variables, which are
- // meant to be spill slots.
- if (Var->getWeight() == RegWeight::Zero)
- continue;
- // Don't bother if the variable has a null live range, which means
- // it was never referenced.
- if (Var->getLiveRange().isEmpty())
- continue;
- Unhandled.insert(LiveRangeWrapper(Var));
- if (Var->hasReg()) {
- Var->setRegNumTmp(Var->getRegNum());
- Var->setLiveRangeInfiniteWeight();
+ {
+ static TimerIdT IDinitUnhandled =
+ GlobalContext::getTimerID("initUnhandled");
+ TimerMarker T(IDinitUnhandled, Func->getContext());
+ for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E;
+ ++I) {
+ Variable *Var = *I;
+ // Explicitly don't consider zero-weight variables, which are
+ // meant to be spill slots.
+ if (Var->getWeight() == RegWeight::Zero)
+ continue;
+ // Don't bother if the variable has a null live range, which means
+ // it was never referenced.
+ if (Var->getLiveRange().isEmpty())
+ continue;
+ Unhandled.insert(LiveRangeWrapper(Var));
+ if (Var->hasReg()) {
+ Var->setRegNumTmp(Var->getRegNum());
+ Var->setLiveRangeInfiniteWeight();
+ }
}
}
// RegUses[I] is the number of live ranges (variables) that register
// I is currently assigned to. It can be greater than 1 as a result
- // of Variable::AllowRegisterOverlap.
+ // of AllowOverlap inference below.
std::vector<int> RegUses(RegMaskFull.size());
// Unhandled is already set to all ranges in increasing order of
// start points.
@@ -112,7 +121,7 @@
while (!Unhandled.empty()) {
LiveRangeWrapper Cur = *Unhandled.begin();
Unhandled.erase(Unhandled.begin());
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "\nConsidering ";
Cur.dump(Func);
Str << "\n";
@@ -130,7 +139,7 @@
int32_t RegNum = Cur.Var->getRegNum();
// RegNumTmp should have already been set above.
assert(Cur.Var->getRegNumTmp() == RegNum);
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Precoloring ";
Cur.dump(Func);
Str << "\n";
@@ -150,7 +159,7 @@
bool Moved = false;
if (Item.endsBefore(Cur)) {
// Move Item from Active to Handled list.
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Expiring ";
Item.dump(Func);
Str << "\n";
@@ -160,7 +169,7 @@
Moved = true;
} else if (!Item.overlapsStart(Cur)) {
// Move Item from Active to Inactive list.
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Inactivating ";
Item.dump(Func);
Str << "\n";
@@ -186,7 +195,7 @@
LiveRangeWrapper Item = *I;
if (Item.endsBefore(Cur)) {
// Move Item from Inactive to Handled list.
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Expiring ";
Item.dump(Func);
Str << "\n";
@@ -195,7 +204,7 @@
Handled.push_back(Item);
} else if (Item.overlapsStart(Cur)) {
// Move Item from Inactive to Active list.
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Reactivating ";
Item.dump(Func);
Str << "\n";
@@ -261,7 +270,7 @@
}
}
}
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
if (Prefer) {
Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
<< " LIVE=" << Prefer->getLiveRange() << " Overlap=" << AllowOverlap
@@ -279,8 +288,7 @@
int32_t RegNum = Item.Var->getRegNumTmp();
// Don't assert(Free[RegNum]) because in theory (though
// probably never in practice) there could be two inactive
- // variables that were allowed marked with
- // AllowRegisterOverlap.
+ // variables that were marked with AllowOverlap.
Free[RegNum] = false;
// Disable AllowOverlap if an Inactive variable, which is not
// Prefer, shares Prefer's register, and has a definition
@@ -331,7 +339,7 @@
}
// Print info about physical register availability.
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
for (SizeT i = 0; i < RegMask.size(); ++i) {
if (RegMask[i]) {
Str << Func->getTarget()->getRegName(i, IceType_i32)
@@ -346,7 +354,7 @@
// First choice: a preferred register that is either free or is
// allowed to overlap with its linked variable.
Cur.Var->setRegNumTmp(PreferReg);
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Preferring ";
Cur.dump(Func);
Str << "\n";
@@ -360,7 +368,7 @@
// picking the lowest-numbered available register?
int32_t RegNum = Free.find_first();
Cur.Var->setRegNumTmp(RegNum);
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Allocating ";
Cur.dump(Func);
Str << "\n";
@@ -434,7 +442,7 @@
++Next;
LiveRangeWrapper Item = *I;
if (Item.Var->getRegNumTmp() == MinWeightIndex) {
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Evicting ";
Item.dump(Func);
Str << "\n";
@@ -463,7 +471,7 @@
// instructions.
if (Item.Var->getRegNumTmp() == MinWeightIndex &&
Item.overlaps(Cur)) {
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Evicting ";
Item.dump(Func);
Str << "\n";
@@ -478,7 +486,7 @@
assert(RegUses[MinWeightIndex] >= 0);
++RegUses[MinWeightIndex];
Active.push_back(Cur);
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
Str << "Allocating ";
Cur.dump(Func);
Str << "\n";
@@ -509,7 +517,7 @@
I != E; ++I) {
LiveRangeWrapper Item = *I;
int32_t RegNum = Item.Var->getRegNumTmp();
- if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+ if (Verbose) {
if (!Item.Var->hasRegTmp()) {
Str << "Not assigning ";
Item.Var->dump(Func);
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index f663155..76d3473 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -229,6 +229,8 @@
// registers could potentially be parameterized if we want to restrict
// registers e.g. for performance testing.
void TargetLowering::regAlloc() {
+ static TimerIdT IDregAlloc = GlobalContext::getTimerID("regAlloc");
+ TimerMarker T(IDregAlloc, Ctx);
LinearScan LinearScan(Func);
RegSetMask RegInclude = RegSet_None;
RegSetMask RegExclude = RegSet_None;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 96cc57b..b9d3170 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -302,35 +302,27 @@
void TargetX8632::translateO2() {
GlobalContext *Context = Func->getContext();
+ static TimerIdT IDO2 = GlobalContext::getTimerID("O2");
+ TimerMarker T(IDO2, Context);
// Lower Phi instructions.
- Timer T_placePhiLoads;
Func->placePhiLoads();
if (Func->hasError())
return;
- T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
- Timer T_placePhiStores;
Func->placePhiStores();
if (Func->hasError())
return;
- T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
- Timer T_deletePhis;
Func->deletePhis();
if (Func->hasError())
return;
- T_deletePhis.printElapsedUs(Context, "deletePhis()");
Func->dump("After Phi lowering");
// Address mode optimization.
- Timer T_doAddressOpt;
Func->getVMetadata()->init();
Func->doAddressOpt();
- T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
// Argument lowering
- Timer T_argLowering;
Func->doArgLowering();
- T_argLowering.printElapsedUs(Context, "lowerArguments()");
// Target lowering. This requires liveness analysis for some parts
// of the lowering decisions, such as compare/branch fusing. If
@@ -338,40 +330,30 @@
// to be renumbered first. TODO: This renumbering should only be
// necessary if we're actually calculating live intervals, which we
// only do for register allocation.
- Timer T_renumber1;
Func->renumberInstructions();
if (Func->hasError())
return;
- T_renumber1.printElapsedUs(Context, "renumberInstructions()");
// TODO: It should be sufficient to use the fastest liveness
// calculation, i.e. livenessLightweight(). However, for some
// reason that slows down the rest of the translation. Investigate.
- Timer T_liveness1;
Func->liveness(Liveness_Basic);
if (Func->hasError())
return;
- T_liveness1.printElapsedUs(Context, "liveness()");
Func->dump("After x86 address mode opt");
- Timer T_genCode;
Func->genCode();
if (Func->hasError())
return;
- T_genCode.printElapsedUs(Context, "genCode()");
// Register allocation. This requires instruction renumbering and
// full liveness analysis.
- Timer T_renumber2;
Func->renumberInstructions();
if (Func->hasError())
return;
- T_renumber2.printElapsedUs(Context, "renumberInstructions()");
- Timer T_liveness2;
Func->liveness(Liveness_Intervals);
if (Func->hasError())
return;
- T_liveness2.printElapsedUs(Context, "liveness()");
// Validate the live range computations. Do it outside the timing
// code. TODO: Put this under a flag.
bool ValidLiveness = Func->validateLiveness();
@@ -381,20 +363,16 @@
// The post-codegen dump is done here, after liveness analysis and
// associated cleanup, to make the dump cleaner and more useful.
Func->dump("After initial x8632 codegen");
- Timer T_regAlloc;
Func->getVMetadata()->init();
regAlloc();
if (Func->hasError())
return;
- T_regAlloc.printElapsedUs(Context, "regAlloc()");
Func->dump("After linear scan regalloc");
// Stack frame mapping.
- Timer T_genFrame;
Func->genFrame();
if (Func->hasError())
return;
- T_genFrame.printElapsedUs(Context, "genFrame()");
Func->dump("After stack frame mapping");
// Branch optimization. This needs to be done just before code
@@ -413,39 +391,29 @@
void TargetX8632::translateOm1() {
GlobalContext *Context = Func->getContext();
- Timer T_placePhiLoads;
+ static TimerIdT IDOm1 = GlobalContext::getTimerID("Om1");
+ TimerMarker T(IDOm1, Context);
Func->placePhiLoads();
if (Func->hasError())
return;
- T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
- Timer T_placePhiStores;
Func->placePhiStores();
if (Func->hasError())
return;
- T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
- Timer T_deletePhis;
Func->deletePhis();
if (Func->hasError())
return;
- T_deletePhis.printElapsedUs(Context, "deletePhis()");
Func->dump("After Phi lowering");
- Timer T_argLowering;
Func->doArgLowering();
- T_argLowering.printElapsedUs(Context, "lowerArguments()");
- Timer T_genCode;
Func->genCode();
if (Func->hasError())
return;
- T_genCode.printElapsedUs(Context, "genCode()");
Func->dump("After initial x8632 codegen");
- Timer T_genFrame;
Func->genFrame();
if (Func->hasError())
return;
- T_genFrame.printElapsedUs(Context, "genFrame()");
Func->dump("After stack frame mapping");
// Nop insertion
diff --git a/src/IceTimerTree.cpp b/src/IceTimerTree.cpp
new file mode 100644
index 0000000..4bdb80a
--- /dev/null
+++ b/src/IceTimerTree.cpp
@@ -0,0 +1,142 @@
+//===- subzero/src/IceTimerTree.cpp - Pass timer defs ---------------------===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TimerTree class, which tracks flat and
+// cumulative execution time collection of call chains.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceDefs.h"
+#include "IceTimerTree.h"
+
+namespace Ice {
+
+std::vector<IceString> TimerStack::IDs;
+
+TimerStack::TimerStack(const IceString &TopLevelName)
+ : FirstTimestamp(timestamp()), LastTimestamp(FirstTimestamp),
+ StateChangeCount(0), StackTop(0) {
+ Nodes.resize(1); // Reserve Nodes[0] for the root node.
+ push(getTimerID(TopLevelName));
+}
+
+// Returns the unique timer ID for the given Name, creating a new ID
+// if needed. For performance reasons, it's best to make only one
+// call per Name and cache the result, e.g. via a static initializer.
+TimerIdT TimerStack::getTimerID(const IceString &Name) {
+ TimerIdT Size = IDs.size();
+ for (TimerIdT i = 0; i < Size; ++i) {
+ if (IDs[i] == Name)
+ return i;
+ }
+ IDs.push_back(Name);
+ return Size;
+}
+
+// Pushes a new marker onto the timer stack.
+void TimerStack::push(TimerIdT ID) {
+ update();
+ if (Nodes[StackTop].Children.size() <= ID)
+ Nodes[StackTop].Children.resize(ID + 1);
+ if (Nodes[StackTop].Children[ID] == 0) {
+ TTindex Size = Nodes.size();
+ Nodes[StackTop].Children[ID] = Size;
+ Nodes.resize(Size + 1);
+ Nodes[Size].Parent = StackTop;
+ Nodes[Size].Interior = ID;
+ }
+ StackTop = Nodes[StackTop].Children[ID];
+}
+
+// Pop the top marker from the timer stack. Validates via assert()
+// that the expected marker is popped.
+void TimerStack::pop(TimerIdT ID) {
+ update();
+ assert(StackTop);
+ assert(Nodes[StackTop].Parent < StackTop);
+ // Verify that the expected ID is being popped.
+ assert(Nodes[StackTop].Interior == ID);
+ (void)ID;
+ // Verify that the parent's child points to the current stack top.
+ assert(Nodes[Nodes[StackTop].Parent].Children[ID] == StackTop);
+ StackTop = Nodes[StackTop].Parent;
+}
+
+// At a state change (e.g. push or pop), updates the flat and
+// cumulative timings for everything on the timer stack.
+void TimerStack::update() {
+ ++StateChangeCount;
+ // Whenever the stack is about to change, we grab the time delta
+ // since the last change and add it to all active cumulative
+ // elements and to the flat element for the top of the stack.
+ double Current = timestamp();
+ double Delta = Current - LastTimestamp;
+ LastTimestamp = Current;
+ if (StackTop) {
+ TimerIdT Leaf = Nodes[StackTop].Interior;
+ if (Leaf >= LeafTimes.size())
+ LeafTimes.resize(Leaf + 1);
+ LeafTimes[Leaf] += Delta;
+ }
+ TTindex Prefix = StackTop;
+ while (Prefix) {
+ Nodes[Prefix].Time += Delta;
+ TTindex Next = Nodes[Prefix].Parent;
+ assert(Next < Prefix);
+ Prefix = Next;
+ }
+}
+
+namespace {
+
+typedef std::multimap<double, IceString> DumpMapType;
+
+// Dump the Map items in reverse order of their time contribution.
+void dumpHelper(Ostream &Str, const DumpMapType &Map, double TotalTime) {
+ for (DumpMapType::const_reverse_iterator I = Map.rbegin(), E = Map.rend();
+ I != E; ++I) {
+ char buf[80];
+ snprintf(buf, llvm::array_lengthof(buf), " %10.6f (%4.1f%%): ", I->first,
+ I->first * 100 / TotalTime);
+ Str << buf << I->second << "\n";
+ }
+}
+
+} // end of anonymous namespace
+
+void TimerStack::dump(Ostream &Str) {
+ update();
+ double TotalTime = LastTimestamp - FirstTimestamp;
+ assert(TotalTime);
+ Str << "Cumulative function times:\n";
+ DumpMapType CumulativeMap;
+ for (TTindex i = 1; i < Nodes.size(); ++i) {
+ TTindex Prefix = i;
+ IceString Suffix = "";
+ while (Prefix) {
+ if (Suffix.empty())
+ Suffix = IDs[Nodes[Prefix].Interior];
+ else
+ Suffix = IDs[Nodes[Prefix].Interior] + "." + Suffix;
+ assert(Nodes[Prefix].Parent < Prefix);
+ Prefix = Nodes[Prefix].Parent;
+ }
+ CumulativeMap.insert(std::make_pair(Nodes[i].Time, Suffix));
+ }
+ dumpHelper(Str, CumulativeMap, TotalTime);
+ Str << "Flat function times:\n";
+ DumpMapType FlatMap;
+ for (TimerIdT i = 0; i < LeafTimes.size(); ++i) {
+ FlatMap.insert(std::make_pair(LeafTimes[i], IDs[i]));
+ }
+ dumpHelper(Str, FlatMap, TotalTime);
+ Str << "Number of timer updates: " << StateChangeCount << "\n";
+}
+
+} // end of namespace Ice
diff --git a/src/IceTimerTree.h b/src/IceTimerTree.h
new file mode 100644
index 0000000..3162587
--- /dev/null
+++ b/src/IceTimerTree.h
@@ -0,0 +1,68 @@
+//===- subzero/src/IceTimerTree.h - Pass timer defs -------------*- C++ -*-===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TimerTree class, which allows flat and
+// cumulative execution time collection of call chains.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETIMERTREE_H
+#define SUBZERO_SRC_ICETIMERTREE_H
+
+namespace Ice {
+
+class TimerTreeNode;
+
+// Timer tree index type
+typedef std::vector<TimerTreeNode>::size_type TTindex;
+
+// TimerTreeNode represents an interior or leaf node in the call tree.
+// It contains a list of children, a pointer to its parent, and the
+// timer ID for the node. It also holds the cumulative time spent at
+// this node and below. The children are always at a higher index in
+// the TimerTreeNode::Nodes array, and the parent is always at a lower
+// index.
+class TimerTreeNode {
+public:
+ TimerTreeNode() : Parent(0), Interior(0), Time(0) {}
+ std::vector<TTindex> Children; // indexed by TimerIdT
+ TTindex Parent;
+ TimerIdT Interior;
+ double Time;
+};
+
+class TimerStack {
+ TimerStack(const TimerStack &) LLVM_DELETED_FUNCTION;
+ TimerStack &operator=(const TimerStack &) LLVM_DELETED_FUNCTION;
+
+public:
+ TimerStack(const IceString &TopLevelName);
+ static TimerIdT getTimerID(const IceString &Name);
+ void push(TimerIdT ID);
+ void pop(TimerIdT ID);
+ void dump(Ostream &Str);
+
+private:
+ void update();
+ static double timestamp() {
+ // TODO: Implement in terms of std::chrono for C++11.
+ return llvm::TimeRecord::getCurrentTime(false).getWallTime();
+ }
+ const double FirstTimestamp;
+ double LastTimestamp;
+ uint64_t StateChangeCount;
+ static std::vector<IceString> IDs; // indexed by TimerIdT
+ std::vector<TimerTreeNode> Nodes; // indexed by TTindex
+ std::vector<double> LeafTimes; // indexed by TimerIdT
+ TTindex StackTop;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETIMERTREE_H
diff --git a/src/IceTranslator.cpp b/src/IceTranslator.cpp
index e0c7e2d..91dcfd4 100644
--- a/src/IceTranslator.cpp
+++ b/src/IceTranslator.cpp
@@ -12,12 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#include "IceTranslator.h"
-
#include "IceCfg.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceTargetLowering.h"
+#include "IceTranslator.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -80,24 +79,13 @@
if (Ctx->getFlags().DisableTranslation) {
Func->dump();
} else {
- Timer TTranslate;
Func->translate();
- if (Ctx->getFlags().SubzeroTimingEnabled) {
- std::cerr << "[Subzero timing] Translate function "
- << Func->getFunctionName() << ": " << TTranslate.getElapsedSec()
- << " sec\n";
- }
if (Func->hasError()) {
std::cerr << "ICE translation error: " << Func->getError() << "\n";
ErrorStatus = true;
}
- Timer TEmit;
Func->emit();
- if (Ctx->getFlags().SubzeroTimingEnabled) {
- std::cerr << "[Subzero timing] Emit function " << Func->getFunctionName()
- << ": " << TEmit.getElapsedSec() << " sec\n";
- }
Ctx->dumpStats(Func->getFunctionName());
}
}
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index 47f6a5e..e1925df 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "PNaClTranslator.h"
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
@@ -20,6 +19,7 @@
#include "IceInst.h"
#include "IceOperand.h"
#include "IceTypeConverter.h"
+#include "PNaClTranslator.h"
#include "llvm/Analysis/NaCl/PNaClABIProps.h"
#include "llvm/Bitcode/NaCl/NaClBitcodeDecoders.h"
#include "llvm/Bitcode/NaCl/NaClBitcodeHeader.h"
@@ -852,7 +852,7 @@
}
}
- ~FunctionParser() override;
+ ~FunctionParser() override {};
// Set the next constant ID to the given constant C.
void setNextConstantID(Ice::Constant *C) {
@@ -860,8 +860,6 @@
}
private:
- // Timer for reading function bitcode and converting to ICE.
- Ice::Timer TConvert;
// The corresponding ICE function defined by the function block.
Ice::Cfg *Func;
// The index to the current basic block being built.
@@ -1376,13 +1374,6 @@
}
};
-FunctionParser::~FunctionParser() {
- if (getFlags().SubzeroTimingEnabled) {
- errs() << "[Subzero timing] Convert function " << Func->getFunctionName()
- << ": " << TConvert.getElapsedSec() << " sec\n";
- }
-}
-
void FunctionParser::ReportInvalidBinopOpcode(unsigned Opcode, Ice::Type Ty) {
std::string Buffer;
raw_string_ostream StrBuf(Buffer);
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index 515e2fe..7f1fc38 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -44,7 +44,6 @@
clEnumValN(Ice::IceV_RegOrigins, "orig", "Physical register origins"),
clEnumValN(Ice::IceV_LinearScan, "regalloc", "Linear scan details"),
clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
- clEnumValN(Ice::IceV_Timing, "time", "Pass timing details"),
clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
clEnumValN(Ice::IceV_Most, "most",
@@ -175,6 +174,8 @@
Ice::GlobalContext Ctx(Ls, Os, VMask, TargetArch, OptLevel, TestPrefix,
Flags);
+ static Ice::TimerIdT IDszmain = Ice::GlobalContext::getTimerID("szmain");
+ Ice::TimerMarker T(IDszmain, &Ctx);
int ErrorStatus = 0;
if (BuildOnRead) {
@@ -184,15 +185,11 @@
} else {
// Parse the input LLVM IR file into a module.
SMDiagnostic Err;
- Ice::Timer T;
+ static Ice::TimerIdT IDparse = Ice::GlobalContext::getTimerID("parse");
+ Ice::TimerMarker T1(IDparse, &Ctx);
Module *Mod =
NaClParseIRFile(IRFilename, InputFileFormat, Err, getGlobalContext());
- if (SubzeroTimingEnabled) {
- std::cerr << "[Subzero timing] IR Parsing: " << T.getElapsedSec()
- << " sec\n";
- }
-
if (!Mod) {
Err.print(argv[0], errs());
return 1;
@@ -202,6 +199,8 @@
Converter.convertToIce();
ErrorStatus = Converter.getErrorStatus();
}
+ if (SubzeroTimingEnabled)
+ Ctx.dumpTimers();
const bool FinalStats = true;
Ctx.dumpStats("_FINAL_", FinalStats);
return ErrorStatus;