Subzero: Simplify the FakeKill instruction.

Even after earlier simplifications, FakeKill was still handled somewhat inefficiently for the register allocator.  For x86-32, any function containing call instructions would result in about 11 pre-colored Variables, each with an identical and relatively complex live range consisting of points.  They would start out on the UnhandledPrecolored list, then all move to the Inactive list, where they would be repeatedly compared against each register allocation candidate via overlapsRange().

We improve this by keeping around a single copy of that live range and directly masking out the Free[] register set when that live range overlaps the current candidate's live range.  This saves ~10 overlaps() calculations per candidate while FakeKills are still pending.

Also, slightly rearrange the initialization of the Unhandled etc. sets into a separate init routine, which will make it easier to reuse the register allocator in other situations such as Om1 post-lowering.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/720343003
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index ce7ea49..e41f114 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -661,18 +661,6 @@
       FirstInstNum = I->getNumber();
     assert(I->getNumber() > LastInstNum);
     LastInstNum = I->getNumber();
-    // Create fake live ranges for a Kill instruction, but only if the
-    // linked instruction is still alive.
-    if (Mode == Liveness_Intervals) {
-      if (InstFakeKill *Kill = llvm::dyn_cast<InstFakeKill>(I)) {
-        if (!Kill->getLinked()->isDeleted()) {
-          for (Variable *Var : Kill->getKilledRegs()) {
-            InstNumberT InstNumber = I->getNumber();
-            Var->addLiveRange(InstNumber, InstNumber, 1);
-          }
-        }
-      }
-    }
   }
   if (Mode != Liveness_Intervals)
     return;
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 090faba..5030f5f 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -114,8 +114,6 @@
 
 void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) {
   assert(!isDeleted());
-  if (llvm::isa<InstFakeKill>(this))
-    return;
   resetLastUses();
   VariablesMetadata *VMetadata = Func->getVMetadata();
   SizeT VarIndex = 0;
@@ -139,8 +137,6 @@
                     Liveness *Liveness, LiveBeginEndMap *LiveBegin,
                     LiveBeginEndMap *LiveEnd) {
   assert(!isDeleted());
-  if (llvm::isa<InstFakeKill>(this))
-    return true;
 
   Dead = false;
   if (Dest) {
@@ -181,15 +177,13 @@
           // twice.  ICE only allows a variable to have a single
           // liveness interval in a basic block (except for blocks
           // where a variable is live-in and live-out but there is a
-          // gap in the middle, and except for the special
-          // InstFakeKill instruction that can appear multiple
-          // times in the same block).  Therefore, this lowered
-          // sequence needs to represent a single conservative live
-          // range for t.  Since the instructions are being traversed
-          // backwards, we make sure LiveEnd is only set once by
-          // setting it only when LiveEnd[VarNum]==0 (sentinel value).
-          // Note that it's OK to set LiveBegin multiple times because
-          // of the backwards traversal.
+          // gap in the middle).  Therefore, this lowered sequence
+          // needs to represent a single conservative live range for
+          // t.  Since the instructions are being traversed backwards,
+          // we make sure LiveEnd is only set once by setting it only
+          // when LiveEnd[VarNum]==0 (sentinel value).  Note that it's
+          // OK to set LiveBegin multiple times because of the
+          // backwards traversal.
           if (LiveEnd) {
             // Ideally, we would verify that VarNum wasn't already
             // added in this block, but this can't be done very
@@ -452,10 +446,8 @@
   addSource(Src);
 }
 
-InstFakeKill::InstFakeKill(Cfg *Func, const VarList &KilledRegs,
-                           const Inst *Linked)
-    : InstHighLevel(Func, Inst::FakeKill, 0, NULL), KilledRegs(KilledRegs),
-      Linked(Linked) {}
+InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked)
+    : InstHighLevel(Func, Inst::FakeKill, 0, NULL), Linked(Linked) {}
 
 // ======================== Dump routines ======================== //
 
@@ -757,14 +749,7 @@
   Ostream &Str = Func->getContext()->getStrDump();
   if (Linked->isDeleted())
     Str << "// ";
-  Str << "kill.pseudo ";
-  bool First = true;
-  for (Variable *Var : KilledRegs) {
-    if (!First)
-      Str << ", ";
-    First = false;
-    Var->dump(Func);
-  }
+  Str << "kill.pseudo scratch_regs";
 }
 
 void InstTarget::dump(const Cfg *Func) const {
diff --git a/src/IceInst.h b/src/IceInst.h
index 71da0b3..5d5101b 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -782,11 +782,12 @@
   ~InstFakeUse() override {}
 };
 
-// FakeKill instruction.  This "kills" a set of variables by adding a
-// trivial live range at this instruction to each variable.  The
-// primary use is to indicate that scratch registers are killed after
-// a call, so that the register allocator won't assign a scratch
-// register to a variable whose live range spans a call.
+// FakeKill instruction.  This "kills" a set of variables by modeling
+// a trivial live range at this instruction for each (implicit)
+// variable.  The primary use is to indicate that scratch registers
+// are killed after a call, so that the register allocator won't
+// assign a scratch register to a variable whose live range spans a
+// call.
 //
 // The FakeKill instruction also holds a pointer to the instruction
 // that kills the set of variables, so that if that linked instruction
@@ -796,12 +797,9 @@
   InstFakeKill &operator=(const InstFakeKill &) = delete;
 
 public:
-  static InstFakeKill *create(Cfg *Func, const VarList &KilledRegs,
-                              const Inst *Linked) {
-    return new (Func->allocateInst<InstFakeKill>())
-        InstFakeKill(Func, KilledRegs, Linked);
+  static InstFakeKill *create(Cfg *Func, const Inst *Linked) {
+    return new (Func->allocateInst<InstFakeKill>()) InstFakeKill(Func, Linked);
   }
-  const VarList &getKilledRegs() const { return KilledRegs; }
   const Inst *getLinked() const { return Linked; }
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg * /* Func */) const override {}
@@ -809,10 +807,9 @@
   static bool classof(const Inst *Inst) { return Inst->getKind() == FakeKill; }
 
 private:
-  InstFakeKill(Cfg *Func, const VarList &KilledRegs, const Inst *Linked);
+  InstFakeKill(Cfg *Func, const Inst *Linked);
   ~InstFakeKill() override {}
 
-  const VarList &KilledRegs;
   // This instruction is ignored if Linked->isDeleted() is true.
   const Inst *Linked;
 };
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 7870d180..b2af752 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -37,8 +37,6 @@
 }
 
 void LiveRange::addSegment(InstNumberT Start, InstNumberT End) {
-  if (End > Start)
-    IsNonpoints = true;
 #ifdef USE_SET
   RangeElementType Element(Start, End);
   RangeType::iterator Next = Range.lower_bound(Element);
@@ -125,8 +123,6 @@
 }
 
 bool LiveRange::overlapsInst(InstNumberT OtherBegin, bool UseTrimmed) const {
-  if (!IsNonpoints)
-    return false;
   bool Result = false;
   for (auto I = (UseTrimmed ? TrimmedBegin : Range.begin()), E = Range.end();
        I != E; ++I) {
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 5e0c02d..c37b195 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -309,17 +309,15 @@
 // weight, in case e.g. we want a live range to have higher weight
 // inside a loop.
 class LiveRange {
-  // LiveRange(const LiveRange &) = delete;
-  // LiveRange &operator=(const LiveRange &) = delete;
-
 public:
-  LiveRange() : Weight(0), IsNonpoints(false) {}
+  LiveRange() : Weight(0) {}
+  LiveRange(const LiveRange &) = default;
+  LiveRange &operator=(const LiveRange &) = default;
 
   void reset() {
     Range.clear();
     Weight.setWeight(0);
     untrim();
-    IsNonpoints = false;
   }
   void addSegment(InstNumberT Start, InstNumberT End);
 
@@ -328,7 +326,6 @@
   bool overlapsInst(InstNumberT OtherBegin, bool UseTrimmed = false) const;
   bool containsValue(InstNumberT Value, bool IsDest) const;
   bool isEmpty() const { return Range.empty(); }
-  bool isNonpoints() const { return IsNonpoints; }
   InstNumberT getStart() const {
     return Range.empty() ? -1 : Range.begin()->first;
   }
@@ -365,11 +362,6 @@
   // that linear-scan also has to initialize TrimmedBegin at the
   // beginning by calling untrim().
   RangeType::const_iterator TrimmedBegin;
-  // IsNonpoints keeps track of whether the live range contains at
-  // least one interval where Start!=End.  If it is empty or has the
-  // form [x,x),[y,y),...,[z,z), then overlaps(InstNumberT) is
-  // trivially false.
-  bool IsNonpoints;
 };
 
 Ostream &operator<<(Ostream &Str, const LiveRange &L);
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 1b4d0c2..80a9432 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "IceCfg.h"
+#include "IceCfgNode.h"
 #include "IceInst.h"
 #include "IceOperand.h"
 #include "IceRegAlloc.h"
@@ -72,6 +73,63 @@
 
 } // end of anonymous namespace
 
+void LinearScan::initForGlobalAlloc() {
+  TimerMarker T(TimerStack::TT_initUnhandled, Func);
+  Unhandled.clear();
+  UnhandledPrecolored.clear();
+  Handled.clear();
+  Inactive.clear();
+  Active.clear();
+  // Gather the live ranges of all variables and add them to the
+  // Unhandled set.
+  const VarList &Vars = Func->getVariables();
+  Unhandled.reserve(Vars.size());
+  for (Variable *Var : Vars) {
+    // Explicitly don't consider zero-weight variables, which are
+    // meant to be spill slots.
+    if (Var->getWeight() == RegWeight::Zero)
+      continue;
+    // Don't bother if the variable has a null live range, which means
+    // it was never referenced.
+    if (Var->getLiveRange().isEmpty())
+      continue;
+    Var->untrimLiveRange();
+    Unhandled.push_back(Var);
+    if (Var->hasReg()) {
+      Var->setRegNumTmp(Var->getRegNum());
+      Var->setLiveRangeInfiniteWeight();
+      UnhandledPrecolored.push_back(Var);
+    }
+  }
+  struct CompareRanges {
+    bool operator()(const Variable *L, const Variable *R) {
+      InstNumberT Lstart = L->getLiveRange().getStart();
+      InstNumberT Rstart = R->getLiveRange().getStart();
+      if (Lstart == Rstart)
+        return L->getIndex() < R->getIndex();
+      return Lstart < Rstart;
+    }
+  };
+  // Do a reverse sort so that erasing elements (from the end) is fast.
+  std::sort(Unhandled.rbegin(), Unhandled.rend(), CompareRanges());
+  std::sort(UnhandledPrecolored.rbegin(), UnhandledPrecolored.rend(),
+            CompareRanges());
+
+  // Build the (ordered) list of FakeKill instruction numbers.
+  Kills.clear();
+  for (CfgNode *Node : Func->getNodes()) {
+    for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E;
+         ++I) {
+      if (I->isDeleted())
+        continue;
+      if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) {
+        if (!Kill->getLinked()->isDeleted())
+          Kills.push_back(I->getNumber());
+      }
+    }
+  }
+}
+
 // Implements the linear-scan algorithm.  Based on "Linear Scan
 // Register Allocation in the Context of SSA Form and Register
 // Constraints" by Hanspeter Mössenböck and Michael Pfeiffer,
@@ -86,53 +144,16 @@
 void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
   TimerMarker T(TimerStack::TT_linearScan, Func);
   assert(RegMaskFull.any()); // Sanity check
-  Unhandled.clear();
-  UnhandledPrecolored.clear();
-  Handled.clear();
-  Inactive.clear();
-  Active.clear();
   Ostream &Str = Func->getContext()->getStrDump();
   bool Verbose = Func->getContext()->isVerbose(IceV_LinearScan);
   Func->resetCurrentNode();
   VariablesMetadata *VMetadata = Func->getVMetadata();
 
-  // Gather the live ranges of all variables and add them to the
-  // Unhandled set.
-  const VarList &Vars = Func->getVariables();
-  {
-    TimerMarker T(TimerStack::TT_initUnhandled, Func);
-    Unhandled.reserve(Vars.size());
-    for (Variable *Var : Vars) {
-      // Explicitly don't consider zero-weight variables, which are
-      // meant to be spill slots.
-      if (Var->getWeight() == RegWeight::Zero)
-        continue;
-      // Don't bother if the variable has a null live range, which means
-      // it was never referenced.
-      if (Var->getLiveRange().isEmpty())
-        continue;
-      Var->untrimLiveRange();
-      Unhandled.push_back(Var);
-      if (Var->hasReg()) {
-        Var->setRegNumTmp(Var->getRegNum());
-        Var->setLiveRangeInfiniteWeight();
-        UnhandledPrecolored.push_back(Var);
-      }
-    }
-    struct CompareRanges {
-      bool operator()(const Variable *L, const Variable *R) {
-        InstNumberT Lstart = L->getLiveRange().getStart();
-        InstNumberT Rstart = R->getLiveRange().getStart();
-        if (Lstart == Rstart)
-          return L->getIndex() < R->getIndex();
-        return Lstart < Rstart;
-      }
-    };
-    // Do a reverse sort so that erasing elements (from the end) is fast.
-    std::sort(Unhandled.rbegin(), Unhandled.rend(), CompareRanges());
-    std::sort(UnhandledPrecolored.rbegin(), UnhandledPrecolored.rend(),
-              CompareRanges());
-  }
+  // Build a LiveRange representing the Kills list.
+  LiveRange KillsRange;
+  for (InstNumberT I : Kills)
+    KillsRange.addSegment(I, I);
+  KillsRange.untrim();
 
   // RegUses[I] is the number of live ranges (variables) that register
   // I is currently assigned to.  It can be greater than 1 as a result
@@ -144,6 +165,11 @@
   assert(Inactive.empty());
   assert(Handled.empty());
   UnorderedRanges::iterator Next;
+  const TargetLowering::RegSetMask RegsInclude =
+      TargetLowering::RegSet_CallerSave;
+  const TargetLowering::RegSetMask RegsExclude = TargetLowering::RegSet_None;
+  const llvm::SmallBitVector KillsMask =
+      Func->getTarget()->getRegisterSet(RegsInclude, RegsExclude);
 
   while (!Unhandled.empty()) {
     Variable *Cur = Unhandled.back();
@@ -155,6 +181,7 @@
     }
     const llvm::SmallBitVector RegMask =
         RegMaskFull & Func->getTarget()->getRegisterSetForType(Cur->getType());
+    KillsRange.trim(Cur->getLiveRange().getStart());
 
     // Check for precolored ranges.  If Cur is precolored, it
     // definitely gets that register.  Previously processed live
@@ -220,15 +247,6 @@
       ++Next;
       Variable *Item = *I;
       Item->trimLiveRange(Cur->getLiveRange().getStart());
-      // As an optimization, don't bother checking pure point-valued
-      // Inactive ranges, because the overlapsStart() test will never
-      // succeed, and the rangeEndsBefore() test will generally only
-      // succeed after the last call instruction, which statistically
-      // happens near the end.  TODO(stichnot): Consider suppressing
-      // this check every N iterations in case calls are only at the
-      // beginning of the function.
-      if (!Item->getLiveRange().isNonpoints())
-        continue;
       if (Item->rangeEndsBefore(Cur)) {
         // Move Item from Inactive to Handled list.
         if (Verbose) {
@@ -374,6 +392,19 @@
       }
     }
 
+    // Remove scratch registers from the Free[] list, and mark their
+    // Weights[] as infinite, if KillsRange overlaps Cur's live range.
+    const bool UseTrimmed = true;
+    if (Cur->getLiveRange().overlaps(KillsRange, UseTrimmed)) {
+      Free.reset(KillsMask);
+      for (int i = KillsMask.find_first(); i != -1;
+           i = KillsMask.find_next(i)) {
+        Weights[i].setWeight(RegWeight::Inf);
+        if (PreferReg == i)
+          AllowOverlap = false;
+      }
+    }
+
     // Print info about physical register availability.
     if (Verbose) {
       for (SizeT i = 0; i < RegMask.size(); ++i) {
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index 0742cc1..f409e7a 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -27,6 +27,7 @@
 
 public:
   LinearScan(Cfg *Func) : Func(Func) {}
+  void initForGlobalAlloc();
   void scan(const llvm::SmallBitVector &RegMask);
   void dump(Cfg *Func) const;
 
@@ -39,6 +40,7 @@
   // for faster processing.
   OrderedRanges UnhandledPrecolored;
   UnorderedRanges Active, Inactive, Handled;
+  std::vector<InstNumberT> Kills;
 };
 
 } // end of namespace Ice
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 5661ad9..c8f504e 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -234,6 +234,7 @@
   RegInclude |= RegSet_CalleeSave;
   if (hasFramePointer())
     RegExclude |= RegSet_FramePointer;
+  LinearScan.initForGlobalAlloc();
   llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
   LinearScan.scan(RegMask);
 }
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 4afa98f..6c3b624 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -316,8 +316,6 @@
 void TargetX8632::translateO2() {
   TimerMarker T(TimerStack::TT_O2, Func);
 
-  initFakeKilledScratchRegisters();
-
   if (!Ctx->getFlags().PhiEdgeSplit) {
     // Lower Phi instructions.
     Func->placePhiLoads();
@@ -413,8 +411,6 @@
 void TargetX8632::translateOm1() {
   TimerMarker T(TimerStack::TT_Om1, Func);
 
-  initFakeKilledScratchRegisters();
-
   Func->placePhiLoads();
   if (Func->hasError())
     return;
@@ -1894,9 +1890,7 @@
   }
 
   // Insert a register-kill pseudo instruction.
-  assert(!FakeKilledScratchRegisters.empty());
-  Context.insert(
-      InstFakeKill::create(Func, FakeKilledScratchRegisters, NewCall));
+  Context.insert(InstFakeKill::create(Func, NewCall));
 
   // Generate a FakeUse to keep the call live if necessary.
   if (Instr->hasSideEffects() && ReturnReg) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 3acd4d9..74ba287 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -483,20 +483,10 @@
   llvm::SmallBitVector RegsUsed;
   SizeT NextLabelNumber;
   VarList PhysicalRegisters[IceType_NUM];
-  VarList FakeKilledScratchRegisters;
   static IceString RegNames[];
 
 private:
   ~TargetX8632() override {}
-  // Ideally, this initialization would be done in the constructor,
-  // but we need to defer it until after the initial CFG is built,
-  // because some of the bitcode reader tests rely on the order that
-  // Variables are created and their default printable names.
-  void initFakeKilledScratchRegisters() {
-    for (SizeT I = 0; I < ScratchRegs.size(); ++I)
-      if (ScratchRegs[I])
-        FakeKilledScratchRegisters.push_back(getPhysicalRegister(I));
-  }
   template <typename T> void emitConstantPool() const;
 };