Subzero: Fix the simple register allocation for -Om1.
Background: After lowering each high-level ICE instruction, Om1 calls
postLower() to do simple register allocation. It only assigns
registers where absolutely necessary, specifically for infinite-weight
variables, while honoring pre-coloring decisions. The original Om1
register allocation never tried to reuse registers within a lowered
sequence, which was generally OK except for very long lowering
sequences, such as call instructions or some intrinsics. In these
cases, when it ran out of physical registers, it would just reset the
free list and hope for the best, but with no guarantee of correctness.
The fix involves keeping track of which instruction in the lowered
sequence holds the last use of each variable, and releasing each
register back to the free list after its last use. This makes much
better use of registers. The result is still not necessarily optimal,
particularly with respect to pre-colored variables: their registers
are black-listed for the entire lowered sequence, even when they
don't actually interfere with any infinite-weight variable.
BUG= none
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/483453002
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index d36ad16..0423e39 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -22,6 +22,7 @@
#include "IceOperand.h"
#include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include <strings.h>
@@ -4164,7 +4165,7 @@
return;
// TODO: Avoid recomputing WhiteList every instruction.
RegSetMask RegInclude = RegSet_All;
- RegSetMask RegExclude = RegSet_None;
+ RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
llvm::SmallBitVector WhiteList = getRegisterSet(RegInclude, RegExclude);
@@ -4172,11 +4173,25 @@
// there was some prior register allocation pass that made register
// assignments, those registers need to be black-listed here as
// well.
+ llvm::DenseMap<const Variable *, const Inst *> LastUses;
+ // The first pass also keeps track of which instruction is the last
+ // use for each infinite-weight variable. After the last use, the
+ // variable is released to the free list.
for (InstList::iterator I = Context.getCur(), E = Context.getEnd(); I != E;
++I) {
const Inst *Inst = *I;
if (Inst->isDeleted())
continue;
+ // Don't consider a FakeKill instruction, because (currently) it
+ // is only used to kill all scratch registers at a call site, and
+ // we don't want to black-list all scratch registers during the
+ // call lowering. This could become a problem since it relies on
+ // the lowering sequence not keeping any infinite-weight variables
+ // live across a call. TODO(stichnot): Consider replacing this
+ // whole postLower() implementation with a robust local register
+ // allocator, for example compute live ranges only for pre-colored
+ // and infinite-weight variables and run the existing linear-scan
+ // allocator.
if (llvm::isa<InstFakeKill>(Inst))
continue;
for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
@@ -4184,6 +4199,9 @@
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
const Variable *Var = Src->getVar(J);
+ // Track last uses of all variables, regardless of whether
+ // they are pre-colored or infinite-weight.
+ LastUses[Var] = Inst;
if (!Var->hasReg())
continue;
WhiteList[Var->getRegNum()] = false;
@@ -4192,36 +4210,58 @@
}
// The second pass colors infinite-weight variables.
llvm::SmallBitVector AvailableRegisters = WhiteList;
+ llvm::SmallBitVector FreedRegisters(WhiteList.size());
for (InstList::iterator I = Context.getCur(), E = Context.getEnd(); I != E;
++I) {
+ FreedRegisters.reset();
const Inst *Inst = *I;
if (Inst->isDeleted())
continue;
- for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
- Operand *Src = Inst->getSrc(SrcNum);
+ // Skip FakeKill instructions like above.
+ if (llvm::isa<InstFakeKill>(Inst))
+ continue;
+ // Iterate over all variables referenced in the instruction,
+ // including the Dest variable (if any). If the variable is
+ // marked as infinite-weight, find it a register. If this
+ // instruction is the last use of the variable in the lowered
+ // sequence, release the register to the free list after this
+ // instruction is completely processed. Note that the first pass
+ // ignores the Dest operand, under the assumption that a
+ // pre-colored Dest will appear as a source operand in some
+ // subsequent instruction in the lowered sequence.
+ Variable *Dest = Inst->getDest();
+ SizeT NumSrcs = Inst->getSrcSize();
+ if (Dest)
+ ++NumSrcs;
+ OperandList Srcs(NumSrcs);
+ for (SizeT i = 0; i < Inst->getSrcSize(); ++i)
+ Srcs[i] = Inst->getSrc(i);
+ if (Dest)
+ Srcs[NumSrcs - 1] = Dest;
+ for (SizeT SrcNum = 0; SrcNum < NumSrcs; ++SrcNum) {
+ Operand *Src = Srcs[SrcNum];
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
Variable *Var = Src->getVar(J);
- if (Var->hasReg())
- continue;
- if (!Var->getWeight().isInf())
- continue;
- llvm::SmallBitVector AvailableTypedRegisters =
- AvailableRegisters & getRegisterSetForType(Var->getType());
- if (!AvailableTypedRegisters.any()) {
- // This is a hack in case we run out of physical registers due
- // to an excessively long code sequence, as might happen when
- // lowering arguments in lowerCall().
- AvailableRegisters = WhiteList;
- AvailableTypedRegisters =
+ if (!Var->hasReg() && Var->getWeight().isInf()) {
+ llvm::SmallBitVector AvailableTypedRegisters =
AvailableRegisters & getRegisterSetForType(Var->getType());
+ assert(AvailableTypedRegisters.any());
+ int32_t RegNum = AvailableTypedRegisters.find_first();
+ Var->setRegNum(RegNum);
+ AvailableRegisters[RegNum] = false;
}
- assert(AvailableTypedRegisters.any());
- int32_t RegNum = AvailableTypedRegisters.find_first();
- Var->setRegNum(RegNum);
- AvailableRegisters[RegNum] = false;
+ if (Var->hasReg()) {
+ int32_t RegNum = Var->getRegNum();
+ assert(!AvailableRegisters[RegNum]);
+ if (LastUses[Var] == Inst) {
+ if (WhiteList[RegNum])
+ FreedRegisters[RegNum] = true;
+ }
+ }
}
}
+ AvailableRegisters |= FreedRegisters;
}
}
diff --git a/tests_lit/llvm2ice_tests/ebp_args.ll b/tests_lit/llvm2ice_tests/ebp_args.ll
index 6a33b2b..8360bf9 100644
--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -21,24 +21,23 @@
; and stack slot assignment code, and may need to be relaxed if the
; lowering code changes.
-; CHECK: memcpy_helper:
-; CHECK: push ebx
-; CHECK: push ebp
-; CHECK: mov ebp, esp
-; CHECK: sub esp, 20
-; CHECK: mov eax, dword ptr [ebp+16]
-; CHECK: mov dword ptr [ebp-4], eax
-; CHECK: sub esp, 128
-; CHECK: mov dword ptr [ebp-8], esp
-; CHECK: mov eax, dword ptr [ebp-8]
-; CHECK: mov dword ptr [ebp-12], eax
-; CHECK: movzx eax, byte ptr [ebp-4]
-; CHECK: mov dword ptr [ebp-16], eax
-; CHECK: sub esp, 16
-; CHECK: mov ecx, dword ptr [ebp+12]
-; CHECK: mov dword ptr [esp], ecx
-; CHECK: mov edx, dword ptr [ebp-12]
-; CHECK: mov dword ptr [esp+4], edx
-; CHECK: mov ebx, dword ptr [ebp-16]
-; CHECK: mov dword ptr [esp+8], ebx
-; CHECK: call memcpy_helper2
+; CHECK-LABEL: memcpy_helper:
+; CHECK: push ebp
+; CHECK: mov ebp, esp
+; CHECK: sub esp, 24
+; CHECK: mov eax, dword ptr [ebp+12]
+; CHECK: mov dword ptr [ebp-4], eax
+; CHECK: sub esp, 128
+; CHECK: mov dword ptr [ebp-8], esp
+; CHECK: mov eax, dword ptr [ebp-8]
+; CHECK: mov dword ptr [ebp-12], eax
+; CHECK: movzx eax, byte ptr [ebp-4]
+; CHECK: mov dword ptr [ebp-16], eax
+; CHECK: sub esp, 16
+; CHECK: mov ecx, dword ptr [ebp+8]
+; CHECK: mov dword ptr [esp], ecx
+; CHECK: mov ecx, dword ptr [ebp-12]
+; CHECK: mov dword ptr [esp+4], ecx
+; CHECK: mov ecx, dword ptr [ebp-16]
+; CHECK: mov dword ptr [esp+8], ecx
+; CHECK: call memcpy_helper2
diff --git a/tests_lit/llvm2ice_tests/nop-insertion.ll b/tests_lit/llvm2ice_tests/nop-insertion.ll
index 8c0d242..1300832 100644
--- a/tests_lit/llvm2ice_tests/nop-insertion.ll
+++ b/tests_lit/llvm2ice_tests/nop-insertion.ll
@@ -26,9 +26,9 @@
; PROB50: pmuludq xmm1, xmm2
; PROB50: nop # variant = 0
; PROB50: shufps xmm0, xmm1, 136
-; PROB50: pshufd xmm3, xmm0, 216
+; PROB50: pshufd xmm1, xmm0, 216
; PROB50: nop # variant = 2
-; PROB50: movups xmmword ptr [esp], xmm3
+; PROB50: movups xmmword ptr [esp], xmm1
; PROB50: movups xmm0, xmmword ptr [esp]
; PROB50: add esp, 60
; PROB50: nop # variant = 0
@@ -54,9 +54,9 @@
; PROB90: nop # variant = 3
; PROB90: shufps xmm0, xmm1, 136
; PROB90: nop # variant = 4
-; PROB90: pshufd xmm3, xmm0, 216
+; PROB90: pshufd xmm1, xmm0, 216
; PROB90: nop # variant = 2
-; PROB90: movups xmmword ptr [esp], xmm3
+; PROB90: movups xmmword ptr [esp], xmm1
; PROB90: nop # variant = 4
; PROB90: movups xmm0, xmmword ptr [esp]
; PROB90: nop # variant = 2
@@ -81,9 +81,9 @@
; MAXNOPS2: nop # variant = 3
; MAXNOPS2: pmuludq xmm1, xmm2
; MAXNOPS2: shufps xmm0, xmm1, 136
-; MAXNOPS2: pshufd xmm3, xmm0, 216
+; MAXNOPS2: pshufd xmm1, xmm0, 216
; MAXNOPS2: nop # variant = 3
-; MAXNOPS2: movups xmmword ptr [esp], xmm3
+; MAXNOPS2: movups xmmword ptr [esp], xmm1
; MAXNOPS2: nop # variant = 0
; MAXNOPS2: movups xmm0, xmmword ptr [esp]
; MAXNOPS2: nop # variant = 2