Subzero: Fix the simple register allocation for -Om1.
Background: After lowering each high-level ICE instruction, Om1 calls
postLower() to do simple register allocation. It only assigns
registers where absolutely necessary, specifically for infinite-weight
variables, while honoring pre-coloring decisions. The original Om1
register allocation never tried to reuse registers within a lowered
sequence, which was generally OK except for very long lowering
sequences, such as call instructions or some intrinsics. In these
cases, when it ran out of physical registers, it would just reset the
free list and hope for the best, but with no guarantee of correctness.
The fix involves keeping track of which instruction in the lowered
sequence holds the last use of each variable, and releasing each
register back to the free list after its last use. This makes much
better use of registers. The result is still not necessarily optimal,
particularly with respect to pre-colored variables: their registers
are black-listed for the entire lowered sequence, even when they
don't actually interfere with any infinite-weight variable.
BUG= none
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/483453002
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index d36ad16..0423e39 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -22,6 +22,7 @@
#include "IceOperand.h"
#include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include <strings.h>
@@ -4164,7 +4165,7 @@
return;
// TODO: Avoid recomputing WhiteList every instruction.
RegSetMask RegInclude = RegSet_All;
- RegSetMask RegExclude = RegSet_None;
+ RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
llvm::SmallBitVector WhiteList = getRegisterSet(RegInclude, RegExclude);
@@ -4172,11 +4173,25 @@
// there was some prior register allocation pass that made register
// assignments, those registers need to be black-listed here as
// well.
+ llvm::DenseMap<const Variable *, const Inst *> LastUses;
+ // The first pass also keeps track of which instruction is the last
+ // use for each infinite-weight variable. After the last use, the
+ // variable is released to the free list.
for (InstList::iterator I = Context.getCur(), E = Context.getEnd(); I != E;
++I) {
const Inst *Inst = *I;
if (Inst->isDeleted())
continue;
+ // Don't consider a FakeKill instruction, because (currently) it
+ // is only used to kill all scratch registers at a call site, and
+ // we don't want to black-list all scratch registers during the
+ // call lowering. This could become a problem since it relies on
+ // the lowering sequence not keeping any infinite-weight variables
+ // live across a call. TODO(stichnot): Consider replacing this
+ // whole postLower() implementation with a robust local register
+ // allocator, for example compute live ranges only for pre-colored
+ // and infinite-weight variables and run the existing linear-scan
+ // allocator.
if (llvm::isa<InstFakeKill>(Inst))
continue;
for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
@@ -4184,6 +4199,9 @@
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
const Variable *Var = Src->getVar(J);
+ // Track last uses of all variables, regardless of whether
+ // they are pre-colored or infinite-weight.
+ LastUses[Var] = Inst;
if (!Var->hasReg())
continue;
WhiteList[Var->getRegNum()] = false;
@@ -4192,36 +4210,58 @@
}
// The second pass colors infinite-weight variables.
llvm::SmallBitVector AvailableRegisters = WhiteList;
+ llvm::SmallBitVector FreedRegisters(WhiteList.size());
for (InstList::iterator I = Context.getCur(), E = Context.getEnd(); I != E;
++I) {
+ FreedRegisters.reset();
const Inst *Inst = *I;
if (Inst->isDeleted())
continue;
- for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
- Operand *Src = Inst->getSrc(SrcNum);
+ // Skip FakeKill instructions like above.
+ if (llvm::isa<InstFakeKill>(Inst))
+ continue;
+ // Iterate over all variables referenced in the instruction,
+ // including the Dest variable (if any). If the variable is
+ // marked as infinite-weight, find it a register. If this
+ // instruction is the last use of the variable in the lowered
+ // sequence, release the register to the free list after this
+ // instruction is completely processed. Note that the first pass
+ // ignores the Dest operand, under the assumption that a
+ // pre-colored Dest will appear as a source operand in some
+ // subsequent instruction in the lowered sequence.
+ Variable *Dest = Inst->getDest();
+ SizeT NumSrcs = Inst->getSrcSize();
+ if (Dest)
+ ++NumSrcs;
+ OperandList Srcs(NumSrcs);
+ for (SizeT i = 0; i < Inst->getSrcSize(); ++i)
+ Srcs[i] = Inst->getSrc(i);
+ if (Dest)
+ Srcs[NumSrcs - 1] = Dest;
+ for (SizeT SrcNum = 0; SrcNum < NumSrcs; ++SrcNum) {
+ Operand *Src = Srcs[SrcNum];
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
Variable *Var = Src->getVar(J);
- if (Var->hasReg())
- continue;
- if (!Var->getWeight().isInf())
- continue;
- llvm::SmallBitVector AvailableTypedRegisters =
- AvailableRegisters & getRegisterSetForType(Var->getType());
- if (!AvailableTypedRegisters.any()) {
- // This is a hack in case we run out of physical registers due
- // to an excessively long code sequence, as might happen when
- // lowering arguments in lowerCall().
- AvailableRegisters = WhiteList;
- AvailableTypedRegisters =
+ if (!Var->hasReg() && Var->getWeight().isInf()) {
+ llvm::SmallBitVector AvailableTypedRegisters =
AvailableRegisters & getRegisterSetForType(Var->getType());
+ assert(AvailableTypedRegisters.any());
+ int32_t RegNum = AvailableTypedRegisters.find_first();
+ Var->setRegNum(RegNum);
+ AvailableRegisters[RegNum] = false;
}
- assert(AvailableTypedRegisters.any());
- int32_t RegNum = AvailableTypedRegisters.find_first();
- Var->setRegNum(RegNum);
- AvailableRegisters[RegNum] = false;
+ if (Var->hasReg()) {
+ int32_t RegNum = Var->getRegNum();
+ assert(!AvailableRegisters[RegNum]);
+ if (LastUses[Var] == Inst) {
+ if (WhiteList[RegNum])
+ FreedRegisters[RegNum] = true;
+ }
+ }
}
}
+ AvailableRegisters |= FreedRegisters;
}
}
diff --git a/tests_lit/llvm2ice_tests/ebp_args.ll b/tests_lit/llvm2ice_tests/ebp_args.ll
index 6a33b2b..8360bf9 100644
--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -21,24 +21,23 @@
; and stack slot assignment code, and may need to be relaxed if the
; lowering code changes.
-; CHECK: memcpy_helper:
-; CHECK: push ebx
-; CHECK: push ebp
-; CHECK: mov ebp, esp
-; CHECK: sub esp, 20
-; CHECK: mov eax, dword ptr [ebp+16]
-; CHECK: mov dword ptr [ebp-4], eax
-; CHECK: sub esp, 128
-; CHECK: mov dword ptr [ebp-8], esp
-; CHECK: mov eax, dword ptr [ebp-8]
-; CHECK: mov dword ptr [ebp-12], eax
-; CHECK: movzx eax, byte ptr [ebp-4]
-; CHECK: mov dword ptr [ebp-16], eax
-; CHECK: sub esp, 16
-; CHECK: mov ecx, dword ptr [ebp+12]
-; CHECK: mov dword ptr [esp], ecx
-; CHECK: mov edx, dword ptr [ebp-12]
-; CHECK: mov dword ptr [esp+4], edx
-; CHECK: mov ebx, dword ptr [ebp-16]
-; CHECK: mov dword ptr [esp+8], ebx
-; CHECK: call memcpy_helper2
+; CHECK-LABEL: memcpy_helper:
+; CHECK: push ebp
+; CHECK: mov ebp, esp
+; CHECK: sub esp, 24
+; CHECK: mov eax, dword ptr [ebp+12]
+; CHECK: mov dword ptr [ebp-4], eax
+; CHECK: sub esp, 128
+; CHECK: mov dword ptr [ebp-8], esp
+; CHECK: mov eax, dword ptr [ebp-8]
+; CHECK: mov dword ptr [ebp-12], eax
+; CHECK: movzx eax, byte ptr [ebp-4]
+; CHECK: mov dword ptr [ebp-16], eax
+; CHECK: sub esp, 16
+; CHECK: mov ecx, dword ptr [ebp+8]
+; CHECK: mov dword ptr [esp], ecx
+; CHECK: mov ecx, dword ptr [ebp-12]
+; CHECK: mov dword ptr [esp+4], ecx
+; CHECK: mov ecx, dword ptr [ebp-16]
+; CHECK: mov dword ptr [esp+8], ecx
+; CHECK: call memcpy_helper2
diff --git a/tests_lit/llvm2ice_tests/nop-insertion.ll b/tests_lit/llvm2ice_tests/nop-insertion.ll
index 8c0d242..1300832 100644
--- a/tests_lit/llvm2ice_tests/nop-insertion.ll
+++ b/tests_lit/llvm2ice_tests/nop-insertion.ll
@@ -26,9 +26,9 @@
; PROB50: pmuludq xmm1, xmm2
; PROB50: nop # variant = 0
; PROB50: shufps xmm0, xmm1, 136
-; PROB50: pshufd xmm3, xmm0, 216
+; PROB50: pshufd xmm1, xmm0, 216
; PROB50: nop # variant = 2
-; PROB50: movups xmmword ptr [esp], xmm3
+; PROB50: movups xmmword ptr [esp], xmm1
; PROB50: movups xmm0, xmmword ptr [esp]
; PROB50: add esp, 60
; PROB50: nop # variant = 0
@@ -54,9 +54,9 @@
; PROB90: nop # variant = 3
; PROB90: shufps xmm0, xmm1, 136
; PROB90: nop # variant = 4
-; PROB90: pshufd xmm3, xmm0, 216
+; PROB90: pshufd xmm1, xmm0, 216
; PROB90: nop # variant = 2
-; PROB90: movups xmmword ptr [esp], xmm3
+; PROB90: movups xmmword ptr [esp], xmm1
; PROB90: nop # variant = 4
; PROB90: movups xmm0, xmmword ptr [esp]
; PROB90: nop # variant = 2
@@ -81,9 +81,9 @@
; MAXNOPS2: nop # variant = 3
; MAXNOPS2: pmuludq xmm1, xmm2
; MAXNOPS2: shufps xmm0, xmm1, 136
-; MAXNOPS2: pshufd xmm3, xmm0, 216
+; MAXNOPS2: pshufd xmm1, xmm0, 216
; MAXNOPS2: nop # variant = 3
-; MAXNOPS2: movups xmmword ptr [esp], xmm3
+; MAXNOPS2: movups xmmword ptr [esp], xmm1
; MAXNOPS2: nop # variant = 0
; MAXNOPS2: movups xmm0, xmmword ptr [esp]
; MAXNOPS2: nop # variant = 2