Subzero: Use the linear-scan register allocator for Om1 as well. This removes the need for Om1's postLower() code, which did its own ad-hoc register allocation, and it actually speeds up Om1 translation significantly. This mode of register allocation only allocates registers for infinite-weight Variables, while respecting the live ranges of pre-colored Variables. BUG= none R=jvoung@chromium.org Review URL: https://codereview.chromium.org/733643005
diff --git a/src/IceDefs.h b/src/IceDefs.h index 9812e3e..a28da4b 100644 --- a/src/IceDefs.h +++ b/src/IceDefs.h
@@ -102,6 +102,11 @@ Liveness_Intervals }; +enum RegAllocKind { + RAK_Global, // full, global register allocation + RAK_InfOnly // allocation only for infinite-weight Variables +}; + enum VerboseItem { IceV_None = 0, IceV_Instructions = 1 << 0,
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp index 80a9432..569a616 100644 --- a/src/IceRegAlloc.cpp +++ b/src/IceRegAlloc.cpp
@@ -73,17 +73,16 @@ } // end of anonymous namespace -void LinearScan::initForGlobalAlloc() { +// Prepare for full register allocation of all variables. We depend +// on liveness analysis to have calculated live ranges. +void LinearScan::initForGlobal() { TimerMarker T(TimerStack::TT_initUnhandled, Func); - Unhandled.clear(); - UnhandledPrecolored.clear(); - Handled.clear(); - Inactive.clear(); - Active.clear(); - // Gather the live ranges of all variables and add them to the - // Unhandled set. + FindPreference = true; + FindOverlap = true; const VarList &Vars = Func->getVariables(); Unhandled.reserve(Vars.size()); + // Gather the live ranges of all variables and add them to the + // Unhandled set. for (Variable *Var : Vars) { // Explicitly don't consider zero-weight variables, which are // meant to be spill slots. @@ -101,6 +100,128 @@ UnhandledPrecolored.push_back(Var); } } + + // Build the (ordered) list of FakeKill instruction numbers. + Kills.clear(); + for (CfgNode *Node : Func->getNodes()) { + for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E; + ++I) { + if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) { + if (!Kill->isDeleted() && !Kill->getLinked()->isDeleted()) + Kills.push_back(I->getNumber()); + } + } + } +} + +// Prepare for very simple register allocation of only infinite-weight +// Variables while respecting pre-colored Variables. Some properties +// we take advantage of: +// +// * Live ranges of interest consist of a single segment. +// +// * Live ranges of interest never span a call instruction. +// +// * Phi instructions are not considered because either phis have +// already been lowered, or they don't contain any pre-colored or +// infinite-weight Variables. +// +// * We don't need to renumber instructions before computing live +// ranges because all the high-level ICE instructions are deleted +// prior to lowering, and the low-level instructions are added in +// monotonically increasing order. 
+// +// * There are no opportunities for register preference or allowing +// overlap. +// +// Some properties we aren't (yet) taking advantage of: +// +// * Because live ranges are a single segment, the Unhandled set will +// always be empty, and the live range trimming operation is +// unnecessary. +// +// * Calculating overlap of single-segment live ranges could be +// optimized a bit. +void LinearScan::initForInfOnly() { + TimerMarker T(TimerStack::TT_initUnhandled, Func); + FindPreference = false; + FindOverlap = false; + SizeT NumVars = 0; + const VarList &Vars = Func->getVariables(); + + // Iterate across all instructions and record the begin and end of + // the live range for each variable that is pre-colored or infinite + // weight. + std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel); + std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel); + for (CfgNode *Node : Func->getNodes()) { + for (auto Inst = Node->getInsts().begin(), E = Node->getInsts().end(); + Inst != E; ++Inst) { + if (Inst->isDeleted()) + continue; + if (const Variable *Var = Inst->getDest()) { + if (Var->hasReg() || Var->getWeight() == RegWeight::Inf) { + if (LRBegin[Var->getIndex()] == Inst::NumberSentinel) { + LRBegin[Var->getIndex()] = Inst->getNumber(); + ++NumVars; + } + } + } + for (SizeT I = 0; I < Inst->getSrcSize(); ++I) { + Operand *Src = Inst->getSrc(I); + SizeT NumVars = Src->getNumVars(); + for (SizeT J = 0; J < NumVars; ++J) { + const Variable *Var = Src->getVar(J); + if (Var->hasReg() || Var->getWeight() == RegWeight::Inf) + LREnd[Var->getIndex()] = Inst->getNumber(); + } + } + } + } + + Unhandled.reserve(NumVars); + for (SizeT i = 0; i < Vars.size(); ++i) { + Variable *Var = Vars[i]; + if (LRBegin[i] != Inst::NumberSentinel) { + assert(LREnd[i] != Inst::NumberSentinel); + Unhandled.push_back(Var); + Var->resetLiveRange(); + const uint32_t WeightDelta = 1; + Var->addLiveRange(LRBegin[i], LREnd[i], WeightDelta); + Var->untrimLiveRange(); + if 
(Var->hasReg()) { + Var->setRegNumTmp(Var->getRegNum()); + Var->setLiveRangeInfiniteWeight(); + UnhandledPrecolored.push_back(Var); + } + --NumVars; + } + } + // This isn't actually a fatal condition, but it would be nice to + // know if we somehow pre-calculated Unhandled's size wrong. + assert(NumVars == 0); + + // Don't build up the list of Kills because we know that no + // infinite-weight Variable has a live range spanning a call. + Kills.clear(); +} + +void LinearScan::init(RegAllocKind Kind) { + Unhandled.clear(); + UnhandledPrecolored.clear(); + Handled.clear(); + Inactive.clear(); + Active.clear(); + + switch (Kind) { + case RAK_Global: + initForGlobal(); + break; + case RAK_InfOnly: + initForInfOnly(); + break; + } + struct CompareRanges { bool operator()(const Variable *L, const Variable *R) { InstNumberT Lstart = L->getLiveRange().getStart(); @@ -114,20 +235,6 @@ std::sort(Unhandled.rbegin(), Unhandled.rend(), CompareRanges()); std::sort(UnhandledPrecolored.rbegin(), UnhandledPrecolored.rend(), CompareRanges()); - - // Build the (ordered) list of FakeKill instruction numbers. - Kills.clear(); - for (CfgNode *Node : Func->getNodes()) { - for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E; - ++I) { - if (I->isDeleted()) - continue; - if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) { - if (!Kill->getLinked()->isDeleted()) - Kills.push_back(I->getNumber()); - } - } - } } // Implements the linear-scan algorithm. Based on "Linear Scan @@ -292,41 +399,41 @@ Variable *Prefer = NULL; int32_t PreferReg = Variable::NoRegister; bool AllowOverlap = false; - if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) { - assert(DefInst->getDest() == Cur); - bool IsAssign = DefInst->isSimpleAssign(); - bool IsSingleDef = !VMetadata->isMultiDef(Cur); - for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) { - // TODO(stichnot): Iterate through the actual Variables of the - // instruction, not just the source operands. 
This could - // capture Load instructions, including address mode - // optimization, for Prefer (but not for AllowOverlap). - if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) { - int32_t SrcReg = SrcVar->getRegNumTmp(); - // Only consider source variables that have (so far) been - // assigned a register. That register must be one in the - // RegMask set, e.g. don't try to prefer the stack pointer - // as a result of the stacksave intrinsic. - if (SrcVar->hasRegTmp() && RegMask[SrcReg]) { - if (!Free[SrcReg]) { - // Don't bother trying to enable AllowOverlap if the - // register is already free. - AllowOverlap = - IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar); - } - if (AllowOverlap || Free[SrcReg]) { - Prefer = SrcVar; - PreferReg = SrcReg; + if (FindPreference) { + if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) { + assert(DefInst->getDest() == Cur); + bool IsAssign = DefInst->isSimpleAssign(); + bool IsSingleDef = !VMetadata->isMultiDef(Cur); + for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) { + // TODO(stichnot): Iterate through the actual Variables of the + // instruction, not just the source operands. This could + // capture Load instructions, including address mode + // optimization, for Prefer (but not for AllowOverlap). + if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) { + int32_t SrcReg = SrcVar->getRegNumTmp(); + // Only consider source variables that have (so far) been + // assigned a register. That register must be one in the + // RegMask set, e.g. don't try to prefer the stack pointer + // as a result of the stacksave intrinsic. + if (SrcVar->hasRegTmp() && RegMask[SrcReg]) { + if (FindOverlap && !Free[SrcReg]) { + // Don't bother trying to enable AllowOverlap if the + // register is already free. 
+ AllowOverlap = + IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar); + } + if (AllowOverlap || Free[SrcReg]) { + Prefer = SrcVar; + PreferReg = SrcReg; + } } } } - } - } - if (Verbose) { - if (Prefer) { - Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg - << " LIVE=" << Prefer->getLiveRange() << " Overlap=" << AllowOverlap - << "\n"; + if (Verbose && Prefer) { + Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg + << " LIVE=" << Prefer->getLiveRange() + << " Overlap=" << AllowOverlap << "\n"; + } } } @@ -353,12 +460,14 @@ // Disable AllowOverlap if an Active variable, which is not // Prefer, shares Prefer's register, and has a definition within // Cur's live range. - for (const Variable *Item : Active) { - int32_t RegNum = Item->getRegNumTmp(); - if (Item != Prefer && RegNum == PreferReg && - overlapsDefs(Func, Cur, Item)) { - AllowOverlap = false; - dumpDisableOverlap(Func, Item, "Active"); + if (AllowOverlap) { + for (const Variable *Item : Active) { + int32_t RegNum = Item->getRegNumTmp(); + if (Item != Prefer && RegNum == PreferReg && + overlapsDefs(Func, Cur, Item)) { + AllowOverlap = false; + dumpDisableOverlap(Func, Item, "Active"); + } } }
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h index f409e7a..9b9992c 100644 --- a/src/IceRegAlloc.h +++ b/src/IceRegAlloc.h
@@ -26,12 +26,16 @@ LinearScan &operator=(const LinearScan &) = delete; public: - LinearScan(Cfg *Func) : Func(Func) {} - void initForGlobalAlloc(); + LinearScan(Cfg *Func) + : Func(Func), FindPreference(false), FindOverlap(false) {} + void init(RegAllocKind Kind); void scan(const llvm::SmallBitVector &RegMask); void dump(Cfg *Func) const; private: + void initForGlobal(); + void initForInfOnly(); + Cfg *const Func; typedef std::vector<Variable *> OrderedRanges; typedef std::list<Variable *> UnorderedRanges; @@ -41,6 +45,12 @@ OrderedRanges UnhandledPrecolored; UnorderedRanges Active, Inactive, Handled; std::vector<InstNumberT> Kills; + bool FindPreference; + bool FindOverlap; + // TODO(stichnot): We're not really using FindOverlap yet, but we + // may want a flavor of register allocation where FindPreference is + // useful but we didn't want to initialize VMetadata with VMK_All + // and therefore we can't safely allow overlap. }; } // end of namespace Ice
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp index c8f504e..bab46f6 100644 --- a/src/IceTargetLowering.cpp +++ b/src/IceTargetLowering.cpp
@@ -225,7 +225,7 @@ // perhaps for the frame pointer) to be allocated. This set of // registers could potentially be parameterized if we want to restrict // registers e.g. for performance testing. -void TargetLowering::regAlloc() { +void TargetLowering::regAlloc(RegAllocKind Kind) { TimerMarker T(TimerStack::TT_regAlloc, Func); LinearScan LinearScan(Func); RegSetMask RegInclude = RegSet_None; @@ -234,7 +234,7 @@ RegInclude |= RegSet_CalleeSave; if (hasFramePointer()) RegExclude |= RegSet_FramePointer; - LinearScan.initForGlobalAlloc(); + LinearScan.init(Kind); llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude); LinearScan.scan(RegMask); }
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h index a8b0fcc..07d665f 100644 --- a/src/IceTargetLowering.h +++ b/src/IceTargetLowering.h
@@ -195,7 +195,7 @@ virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include, RegSetMask Exclude) const = 0; virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0; - void regAlloc(); + void regAlloc(RegAllocKind Kind); virtual void emitVariable(const Variable *Var) const = 0; @@ -236,11 +236,7 @@ virtual void doAddressOptStore() {} virtual void randomlyInsertNop(float Probability) = 0; // This gives the target an opportunity to post-process the lowered - // expansion before returning. The primary intention is to do some - // Register Manager activity as necessary, specifically to eagerly - // allocate registers based on affinity and other factors. The - // simplest lowering does nothing here and leaves it all to a - // subsequent global register allocation pass. + // expansion before returning. virtual void postLower() {} Cfg *Func;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp index 6c3b624..cd40e09 100644 --- a/src/IceTargetLoweringX8632.cpp +++ b/src/IceTargetLoweringX8632.cpp
@@ -9,9 +9,7 @@ // // This file implements the TargetLoweringX8632 class, which // consists almost entirely of the lowering sequence for each -// high-level instruction. It also implements -// TargetX8632Fast::postLower() which does the simplest possible -// register allocation for the "fast" target. +// high-level instruction. // //===----------------------------------------------------------------------===// @@ -375,7 +373,7 @@ // associated cleanup, to make the dump cleaner and more useful. Func->dump("After initial x8632 codegen"); Func->getVMetadata()->init(VMK_All); - regAlloc(); + regAlloc(RAK_Global); if (Func->hasError()) return; Func->dump("After linear scan regalloc"); @@ -429,6 +427,11 @@ return; Func->dump("After initial x8632 codegen"); + regAlloc(RAK_InfOnly); + if (Func->hasError()) + return; + Func->dump("After regalloc of infinite-weight variables"); + Func->genFrame(); if (Func->hasError()) return; @@ -1816,9 +1819,6 @@ // stack locations. for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) { lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i])); - // TODO: Consider calling postLower() here to reduce the register - // pressure associated with using too many infinite weight - // temporaries when lowering the call sequence in -Om1 mode. } // Copy arguments to be passed in registers to the appropriate @@ -4112,8 +4112,6 @@ Variable *DestT = Func->makeVariable(Ty); lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index)); T = DestT; - // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of - // infinite weight temporaries. } lowerAssign(InstAssign::create(Func, Dest, T)); @@ -4200,7 +4198,7 @@ assert(Node->getPhis().empty()); CfgNode *Succ = Node->getOutEdges().front(); getContext().init(Node); - // Register set setup similar to regAlloc() and postLower(). + // Register set setup similar to regAlloc(). 
RegSetMask RegInclude = RegSet_All; RegSetMask RegExclude = RegSet_StackPointer; if (hasFramePointer()) @@ -4512,116 +4510,21 @@ } void TargetX8632::postLower() { - if (Ctx->getOptLevel() != Opt_m1) { - // Find two-address non-SSA instructions where Dest==Src0, and set - // the DestNonKillable flag to keep liveness analysis consistent. - for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) { - if (Inst->isDeleted()) - continue; - if (Variable *Dest = Inst->getDest()) { - // TODO(stichnot): We may need to consider all source - // operands, not just the first one, if using 3-address - // instructions. - if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest) - Inst->setDestNonKillable(); - } - } + if (Ctx->getOptLevel() == Opt_m1) return; - } - // TODO: Avoid recomputing WhiteList every instruction. - RegSetMask RegInclude = RegSet_All; - RegSetMask RegExclude = RegSet_StackPointer; - if (hasFramePointer()) - RegExclude |= RegSet_FramePointer; - llvm::SmallBitVector WhiteList = getRegisterSet(RegInclude, RegExclude); - // Make one pass to black-list pre-colored registers. TODO: If - // there was some prior register allocation pass that made register - // assignments, those registers need to be black-listed here as - // well. - llvm::DenseMap<const Variable *, const Inst *> LastUses; - // The first pass also keeps track of which instruction is the last - // use for each infinite-weight variable. After the last use, the - // variable is released to the free list. + // Find two-address non-SSA instructions where Dest==Src0, and set + // the DestNonKillable flag to keep liveness analysis consistent. for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) { if (Inst->isDeleted()) continue; - // Don't consider a FakeKill instruction, because (currently) it - // is only used to kill all scratch registers at a call site, and - // we don't want to black-list all scratch registers during the - // call lowering. 
This could become a problem since it relies on - // the lowering sequence not keeping any infinite-weight variables - // live across a call. TODO(stichnot): Consider replacing this - // whole postLower() implementation with a robust local register - // allocator, for example compute live ranges only for pre-colored - // and infinite-weight variables and run the existing linear-scan - // allocator. - assert(!llvm::isa<InstFakeKill>(Inst) || Inst->getSrcSize() == 0); - for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) { - Operand *Src = Inst->getSrc(SrcNum); - SizeT NumVars = Src->getNumVars(); - for (SizeT J = 0; J < NumVars; ++J) { - const Variable *Var = Src->getVar(J); - // Track last uses of all variables, regardless of whether - // they are pre-colored or infinite-weight. - LastUses[Var] = Inst; - if (!Var->hasReg()) - continue; - WhiteList[Var->getRegNum()] = false; - } + if (Variable *Dest = Inst->getDest()) { + // TODO(stichnot): We may need to consider all source + // operands, not just the first one, if using 3-address + // instructions. + if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest) + Inst->setDestNonKillable(); } } - // The second pass colors infinite-weight variables. - llvm::SmallBitVector AvailableRegisters = WhiteList; - llvm::SmallBitVector FreedRegisters(WhiteList.size()); - for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) { - FreedRegisters.reset(); - if (Inst->isDeleted()) - continue; - // Iterate over all variables referenced in the instruction, - // including the Dest variable (if any). If the variable is - // marked as infinite-weight, find it a register. If this - // instruction is the last use of the variable in the lowered - // sequence, release the register to the free list after this - // instruction is completely processed. 
Note that the first pass - // ignores the Dest operand, under the assumption that a - // pre-colored Dest will appear as a source operand in some - // subsequent instruction in the lowered sequence. - Variable *Dest = Inst->getDest(); - SizeT NumSrcs = Inst->getSrcSize(); - if (Dest) - ++NumSrcs; - if (NumSrcs == 0) - continue; - OperandList Srcs(NumSrcs); - for (SizeT i = 0; i < Inst->getSrcSize(); ++i) - Srcs[i] = Inst->getSrc(i); - if (Dest) - Srcs[NumSrcs - 1] = Dest; - for (SizeT SrcNum = 0; SrcNum < NumSrcs; ++SrcNum) { - Operand *Src = Srcs[SrcNum]; - SizeT NumVars = Src->getNumVars(); - for (SizeT J = 0; J < NumVars; ++J) { - Variable *Var = Src->getVar(J); - if (!Var->hasReg() && Var->getWeight().isInf()) { - llvm::SmallBitVector AvailableTypedRegisters = - AvailableRegisters & getRegisterSetForType(Var->getType()); - assert(AvailableTypedRegisters.any()); - int32_t RegNum = AvailableTypedRegisters.find_first(); - Var->setRegNum(RegNum); - AvailableRegisters[RegNum] = false; - } - if (Var->hasReg()) { - int32_t RegNum = Var->getRegNum(); - assert(!AvailableRegisters[RegNum]); - if (LastUses[Var] == Inst) { - if (WhiteList[RegNum]) - FreedRegisters[RegNum] = true; - } - } - } - } - AvailableRegisters |= FreedRegisters; - } } template <> void ConstantInteger32::emit(GlobalContext *Ctx) const {
diff --git a/src/IceTranslator.cpp b/src/IceTranslator.cpp index edaa74f..08e5697 100644 --- a/src/IceTranslator.cpp +++ b/src/IceTranslator.cpp
@@ -83,10 +83,12 @@ ErrorStatus = true; } - if (Ctx->getFlags().UseIntegratedAssembler) { - Func->emitIAS(); - } else { - Func->emit(); + if (!ErrorStatus) { + if (Ctx->getFlags().UseIntegratedAssembler) { + Func->emitIAS(); + } else { + Func->emit(); + } } Ctx->dumpStats(Func->getFunctionName()); }
diff --git a/tests_lit/llvm2ice_tests/ebp_args.ll b/tests_lit/llvm2ice_tests/ebp_args.ll index 4b7c928..09709b0 100644 --- a/tests_lit/llvm2ice_tests/ebp_args.ll +++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -38,10 +38,10 @@ ; CHECK: movzx eax, byte ptr [ebp - 4] ; CHECK: mov dword ptr [ebp - 16], eax ; CHECK: sub esp, 16 -; CHECK: mov ecx, dword ptr [ebp + 8] -; CHECK: mov dword ptr [esp], ecx -; CHECK: mov ecx, dword ptr [ebp - 12] -; CHECK: mov dword ptr [esp + 4], ecx -; CHECK: mov ecx, dword ptr [ebp - 16] -; CHECK: mov dword ptr [esp + 8], ecx +; CHECK: mov eax, dword ptr [ebp + 8] +; CHECK: mov dword ptr [esp], eax +; CHECK: mov eax, dword ptr [ebp - 12] +; CHECK: mov dword ptr [esp + 4], eax +; CHECK: mov eax, dword ptr [ebp - 16] +; CHECK: mov dword ptr [esp + 8], eax ; CHECK: call -4
diff --git a/tests_lit/llvm2ice_tests/nop-insertion.ll b/tests_lit/llvm2ice_tests/nop-insertion.ll index 835ac76..0c252ca 100644 --- a/tests_lit/llvm2ice_tests/nop-insertion.ll +++ b/tests_lit/llvm2ice_tests/nop-insertion.ll
@@ -31,9 +31,9 @@ ; PROB50: pmuludq %xmm2, %xmm1 ; PROB50: nop # variant = 0 ; PROB50: shufps $136, %xmm1, %xmm0 -; PROB50: pshufd $216, %xmm0, %xmm1 +; PROB50: pshufd $216, %xmm0, %xmm0 ; PROB50: nop # variant = 2 -; PROB50: movups %xmm1, (%esp) +; PROB50: movups %xmm0, (%esp) ; PROB50: movups (%esp), %xmm0 ; PROB50: addl $60, %esp ; PROB50: nop # variant = 0 @@ -59,9 +59,9 @@ ; PROB90: nop # variant = 3 ; PROB90: shufps $136, %xmm1, %xmm0 ; PROB90: nop # variant = 4 -; PROB90: pshufd $216, %xmm0, %xmm1 +; PROB90: pshufd $216, %xmm0, %xmm0 ; PROB90: nop # variant = 2 -; PROB90: movups %xmm1, (%esp) +; PROB90: movups %xmm0, (%esp) ; PROB90: nop # variant = 4 ; PROB90: movups (%esp), %xmm0 ; PROB90: nop # variant = 2 @@ -86,9 +86,9 @@ ; MAXNOPS2: nop # variant = 3 ; MAXNOPS2: pmuludq %xmm2, %xmm1 ; MAXNOPS2: shufps $136, %xmm1, %xmm0 -; MAXNOPS2: pshufd $216, %xmm0, %xmm1 +; MAXNOPS2: pshufd $216, %xmm0, %xmm0 ; MAXNOPS2: nop # variant = 3 -; MAXNOPS2: movups %xmm1, (%esp) +; MAXNOPS2: movups %xmm0, (%esp) ; MAXNOPS2: nop # variant = 0 ; MAXNOPS2: movups (%esp), %xmm0 ; MAXNOPS2: nop # variant = 2